diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 591bb3e2d..982cf5f83 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -125,7 +125,7 @@ jobs: - uses: actions/checkout@v5 - uses: julia-actions/setup-julia@v2 with: - version: '1.10' + version: '1.11' - name: Develop packages run: | julia -e " diff --git a/docs/Project.toml b/docs/Project.toml index afd6fcd38..022ce7094 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,13 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" [compat] Documenter = "1.8" + +[sources] +GPUArrays = {path = ".."} +GPUArraysCore = {path = "../lib/GPUArraysCore"} +JLArrays = {path = "../lib/JLArrays"} diff --git a/docs/make.jl b/docs/make.jl index a37b0cd9b..3b0d6d4f7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,8 +1,8 @@ -using Documenter, GPUArrays +using Documenter, GPUArrays, GPUArraysCore, JLArrays function main() makedocs( - modules = [GPUArrays], + modules = [GPUArrays, GPUArraysCore, JLArrays], format = Documenter.HTML( # Use clean URLs on CI prettyurls = get(ENV, "CI", nothing) == "true", @@ -11,19 +11,15 @@ function main() ), sitename = "GPUArrays.jl", pages = [ - "Home" => "index.md", - "Interface" => "interface.md", - "Functionality" => [ - "functionality/host.md", - "functionality/device.md", - ], - "Test suite" => "testsuite.md", + "Home" => "index.md", + "interface.md", + "api.md", ], doctest = true, warnonly = [:missing_docs], ) - deploydocs( + return deploydocs( repo = "github.com/JuliaGPU/GPUArrays.jl.git" ) end diff --git a/docs/src/api.md b/docs/src/api.md new file mode 100644 index 000000000..77a911a7b --- /dev/null +++ b/docs/src/api.md @@ -0,0 +1,49 @@ +# API Reference + +## GPUArrays + +### Public + +```@autodocs +Modules = [GPUArrays] +Private = false +``` + +### Internals + +```@autodocs +Modules = [GPUArrays] +Public = false +``` + +## GPUArraysCore + +### Public + +```@autodocs +Modules = [GPUArraysCore] +Private = false +``` + +### Internals + +```@autodocs +Modules = [GPUArraysCore] +Public = false +``` + +## JLArrays + +### Public + +```@autodocs +Modules = [JLArrays] +Private = false +``` + +### Internals + +```@autodocs +Modules = [JLArrays] +Public = false +``` diff --git a/docs/src/functionality/device.md b/docs/src/functionality/device.md deleted file mode 100644 index 364aeedf1..000000000 --- a/docs/src/functionality/device.md +++ /dev/null @@ -1,3 +0,0 @@ -# `AbstractDeviceArray` - -TODO: describe functionality diff --git a/docs/src/functionality/host.md b/docs/src/functionality/host.md deleted file mode 100644 index dc450f323..000000000 --- a/docs/src/functionality/host.md +++ /dev/null @@ -1,3 +0,0 @@ -# `AbstractGPUArray` - -TODO: describe functionality diff --git a/docs/src/interface.md b/docs/src/interface.md index 9e4864ada..cef80e308 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md @@ -1,6 +1,6 @@ # Interface -To extend the above functionality to a new array type, you should use the types and +To extend the GPUArrays functionality to a new array type, you should use the types and implement the interfaces listed on this page. GPUArrays is designed around having two different array types to represent a GPU array: one that exists only on the host, and one that actually can be instantiated on the device (i.e. in kernels).
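As a minimal sketch of the host-side half of that design — using placeholder names `CustomArray` and `CustomBackend`, which are not a real API — a downstream package might start out like this (the `get_backend` definition is exactly the hook shown in the next hunk):

```julia
using GPUArrays, KernelAbstractions

# Placeholder backend; a real package defines its own (compare JLBackend below).
struct CustomBackend <: KernelAbstractions.GPU end

# Host-side array type: subtype AbstractGPUArray and carry the storage.
mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N}
    data::Array{T, N}   # stand-in for actual device memory
    dims::Dims{N}
end

Base.size(a::CustomArray) = a.dims

# Route kernel launches for this type to the backend.
KernelAbstractions.get_backend(::CustomArray) = CustomBackend()
```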
@@ -31,9 +31,45 @@ KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend() There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/master/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/master/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/master/src/gpuarrays.jl). -## Caching Allocator +## Device abstractions -```@docs -GPUArrays.@cached -GPUArrays.@uncached +!!! warning + Work in progress. + +## Test suite + +GPUArrays provides an extensive test suite that covers all of the functionality that should +be available after implementing the required interfaces. This test suite is part of this +package, but for dependency reasons it is not available when importing the package. Instead, +you should include the code from your `runtests.jl` as follows: + +```julia +import GPUArrays +gpuarrays = pathof(GPUArrays) +gpuarrays_root = dirname(dirname(gpuarrays)) +include(joinpath(gpuarrays_root, "test", "testsuite.jl")) +``` + +With this set-up, you can run the test suite like this: + +```julia +TestSuite.test(MyGPUArrayType) +``` + +If you don't want to run the whole suite, you can also run parts of it: + +```julia +T = JLArray +GPUArrays.allowscalar(false) # fail tests when slow indexing path into Array type is used. + +TestSuite.test_gpuinterface(T) # interface functions like gpu_call, threadidx, etc +TestSuite.test_base(T) # basic functionality like launching a kernel on the GPU and Base operations +TestSuite.test_blas(T) # tests the blas interface +TestSuite.test_broadcasting(T) # tests the broadcasting implementation +TestSuite.test_construction(T) # tests all kinds of different ways of constructing the array +TestSuite.test_linalg(T) # linalg function tests +TestSuite.test_mapreduce(T) # mapreduce sum, etc +TestSuite.test_indexing(T) # indexing tests +TestSuite.test_random(T) # randomly constructed arrays +TestSuite.test_io(T) ``` diff --git a/docs/src/testsuite.md b/docs/src/testsuite.md deleted file mode 100644 index c953eff05..000000000 --- a/docs/src/testsuite.md +++ /dev/null @@ -1,37 +0,0 @@ -# Test suite - -GPUArrays provides an extensive test suite that covers all of the functionality that should -be available after implementing the required interfaces. This test suite is part of this -package, but for dependency reasons it is not available when importing the package. Instead, -you should include the code from your `runtests.jl` as follows: - -```julia -import GPUArrays -gpuarrays = pathof(GPUArrays) -gpuarrays_root = dirname(dirname(gpuarrays)) -include(joinpath(gpuarrays_root, "test", "testsuite.jl")) -``` - -With this set-up, you can run the test suite like this: - -```julia -TestSuite.test(MyGPUArrayType) -``` -If you don't want to run the whole suite, you can also run parts of it: - - -```julia -T = JLArray -GPUArrays.allowscalar(false) # fail tests when slow indexing path into Array type is used. 
- -TestSuite.test_gpuinterface(T) # interface functions like gpu_call, threadidx, etc -TestSuite.test_base(T) # basic functionality like launching a kernel on the GPU and Base operations -TestSuite.test_blas(T) # tests the blas interface -TestSuite.test_broadcasting(T) # tests the broadcasting implementation -TestSuite.test_construction(T) # tests all kinds of different ways of constructing the array -TestSuite.test_linalg(T) # linalg function tests -TestSuite.test_mapreduce(T) # mapreduce sum, etc -TestSuite.test_indexing(T) # indexing tests -TestSuite.test_random(T) # randomly constructed arrays -TestSuite.test_io(T) -``` diff --git a/lib/GPUArraysCore/src/GPUArraysCore.jl b/lib/GPUArraysCore/src/GPUArraysCore.jl index bcf24e601..3dc17ac09 100644 --- a/lib/GPUArraysCore/src/GPUArraysCore.jl +++ b/lib/GPUArraysCore/src/GPUArraysCore.jl @@ -18,21 +18,64 @@ for device-side objects. """ abstract type AbstractGPUArray{T, N} <: DenseArray{T, N} end +""" + AbstractGPUVector{T} + +Shortcut for `AbstractGPUArray{T, 1}`. +""" const AbstractGPUVector{T} = AbstractGPUArray{T, 1} + +""" + AbstractGPUMatrix{T} + +Shortcut for `AbstractGPUArray{T, 2}`. +""" const AbstractGPUMatrix{T} = AbstractGPUArray{T, 2} + +""" + AbstractGPUVecOrMat{T} + +Shortcut for `Union{AbstractGPUArray{T, 1}, AbstractGPUArray{T, 2}}`. +""" const AbstractGPUVecOrMat{T} = Union{AbstractGPUArray{T, 1}, AbstractGPUArray{T, 2}} # convenience aliases for working with wrapped arrays + +""" + WrappedGPUArray{T, N} + +Convenience alias for working with wrapped arrays from [Adapt.jl](https://github.com/JuliaGPU/Adapt.jl). +""" const WrappedGPUArray{T,N} = WrappedArray{T,N,AbstractGPUArray,AbstractGPUArray{T,N}} + +""" + AnyGPUArray{T, N} + +Shortcut for `Union{AbstractGPUArray{T,N}, WrappedGPUArray{T,N}}`. +""" const AnyGPUArray{T,N} = Union{AbstractGPUArray{T,N}, WrappedGPUArray{T,N}} + +""" + AnyGPUVector{T} + +Shortcut for `AnyGPUArray{T, 1}`. +""" const AnyGPUVector{T} = AnyGPUArray{T, 1} + +""" + AnyGPUMatrix{T} + +Shortcut for `AnyGPUArray{T, 2}`. +""" const AnyGPUMatrix{T} = AnyGPUArray{T, 2} ## broadcasting """ -Abstract supertype for GPU array styles. The `N` parameter is the dimensionality. + AbstractGPUArrayStyle{N} <: Base.Broadcast.AbstractArrayStyle{N} + +Abstract supertype for GPU array broadcasting styles. The `N` parameter is the dimensionality. Downstream implementations should provide a concrete array style type that inherits from this supertype. diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 097f01530..0d0d851ae 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -34,6 +34,15 @@ end const MAXTHREADS = 256 +""" + JLBackend <: KernelAbstractions.GPU + +Backend object associated with JLArrays for [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl). + +# Fields + +- `static::Bool=false` +""" struct JLBackend <: KernelAbstractions.GPU static::Bool JLBackend(;static::Bool=false) = new(static) end @@ -89,6 +98,13 @@ function check_eltype(T) end end +""" + JLArray{T, N} + +CPU-located array type that emulates the behavior of GPU arrays. + +Useful for testing GPU-oriented code when no actual GPU is available. +""" mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} data::DataRef{Vector{UInt8}} @@ -123,6 +139,18 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} end end
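Since `JLArray` exists precisely to exercise GPU code paths on the CPU, a typical round-trip looks like the following sketch (the array values are arbitrary):

```julia
using JLArrays, GPUArrays

GPUArrays.allowscalar(false)        # error on scalar iteration, as on a real GPU

a = JLArray(rand(Float32, 3, 3))    # "upload" a CPU array
b = a .+ 1f0                        # broadcasting runs through the GPU code paths
Array(b)                            # "download" back to a CPU Array
```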
+""" + JLSparseVector{Tv, Ti} + +Sparse vector backed by `JLVector`s, similar to `SparseArrays.SparseVector`. + +# Fields + +- `iPtr::JLArray{Ti, 1}`: indices of non-zero coefficients +- `nzVal::JLArray{Tv, 1}`: values of non-zero coefficients +- `len::Int`: size of the vector +- `nnz::Ti`: number of non-zero coefficients +""" mutable struct JLSparseVector{Tv, Ti} <: GPUArrays.AbstractGPUSparseVector{Tv, Ti} iPtr::JLArray{Ti, 1} nzVal::JLArray{Tv, 1} @@ -138,6 +166,19 @@ SparseArrays.nnz(x::JLSparseVector) = x.nnz SparseArrays.nonzeroinds(x::JLSparseVector) = x.iPtr SparseArrays.nonzeros(x::JLSparseVector) = x.nzVal +""" + JLSparseMatrixCSC{Tv, Ti} + +Sparse matrix in Compressed Sparse Column format, backed by `JLVector`s (similar to `SparseArrays.SparseMatrixCSC`). + +# Fields + +- `colPtr::JLArray{Ti, 1}`: column `j` maps to indices `colPtr[j]:(colPtr[j+1]-1)` in `rowVal` and `nzVal` +- `rowVal::JLArray{Ti, 1}`: row indices for non-zero coefficients +- `nzVal::JLArray{Tv, 1}`: values of non-zero coefficients +- `dims::NTuple{2,Int}`: size of the matrix +- `nnz::Ti`: number of non-zero coefficients +""" mutable struct JLSparseMatrixCSC{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSC{Tv, Ti} colPtr::JLArray{Ti, 1} rowVal::JLArray{Ti, 1} nzVal::JLArray{Tv, 1} @@ -166,6 +207,19 @@ function Base.getindex(A::JLSparseMatrixCSC{Tv, Ti}, i::Integer, j::Integer) whe ((r1 > r2) || (A.rowVal[r1] != i)) ? zero(Tv) : A.nzVal[r1] end +""" + JLSparseMatrixCSR{Tv, Ti} + +Sparse matrix in Compressed Sparse Row format, backed by `JLVector`s (similar to the transpose of a `SparseArrays.SparseMatrixCSC`). + +# Fields + +- `rowPtr::JLArray{Ti, 1}`: row `i` maps to indices `rowPtr[i]:(rowPtr[i+1]-1)` in `colVal` and `nzVal` +- `colVal::JLArray{Ti, 1}`: column indices for non-zero coefficients +- `nzVal::JLArray{Tv, 1}`: values of non-zero coefficients +- `dims::NTuple{2,Int}`: size of the matrix +- `nnz::Ti`: number of non-zero coefficients +""" mutable struct JLSparseMatrixCSR{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSR{Tv, Ti} rowPtr::JLArray{Ti, 1} colVal::JLArray{Ti, 1}
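Because the `JLSparseMatrixCSC` fields mirror `SparseArrays.SparseMatrixCSC`, a small worked example of the CSC layout may help; it is shown here with the stock SparseArrays type, whose `colptr`/`rowval`/`nzval` fields carry the same contents:

```julia
using SparseArrays

# A = [10  0  30;
#       0  20   0]
A = sparse([1, 2, 1], [1, 2, 3], [10, 20, 30], 2, 3)

A.colptr   # [1, 2, 3, 4]: column j occupies indices colptr[j]:(colptr[j+1]-1)
A.rowval   # [1, 2, 1]:    row index of each stored value
A.nzval    # [10, 20, 30]: the stored values, in column-major order
```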
@@ -273,8 +327,25 @@ end ## convenience constructors +""" + JLVector{T} + +Shortcut for `JLArray{T,1}`. +""" const JLVector{T} = JLArray{T,1} + +""" + JLMatrix{T} + +Shortcut for `JLArray{T,2}`. +""" const JLMatrix{T} = JLArray{T,2} + +""" + JLVecOrMat{T} + +Shortcut for `Union{JLVector{T},JLMatrix{T}}`. +""" const JLVecOrMat{T} = Union{JLVector{T},JLMatrix{T}} # type and dimensionality specified @@ -309,18 +380,65 @@ export DenseJLArray, DenseJLVector, DenseJLMatrix, DenseJLVecOrMat, AnyJLArray, AnyJLVector, AnyJLMatrix, AnyJLVecOrMat # dense arrays: stored contiguously in memory +""" + DenseJLArray{T,N} + +Alias for `JLArray`, whose data is always stored contiguously in memory. +""" DenseJLArray{T,N} = JLArray{T,N} + +""" + DenseJLVector{T} + +Shortcut for `DenseJLArray{T,1}`. +""" DenseJLVector{T} = DenseJLArray{T,1} + +""" + DenseJLMatrix{T} + +Shortcut for `DenseJLArray{T,2}`. +""" DenseJLMatrix{T} = DenseJLArray{T,2} + +""" + DenseJLVecOrMat{T} + +Shortcut for `Union{DenseJLVector{T}, DenseJLMatrix{T}}`. +""" DenseJLVecOrMat{T} = Union{DenseJLVector{T}, DenseJLMatrix{T}} # strided arrays StridedSubJLArray{T,N,I<:Tuple{Vararg{Union{Base.RangeIndex, Base.ReshapedUnitRange, Base.AbstractCartesianIndex}}}} = SubArray{T,N,<:JLArray,I} + +""" + StridedJLArray{T,N} + +Union of `JLArray`s and views of them that preserve a strided memory layout. +""" StridedJLArray{T,N} = Union{JLArray{T,N}, StridedSubJLArray{T,N}} + +""" + StridedJLVector{T} + +Shortcut for `StridedJLArray{T,1}`. +""" StridedJLVector{T} = StridedJLArray{T,1} + +""" + StridedJLMatrix{T} + +Shortcut for `StridedJLArray{T,2}`. +""" StridedJLMatrix{T} = StridedJLArray{T,2} + +""" + StridedJLVecOrMat{T} + +Shortcut for `Union{StridedJLVector{T}, StridedJLMatrix{T}}`. +""" StridedJLVecOrMat{T} = Union{StridedJLVector{T}, StridedJLMatrix{T}} Base.pointer(x::StridedJLArray{T}) where {T} = Base.unsafe_convert(Ptr{T}, x) @@ -328,10 +446,32 @@ Base.pointer(x::StridedJLArray{T}) where {T} = Base.unsafe_convert(Ptr{T}, x) Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, i) end -# anything that's (secretly) backed by a JLArray +""" + AnyJLArray{T,N} + +Union matching anything that is (secretly) backed by a `JLArray`. +""" AnyJLArray{T,N} = Union{JLArray{T,N}, WrappedArray{T,N,JLArray,JLArray{T,N}}} + +""" + AnyJLVector{T} + +Shortcut for `AnyJLArray{T,1}`. +""" AnyJLVector{T} = AnyJLArray{T,1} + +""" + AnyJLMatrix{T} + +Shortcut for `AnyJLArray{T,2}`. +""" AnyJLMatrix{T} = AnyJLArray{T,2} + +""" + AnyJLVecOrMat{T} + +Shortcut for `Union{AnyJLVector{T}, AnyJLMatrix{T}}`. +""" AnyJLVecOrMat{T} = Union{AnyJLVector{T}, AnyJLMatrix{T}} @@ -436,6 +576,12 @@ end JLArray{T,N}(xs::JLArray{T,N}) where {T,N} = xs # adapt for the GPU + +""" + jl(x) + +Adapt an object `x` to the `JLArray` backend. +""" jl(xs) = adapt(JLArray, xs) ## don't convert isbits types since they are already considered GPU-compatible Adapt.adapt_storage(::Type{JLArray}, xs::AbstractArray) =
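As a quick illustration of `jl` (a sketch; the NamedTuple is arbitrary): because it goes through Adapt, it recurses into wrapper structures, so whole collections of arrays can be moved at once:

```julia
using JLArrays

params = (; weight = rand(Float32, 2, 2), bias = rand(Float32, 2))
gpu_params = jl(params)          # adapts every array inside the NamedTuple
typeof(gpu_params.weight)        # JLArray{Float32, 2}, i.e. a JLMatrix{Float32}
```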
diff --git a/src/device/sparse.jl b/src/device/sparse.jl index b8346eafe..db7fc3d4f 100644 --- a/src/device/sparse.jl +++ b/src/device/sparse.jl @@ -12,9 +12,25 @@ using SparseArrays export GPUSparseDeviceVector, GPUSparseDeviceMatrixCSC, GPUSparseDeviceMatrixCSR, GPUSparseDeviceMatrixBSR, GPUSparseDeviceMatrixCOO +""" + AbstractGPUSparseDeviceMatrix{Tv, Ti} + +Supertype for GPU sparse matrices with value type `Tv` and index type `Ti`. +""" abstract type AbstractGPUSparseDeviceMatrix{Tv, Ti} <: AbstractSparseMatrix{Tv, Ti} end +""" + GPUSparseDeviceVector{Tv,Ti,Vi,Vv} + +Sparse vector with generic backing, similar to `SparseArrays.SparseVector`. + +# Fields + +- `iPtr::Vi`: indices of non-zero coefficients +- `nzVal::Vv`: values of non-zero coefficients +- `len::Int`: size of the vector +- `nnz::Ti`: number of non-zero coefficients +""" struct GPUSparseDeviceVector{Tv,Ti,Vi,Vv, A} <: AbstractSparseVector{Tv,Ti} iPtr::Vi nzVal::Vv @@ -28,6 +44,19 @@ SparseArrays.nnz(g::GPUSparseDeviceVector) = g.nnz SparseArrays.nonzeroinds(g::GPUSparseDeviceVector) = g.iPtr SparseArrays.nonzeros(g::GPUSparseDeviceVector) = g.nzVal +""" + GPUSparseDeviceMatrixCSC{Tv,Ti,Vi,Vv} + +Sparse matrix in Compressed Sparse Column format with generic backing. + +# Fields + +- `colPtr::Vi`: column `j` maps to indices `colPtr[j]:(colPtr[j+1]-1)` in `rowVal` and `nzVal` +- `rowVal::Vi`: row indices for non-zero coefficients +- `nzVal::Vv`: values of non-zero coefficients +- `dims::NTuple{2,Int}`: size of the matrix +- `nnz::Ti`: number of non-zero coefficients +""" struct GPUSparseDeviceMatrixCSC{Tv,Ti,Vi,Vv,A} <: AbstractGPUSparseDeviceMatrix{Tv, Ti} colPtr::Vi rowVal::Vi nzVal::Vv @@ -66,6 +95,19 @@ function SparseArrays.nnz(x::GPUSparseDeviceColumnView) return length(SparseArrays.nzrange(A, colidx)) end +""" + GPUSparseDeviceMatrixCSR{Tv,Ti,Vi,Vv} + +Sparse matrix in Compressed Sparse Row format with generic backing. + +# Fields + +- `rowPtr::Vi`: row `i` maps to indices `rowPtr[i]:(rowPtr[i+1]-1)` in `colVal` and `nzVal` +- `colVal::Vi`: column indices for non-zero coefficients +- `nzVal::Vv`: values of non-zero coefficients +- `dims::NTuple{2,Int}`: size of the matrix +- `nnz::Ti`: number of non-zero coefficients +""" struct GPUSparseDeviceMatrixCSR{Tv,Ti,Vi,Vv,A} <: AbstractGPUSparseDeviceMatrix{Tv,Ti} rowPtr::Vi colVal::Vi nzVal::Vv @@ -84,16 +126,44 @@ end end end +""" + GPUSparseDeviceMatrixBSR{Tv,Ti,Vi,Vv} + +Sparse matrix in Block Compressed Sparse Row format with generic backing. + +# Fields + +- `rowPtr::Vi`: row `i` maps to indices `rowPtr[i]:(rowPtr[i+1]-1)` in `colVal` and `nzVal` +- `colVal::Vi`: column indices for the top-left corners of the blocks +- `nzVal::Vv`: values of non-zero coefficients +- `dims::NTuple{2,Int}`: size of the matrix +- `blockDim::Ti`: number of rows = number of columns in a block +- `dir::Char`: storage order within each block, typically `'R'` (row-major) or `'C'` (column-major) +- `nnz::Ti`: number of non-zero coefficients +""" struct GPUSparseDeviceMatrixBSR{Tv,Ti,Vi,Vv,A} <: AbstractGPUSparseDeviceMatrix{Tv,Ti} rowPtr::Vi colVal::Vi nzVal::Vv dims::NTuple{2,Int} - blockDim::Ti - dir::Char + blockDim::Ti # TODO: rectangular blocks? + dir::Char # TODO: document nnz::Ti end +""" + GPUSparseDeviceMatrixCOO{Tv,Ti,Vi,Vv} + +Sparse matrix in COOrdinate format with generic backing. + +# Fields + +- `rowInd::Vi`: row indices for non-zero coefficients +- `colInd::Vi`: column indices for non-zero coefficients +- `nzVal::Vv`: values of non-zero coefficients +- `dims::NTuple{2,Int}`: size of the matrix +- `nnz::Ti`: number of non-zero coefficients +""" struct GPUSparseDeviceMatrixCOO{Tv,Ti,Vi,Vv, A} <: AbstractGPUSparseDeviceMatrix{Tv,Ti} rowInd::Vi colInd::Vi diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl index 0080935e9..bab7ee54c 100644 --- a/src/host/abstractarray.jl +++ b/src/host/abstractarray.jl @@ -54,6 +54,27 @@ end # per-object state, with a flag to indicate whether the object has been freed. # this is to support multiple calls to `unsafe_free!` on the same object, # while only lowering the reference count of the underlying data once. + +""" + DataRef + +A helper type to manage the storage of an array. + +There are multiple reasons we don't just put the data directly in a `GPUArray` struct: +- to share data between multiple arrays, e.g., to create views; +- to be able to free data early, relieving GC pressure. + +To support this, wrap the data in a `DataRef` instead, and use it with the following methods: +- `ref[]`: get the data; +- `copy(ref)`: create a new reference, increasing the reference count; +- `unsafe_free!(ref)`: decrease the reference count, and free the data if it reaches 0. + +The contained `RefCounted` struct should not be used directly. + +The `freed` flag indicates whether the object has been freed. This supports multiple +calls to `unsafe_free!` on the same object, while only lowering the reference count +of the underlying data once. +""" mutable struct DataRef{D} rc::RefCounted{D} freed::Bool
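A sketch of the intended usage pattern; `make_ref` is a hypothetical stand-in for whatever `DataRef` constructor a backend uses, which this hunk does not show:

```julia
ref = make_ref()          # hypothetical helper returning a DataRef
buf = ref[]               # access the underlying storage
ref2 = copy(ref)          # share the data, e.g. for a view (refcount goes up)
unsafe_free!(ref2)        # drop one reference; data stays alive through `ref`
unsafe_free!(ref)         # last reference released, the storage is freed
unsafe_free!(ref)         # safe no-op, thanks to the `freed` flag
```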
diff --git a/src/host/construction.jl b/src/host/construction.jl index 1a4d8f7b5..829626808 100644 --- a/src/host/construction.jl +++ b/src/host/construction.jl @@ -83,7 +83,11 @@ function hasfieldcount(@nospecialize(dt)) return true end -# for finding specific element types, e.g., when Float64 is unsupported +""" + contains_eltype(T, typ) + +Check whether the specific element type `T` occurs inside `typ`, e.g., when `Float64` is unsupported. +""" function contains_eltype(T, typ) if T === typ return true @@ -99,15 +103,19 @@ function contains_eltype(T, typ) return false end -# Types that are allocated inline include: -# 1. plain bitstypes (`Int`, `(Float16, Float32)`, plain immutable structs, etc). -# these are simply stored contiguously in the buffer. -# 2. structs of unions (`struct Foo; x::Union{Int, Float32}; end`) -# these are stored with a selector at the end (handled by Julia). -# 3. bitstype unions (`Union{Int, Float32}`, etc) -# these are stored contiguously and require a selector array (handled by us) -# -# This function explains why a type is not allocated inline. +""" + explain_allocatedinline(T[, depth; maxdepth]) + +Explain why the type `T` is not allocated inline. + +Types that are allocated inline include: +1. plain bitstypes (`Int`, `(Float16, Float32)`, plain immutable structs, etc.). + These are simply stored contiguously in the buffer. +2. structs of unions (`struct Foo; x::Union{Int, Float32}; end`). + These are stored with a selector at the end (handled by Julia). +3. bitstype unions (`Union{Int, Float32}`, etc.). + These are stored contiguously and require a selector array (handled by us). +""" function explain_allocatedinline(@nospecialize(T), depth=0; maxdepth=10) depth > maxdepth && return ""
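The three inline-allocated categories can be checked directly with `Base.allocatedinline`, the Base predicate these rules correspond to (an illustration, not part of this diff):

```julia
struct Foo; x::Union{Int, Float32}; end     # a struct-of-union

Base.allocatedinline(Int)                   # true: plain bitstype
Base.allocatedinline(Foo)                   # true: selector stored at the end
Base.allocatedinline(Union{Int, Float32})   # true: needs a separate selector array
Base.allocatedinline(String)                # false: stored as a reference
```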