init

jaksle · jaksle · commit 3e94a5f6d871 · 2025-11-01T20:37:27.000+01:00
diff --git a/Project.toml b/Project.toml
@@ -1,20 +1,22 @@
 name = "KernelDensity"
 uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
-authors = ["Simon Byrne and various contributors"]
 version = "0.6.8"
+authors = ["Simon Byrne and various contributors"]
 
 [deps]
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
+IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Distributions = "0.23, 0.24, 0.25"
 DocStringExtensions = "0.8, 0.9"
 FFTW = "1"
 Interpolations = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15"
+IrrationalConstants = "0.2.6"
 StatsBase = "0.33, 0.34"
 julia = "1"
 
diff --git a/src/KernelDensity.jl b/src/KernelDensity.jl
@@ -5,9 +5,17 @@ using StatsBase
 using Distributions
 using Interpolations
 
-import Distributions: twoπ, pdf
+import IrrationalConstants: twoπ
+import Base: getproperty, propertynames
+
+import Distributions: pdf
+import Distributions: ncomponents, component, probs
+
 import FFTW: rfft, irfft
 
+export DiscretisedPDF, KernelEstimate, BandwidthMethod, Silverman, LSCV
+export kernel_estimate, precompute!
+
 export kde, kde_lscv, UnivariateKDE, BivariateKDE, InterpKDE, pdf
 
 abstract type AbstractKDE end
@@ -18,4 +26,7 @@ include("univariate.jl")
 include("bivariate.jl")
 include("interp.jl")
 
-end # module
+include("initialisation.jl")
+include("bandwidth_selection.jl")
+
+end
diff --git a/src/bandwidth_selection.jl b/src/bandwidth_selection.jl
@@ -0,0 +1,74 @@
+
+
+# Silverman's rule of thumb for KDE bandwidth selection.
+# Returns the tuple `(bandwidth, nothing)`; the second slot is where other methods
+# (see LSCV) return a DiscretisedPDF computed along the way, and this rule has none.
+# `kernelType` and `prior` are accepted for a uniform call signature and are unused here.
+# `alpha` is the multiplicative constant of the rule.
+function get_bandwidth(::Silverman, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)), alpha::Float64 = 0.9)
+    ndata = length(data)
+    # With fewer than two points no spread can be estimated: fall back to `alpha`, in
+    # the same tuple shape as the normal return so callers can always destructure it.
+    ndata <= 1 && return (alpha, nothing)
+
+    # Calculate width using standard deviation and IQR
+    var_width = std(data)
+    q25, q75 = quantile(data, (0.25, 0.75))
+    quantile_width = (q75 - q25) / 1.34
+
+    # Deal with edge cases with 0 IQR or variance
+    width = min(var_width, quantile_width)
+    if width == 0.0
+        width = var_width == 0.0 ? 1.0 : var_width
+    end
+
+    return alpha * width * ndata^(-0.2), nothing
+end
+
+@kwdef struct LSCV <: BandwidthMethod  # least-squares cross-validation bandwidth selector
+    nPoints::Int = 2048                # number of grid points the data is discretised on
+    initBandwidth::Float64 = NaN       # starting bandwidth; NaN means "take it from Silverman's rule"
+end
+
+# Select bandwidth using least-squares cross validation; returns (bandwidth, DiscretisedPDF). From:
+#   Density Estimation for Statistics and Data Analysis
+#   B. W. Silverman (1986)
+#   sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66)
+function get_bandwidth(lscv::LSCV, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)))
+    K = lscv.nPoints  # number of grid points
+    initBandwidth::Float64 = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
+    ndata = length(data)
+    lo, hi = extrema(data)
+    midpoints = range(lo - 4.0*initBandwidth, hi + 4.0*initBandwidth, K)  # grid reaches 4 initial bandwidths past the data
+    initDen = tabulate(data, midpoints, prior).values
+
+    # the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
+    ft = rfft(initDen)
+    # squared moduli of the transform; the score below is built from these only
+    ft2 = abs2.(ft)
+    c = -twoπ/(step(midpoints)*K)
+    hlb, hub = 0.25*initBandwidth, 1.5*initBandwidth
+    # NOTE(review): `optimize` is defined outside this section; presumably a 1D minimiser of the score over [hlb, hub], confirm
+    optimalBandwidth = optimize(hlb, hub) do h
+        dist = kernel_dist(kernelType, h)
+        ψ = 0.0
+        for j in 1:length(ft2)-1  # the zero-frequency entry ft2[1] is left out of the sum
+            ks = real(cf(dist, j*c))
+            ψ += ft2[j+1]*(ks-2.0)*ks
+        end
+        ψ*step(midpoints)/K + pdf(dist,0.0)/ndata
+    end
+    # multiply the transform by the characteristic function of the kernel at the selected bandwidth
+    dist = kernel_dist(kernelType, optimalBandwidth)
+    for j in 0:length(ft)-1
+        ft[j+1] *= cf(dist, j*c)
+    end
+    # Invert the Fourier transform to get the KDE
+   convolved = irfft(ft, K)
+
+    # fix rounding error.
+    convolved .= max.(0., convolved)
+
+    # Return the selected bandwidth together with the KDE values on `midpoints`
+    optimalBandwidth, DiscretisedPDF(convolved, midpoints)
+end
diff --git a/src/initialisation.jl b/src/initialisation.jl
@@ -0,0 +1,141 @@
+# Values on a regular N-dimensional grid: `values` holds the numbers, `labels` holds one
+# coordinate range per dimension (all of one range type R). Sizes are not cross-checked.
+struct DiscretisedPDF{N,R,T}
+    values::Array{T,N}
+    labels::NTuple{N,R}
+    DiscretisedPDF(values::Array{T,N}, labels::NTuple{N,R}) where {N,T<:Real,R<:AbstractRange} = new{N,R,T}(values, labels)
+end
+# 1D convenience constructor: wraps the single range into a 1-tuple of labels
+DiscretisedPDF(values::Vector, label::AbstractRange) = DiscretisedPDF(values, (label,))
+
+# Axis shortcuts for DiscretisedPDF: `d.xs`, `d.ys` and `d.zs` return the 1st, 2nd and
+# 3rd entry of `d.labels`; every other name is looked up as an ordinary field.
+# Asking for an axis beyond the dimension of `d` fails when indexing `labels`.
+function Base.getproperty(d::DiscretisedPDF, prop::Symbol)
+    prop == :xs && return d.labels[1]
+    prop == :ys && return d.labels[2]
+    prop == :zs && return d.labels[3]
+    return getfield(d, prop)
+end
+
+# property names reported for introspection: the two fields plus the axis shortcuts
+Base.propertynames(::DiscretisedPDF) = (:values, :labels, :xs, :ys, :zs)
+
+# Kernel estimate as a mixture model: holds the data, the prior over data points, the kernel, its bandwidth and an optional discretised pdf
+mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Distribution, PriorDist<:UnivariateDistribution{Discrete}, PDF<:DiscretisedPDF} <: AbstractMixtureModel{VF, VS, KernelType}
+    const data::Matrix{Float64}  # one observation per column (ncomponents counts columns)
+    const prior::PriorDist  # discrete distribution whose probs are the component weights
+    const kernel::KernelType
+    const bandwidth::Float64
+    precomputedPDF::Union{Nothing, PDF}  # the only reassignable field; set by precompute!
+
+    # inner constructors: these guarantee type agreement and nothing more (arguments are not checked against each other)
+    function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, precomputedPDF::PDF) where {
+            PriorDist<:UnivariateDistribution{Discrete},
+            KernelType <: Distribution,
+            PDF <: DiscretisedPDF}
+        VF, VS = supertype(KernelType).parameters  # variate form and value support read off the kernel's supertype
+        new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, precomputedPDF)
+    end
+    function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, ::Nothing) where {
+            PriorDist<:UnivariateDistribution{Discrete},
+            KernelType <: Distribution}
+        VF, VS = supertype(KernelType).parameters
+        R = Base.return_types(range,(Float64,Float64,Int))[1]  # range type for a pdf stored later; presumably what precompute! builds, confirm
+        PDF = DiscretisedPDF{size(data)[1],R,eltype(data)}  # grid dimension = number of rows of data
+        new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, nothing)
+    end
+
+end
+
+UnivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Univariate, VS, K, P, PDF}  # alias fixing the variate form, used for dispatch
+MultivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Multivariate, VS, K, P, PDF}  # same, for the multivariate case
+# supertype of the bandwidth selection rules that get_bandwidth dispatches on
+abstract type BandwidthMethod end
+
+# default algorithm: Silverman's rule of thumb (the default `method` of kernel_estimate)
+struct Silverman<:BandwidthMethod end
+
+# implementing common interface of AbstractMixtureModel
+ncomponents(ke::KernelEstimate) = size(ke.data)[2]  # one component per data column
+# component k is the zero-centred kernel shifted onto data point k, i.e. density K(x - x_k), hence `+`
+component(ke::UnivariateKernelEstimate, k) = ke.kernel + ke.data[1,k]
+component(ke::MultivariateKernelEstimate, k) = ke.kernel + ke.data[:,k]
+probs(ke::KernelEstimate) = probs(ke.prior)  # component weights come from the prior
+
+# creating KernelEstimate instance
+
+# make kernel density given bandwidth; `data` of any Real eltype is stored as a 1×n Float64 matrix
+function kernel_estimate(data::Vector{<:Real}, bandwidth::Real, kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data)))
+    points = convert(Matrix{Float64}, reshape(data, 1, length(data)))  # no copy when already Float64
+    return KernelEstimate(points, prior, kernel_dist(kernelType, bandwidth), Float64(bandwidth), nothing)
+end
+
+# find bandwidth, then make kernel density; whatever the method returns as second value (a DiscretisedPDF or `nothing`) is stored as precomputedPDF
+function kernel_estimate(data::Vector{<:Real}, method::BandwidthMethod = Silverman(), kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data)))
+    bandwidth, kde = get_bandwidth(method, data, kernelType, prior)
+    points = convert(Matrix{Float64}, reshape(data, 1, length(data)))  # accept any Real eltype; no copy when already Float64
+    return KernelEstimate(points, prior, kernel_dist(kernelType, bandwidth), Float64(bandwidth), kde)
+end
+
+
+# construct kernel from bandwidth: centred at zero; Normal(0, b) and Uniform(-√3 b, √3 b) both have standard deviation b
+kernel_dist(::Type{Normal}, bandwidth::Real) = Normal(0.0, bandwidth)
+kernel_dist(::Type{Uniform}, bandwidth::Real) = Uniform(-√3*bandwidth, √3*bandwidth)
+# families built as D(0.0, scale); scale = w / std(D(0.0,1.0)), presumably so the kernel's std equals w (assumes std is proportional to the scale)
+const LocationScale = Union{Laplace, Logistic, SymTriangularDist, Cosine, Epanechnikov}
+kernel_dist(::Type{D},w::Real) where {D<:LocationScale} = D(0.0, w/std(D(0.0,1.0)))
+
+# 1D precomputation: bin the data on an `nPoints` grid reaching 4 bandwidths past the
+# extremes, smooth the bins with the kernel, then store and return the resulting pdf
+function precompute!(ke::UnivariateKernelEstimate, nPoints::Int = 2048)
+    xmin, xmax = extrema(ke.data)
+    grid = range(xmin - 4.0*ke.bandwidth, xmax + 4.0*ke.bandwidth, nPoints)
+    return ke.precomputedPDF = conv(tabulate(vec(ke.data), grid, ke.prior), ke.kernel)
+end
+
+function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
+    # Linear binning (cf. Jones and Lotwick): observation i, weighted by pdf(prior, i),
+    # is shared between the two grid points around it in proportion to how close it is
+    # to each. A binned observation adds pdf(prior, i)/step in total. Observations with
+    # no such pair of grid points (lower index < 1 or upper index > npoints) are skipped.
+    npoints = length(midpoints)
+    s = step(midpoints)
+    scale = 1.0 / (s*s)
+    grid = zeros(Float64, npoints)
+
+    for (i,x) in enumerate(data)
+        upper = searchsortedfirst(midpoints,x)
+        lower = upper-1
+        (1 <= lower <= npoints-1) || continue
+        weight = pdf(prior,i)
+        grid[lower] += (midpoints[upper]-x)*scale*weight
+        grid[upper] += (x-midpoints[lower])*scale*weight
+    end
+    return DiscretisedPDF(grid, midpoints)
+end
+
+# Convolve a 1D discretised density with `kernel` by multiplying the real FFT of the
+# grid values with the kernel's characteristic function and transforming back.
+function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
+    npoints = length(den.values)
+    spectrum = rfft(den.values)
+
+    # Convolve fft with characteristic function of kernel
+    # empirical cf
+    #  = \sum_{n=1}^N e^{i*t*X_n} / N
+    #  = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
+    #  = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
+    #  = A * fft(N_k/N)[-t*s*K/2pi + 1]
+    # hence the entry at 1-based index m is paired with t = (m-1)*freqstep
+    freqstep = -twoπ/(step(den.xs)*npoints)
+    for m in eachindex(spectrum)
+        spectrum[m] *= cf(kernel, (m-1)*freqstep)
+    end
+
+    # Invert the Fourier transform to get the KDE, then clamp the negatives left by rounding error
+    smoothed = irfft(spectrum, npoints)
+    map!(v -> max(0., v), smoothed, smoothed)
+
+    return DiscretisedPDF(smoothed, den.xs)
+end
diff --git a/src/pull.md b/src/pull.md
@@ -0,0 +1,15 @@
+The current capabilities of `KernelDensity` are quite limited, especially if we compare them to what is available e.g. in SciPy, and this is supposed to be the main source of kernel estimation tools in the Julia statistical ecosystem.  The current interface and type system were made for calculating a discretised pdf using the Fourier transform and not for much else. This is a proposal for a completely new type system and interface.
+
+The main principles of this proposition are:
+- The new `KernelEstimate` datatype contains full information about the kernel fit, so a statistician can infer all they possibly want from it.
+- `KernelEstimate` is a subtype of `AbstractMixtureModel`, so it fits well with the rest of `JuliaStats`, can be used much more easily in other tasks and has a lot of useful default methods already implemented in `Distributions`.
+- This type system can be used for kernel estimation in any dimension.
+- The discretised values of the pdf are distinct from `KernelEstimate`. However, they can be stored in it, and if so, getting the pdf by interpolating them becomes available. If they are not present, different methods for getting the pdf can be used (e.g. the naive sum formula, which is completely reasonable for small samples). They can also be added later using `precompute!`.
+
+The current approach does not allow for adaptive kde. You can allow for it with the same design, but for the sake of efficiency and code simplicity it might be better to add something like `AdaptiveKernelEstimate` if such a need arises.
+
+I've currently implemented methods only for 1D. Translating the 2D case should be easy, but it might be best to forget about it and implement arbitrary dimension from the get-go. However, it is better to do that after getting some feedback.
+
+The old interface may or may not stay for backwards compatibility.
+
+As it stands, this proposal does not add any new numerical methods, but even simply linking it to `Distributions` adds a lot of capabilities to this library which, for example, SciPy or scikit-learn do not have, making it, I hope, considerably more attractive.
diff --git a/src/test.jl b/src/test.jl
@@ -0,0 +1,39 @@
+# use scale instead of std
+# LocationScale ?
+# you can use support to detect 
+# argument for boundary?
+# give any kernel?
+# fix cosine distribution
+
+# add precompute!
+
+using Distributions
+using Plots
+using .KernelDensity
+
+#
+
+points = [1. 2 3 4 5]
+cat = Categorical([1/5,1/5,1/5,1/5,1/5])
+
+ds = DiscretisedPDF(fill(1/20,20),LinRange(0,1,20))
+
+k = KernelEstimate(points,cat, Normal(0,1),1.0, nothing)
+
+X = randn(10^5)
+
+k1 = kernel_estimate(X,0.2,Normal)
+
+k2 = kernel_estimate(X,Silverman(),Epanechnikov)
+
+precompute!(k2)
+
+# only k2 has been precomputed; k1.precomputedPDF is still `nothing`
+kde = k2.precomputedPDF
+plot(kde.xs,kde.values)
+# direct construction: five arguments are required, kernel_dist is qualified because it is not exported, and the prior has one weight per data column
+KernelEstimate(reshape(X,1,length(X)), DiscreteUniform(1,length(X)), KernelDensity.kernel_dist(Normal,1.0), 1.0, nothing)
+
+ncomponents(k)
+component(k,3)
+probs(k)