diff --git a/Project.toml b/Project.toml index e3b9195a..c431d83c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,20 +1,22 @@ name = "KernelDensity" uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" +version = "0.6.8" authors = ["Simon Byrne and various contributors"] -version = "0.6.10" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" +IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Distributions = "0.23, 0.24, 0.25" DocStringExtensions = "0.8, 0.9" FFTW = "1" -Interpolations = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16" +Interpolations = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15" +IrrationalConstants = "0.2.4" StatsBase = "0.33, 0.34" julia = "1" diff --git a/src/KernelDensity.jl b/src/KernelDensity.jl index 3fd158ca..af4c84b3 100644 --- a/src/KernelDensity.jl +++ b/src/KernelDensity.jl @@ -5,18 +5,29 @@ using StatsBase using Distributions using Interpolations -import Distributions: twoπ, pdf +import IrrationalConstants: twoπ +import Base: getproperty, propertynames + +import Distributions: pdf +import Distributions: ncomponents, component, probs + import FFTW: rfft, irfft +export DiscretisedPDF, KernelEstimate, BandwidthMethod, Silverman, LSCV +export kernel_estimate, precompute! + export kde, kde_lscv, UnivariateKDE, BivariateKDE, InterpKDE, pdf abstract type AbstractKDE end -# define broadcasting behavior Base.Broadcast.broadcastable(x::AbstractKDE) = Ref(x) include("univariate.jl") include("bivariate.jl") include("interp.jl") -end # module +include("initialisation.jl") +include("bandwidth_selection.jl") +include("feature_computation.jl") + +end diff --git a/src/bandwidth_selection.jl b/src/bandwidth_selection.jl new file mode 100644 index 00000000..9f11e179 --- /dev/null +++ b/src/bandwidth_selection.jl @@ -0,0 +1,84 @@ + + +# Silverman's rule of thumb for KDE bandwidth selection +function get_bandwidth(::Silverman, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)), alpha::Float64 = 0.9) + # Determine length of data + ndata = length(data) + ndata <= 1 && return alpha + + # Calculate width using variance and IQR + var_width = std(data) + q25, q75 = quantile(data, (0.25, 0.75)) + quantile_width = (q75 - q25) / 1.34 + + # Deal with edge cases with 0 IQR or variance + width = min(var_width, quantile_width) + if width == 0.0 + if var_width == 0.0 + width = 1.0 + else + width = var_width + end + end + + # Set bandwidth using Silverman's rule of thumb + return alpha * width * ndata^(-0.2), nothing +end + +struct LSCV <: BandwidthMethod # default args in constructor for backwards comp + nPoints::Int + initBandwidth::Float64 + boundary::Tuple{Float64,Float64} + LSCV(nPoints = 2048, initBandwidth = NaN, boundary = (NaN,NaN)) = new(nPoints, initBandwidth, boundary) +end + +# Select bandwidth using least-squares cross validation, from: +# Density Estimation for Statistics and Data Analysis +# B. W. Silverman (1986) +# sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66) +function get_bandwidth(lscv::LSCV, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data))) + K = lscv.nPoints + initBandwidth = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth + boundaryLow, boundaryHigh = lscv.boundary + if isnan(boundaryLow) || isnan(boundaryHigh) + lo, hi = extrema(data) + boundaryLow = isnan(boundaryLow) ? lo - 4.0*initBandwidth : boundaryLow + boundaryHigh = isnan(boundaryHigh) ? hi + 4.0*initBandwidth : boundaryHigh + end + + ndata = length(data) + + midpoints = range(boundaryLow, boundaryHigh, K) + + initDen = tabulate(data, midpoints, prior).values + + # the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book + ft = rfft(initDen) + + ft2 = abs2.(ft) + c = -twoπ/(step(midpoints)*K) + hlb, hub = 0.25*initBandwidth, 1.5*initBandwidth + + optimalBandwidth = optimize(hlb, hub) do h + dist = kernel_dist(kernelType, h) + ψ = 0.0 + for j in 1:length(ft2)-1 + ks = real(cf(dist, j*c)) + ψ += ft2[j+1]*(ks-2.0)*ks + end + ψ*step(midpoints)/K + pdf(dist,0.0)/ndata + end + + dist = kernel_dist(kernelType, optimalBandwidth) + for j in 0:length(ft)-1 + ft[j+1] *= cf(dist, j*c) + end + + convolved = irfft(ft, K) + + # fix rounding error. + convolved .= max.(0., convolved) + + # Invert the Fourier transform to get the KDE + optimalBandwidth, DiscretisedPDF(convolved, midpoints) +end \ No newline at end of file diff --git a/src/exampleUse.jl b/src/exampleUse.jl new file mode 100644 index 00000000..8c05c1f3 --- /dev/null +++ b/src/exampleUse.jl @@ -0,0 +1,48 @@ + +using Plots + +using Distributions +using .KernelDensity + +## + +points = [1. 2 3 4 5] +cat = Categorical([1/5,1/5,1/5,1/5,1/5]) + +ds = DiscretisedPDF(fill(1/20,20),LinRange(0,1,20)) + +k = KernelEstimate(points,cat, Normal(0,1),1.0, nothing) +ncomponents(k1) +component(k1,3) +probs(k1) + +## + +X = randn(10^3) + +k1 = kernel_estimate(X, 0.2, Normal) + +k2 = kernel_estimate(X, Silverman(), Epanechnikov) + +k3 = kernel_estimate(X, LSCV(), Epanechnikov) + +precompute!(k2,2048,(-5,5)) + +pdf(k2, 1) +pdf.(k2, [1, 2, 3]) +pdf(k2, 1, :precomputed) +pdf.(k2, [1, 2, 3], :precomputed) + +cdf(k2, 1) +mean(k2), var(k2) +quantile(k2, 0.9) + +rand(k2) + +kde = k2.precomputedPDF[] # simpler access can be added +plot(kde.xs,kde.values,label = "precomputed") + +xs = LinRange(-4,4,100) +plot!(xs,pdf.(k2, xs),label = "naive") + + diff --git a/src/feature_computation.jl b/src/feature_computation.jl new file mode 100644 index 00000000..92c71f38 --- /dev/null +++ b/src/feature_computation.jl @@ -0,0 +1,97 @@ + +# 1D pdf precomputation + +function precompute!(ke::UnivariateKernelEstimate, nPoints::Integer = 2048, + boundary::Tuple{<:Real,<:Real} =((lo, hi) = extrema(ke.data); (lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth))) + + # find the element type of range in ke.precomputedPDF + T = eltype(typeof(ke).parameters[5].parameters[2]) + midpoints = range(T(boundary[1]), T(boundary[2]), Int(nPoints)) + ke.precomputedPDF[] = conv(tabulate(vec(ke.data), midpoints, ke.prior), ke.kernel) +end + +function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete}) + npoints = length(midpoints) + s = step(midpoints) + + # Set up a grid for discretized data + grid = zeros(Float64, npoints) + ainc = 1.0 / (s*s) + + # weighted discretization (cf. Jones and Lotwick) + for (i,x) in enumerate(data) + k = searchsortedfirst(midpoints,x) + j = k-1 + if 1 <= j <= npoints-1 + grid[j] += (midpoints[k]-x)*ainc*pdf(prior,i) + grid[k] += (x-midpoints[j])*ainc*pdf(prior,i) + end + end + + return DiscretisedPDF(grid, midpoints) +end + +function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange} + # Transform to Fourier basis + K = length(den.values) + ft = rfft(den.values) + + # Convolve fft with characteristic function of kernel + # empirical cf + # = \sum_{n=1}^N e^{i*t*X_n} / N + # = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N + # = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N + # = A * fft(N_k/N)[-t*s*K/2pi + 1] + c = -twoπ/(step(den.xs)*K) + for j in 0:length(ft)-1 + ft[j+1] *= cf(kernel,j*c) + end + + # Invert the Fourier transform to get the KDE + convolved = irfft(ft, K) + + # fix rounding error. + convolved .= max.(0., convolved) + + DiscretisedPDF(convolved, den.xs) +end + +# pdf methods + +# pdf(ke, x) is implemented in Distributions.jl + +function pdf(ke::UnivariateKernelEstimate, x::Real, method::Symbol) + if method == :precomputed + den = ke.precomputedPDF[] + den === nothing && error("PDF must be first precomputed.") + itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid())))) + itp = scale(itp_u, den.xs) + etp = extrapolate(itp, 0.) + return etp.itp(x) + elseif method == :naive + return pdf(ke, x) + else + error("Method not available.") + end +end + +# custom broadcast prepares for interpolation only once for all xs +function Base.Broadcast.broadcasted(::typeof(pdf), ke::UnivariateKernelEstimate, xs, method::Symbol) + if method == :precomputed + isassigned(ke.precomputedPDF) || error("PDF must be first precomputed.") + den = ke.precomputedPDF[] + itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid())))) + itp = scale(itp_u, den.xs) + etp = extrapolate(itp, 0.) + return etp.itp.(xs) + elseif method == :naive + return pdf.(ke, x) + else + error("Method not available.") + end +end + + +# it is possible to add cdf(ke, :precomputed) + +# it is possibble to add cf(ke, t) \ No newline at end of file diff --git a/src/initialisation.jl b/src/initialisation.jl new file mode 100644 index 00000000..8cd4693e --- /dev/null +++ b/src/initialisation.jl @@ -0,0 +1,95 @@ + + +struct DiscretisedPDF{N,R,T} + values::Array{T,N} + labels::NTuple{N,R} + DiscretisedPDF(values::Array{T,N}, labels::NTuple{N,R}) where {N,T<:Real,R<:AbstractRange} = new{N,R,T}(values, labels) +end + +DiscretisedPDF(values::Vector, label::AbstractRange) = DiscretisedPDF(values, (label,)) + +# provides d.xs, d.ys, d.zs for convenience +function Base.getproperty(d::DiscretisedPDF, prop::Symbol) + if prop == :xs + return d.labels[1] + elseif prop == :ys + return d.labels[2] + elseif prop == :zs + return d.labels[3] + else getfield(d, prop) + end +end +Base.propertynames(::DiscretisedPDF) = (:values, :labels, :xs, :ys, :zs) + +# note: in v1.8+ 4 first fields can be made constant using const +struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Distribution, PriorDist<:UnivariateDistribution{Discrete}, PDF<:DiscretisedPDF} <: AbstractMixtureModel{VF, VS, KernelType} + data::Matrix{Float64} + prior::PriorDist + kernel::KernelType + bandwidth::Float64 + precomputedPDF::Base.RefValue{PDF} # the value here is mutable + + # these constructors guarantee type agreement + function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, precomputedPDF::PDF) where { + PriorDist<:UnivariateDistribution{Discrete}, + KernelType <: Distribution, + PDF <: DiscretisedPDF} + VF, VS = supertype(KernelType).parameters + new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, Ref(precomputedPDF)) + end + function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, ::Nothing) where { + PriorDist<:UnivariateDistribution{Discrete}, + KernelType <: Distribution} + VF, VS = supertype(KernelType).parameters + + # the default PDF type is based on Float64 and Int + R = Base.return_types(range,(Float64,Float64,Int))[1] + PDF = DiscretisedPDF{size(data)[1],R,Float64} + new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, Ref{PDF}()) + end + +end + +UnivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Univariate, VS, K, P, PDF} +MultivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Multivariate, VS, K, P, PDF} + +# KernelEstimate is a scalar +Base.broadcastable(ke::KernelEstimate) = Ref(ke) + +# It is possible to add linear transformations a*ke + b + +abstract type BandwidthMethod end + +# default algorithm +struct Silverman<:BandwidthMethod end + +# implementing common interface of AbstractMixtureModel +ncomponents(ke::KernelEstimate) = size(ke.data)[2] +component(ke::UnivariateKernelEstimate, k) = ke.kernel + ke.data[1,k] +component(ke::MultivariateKernelEstimate, k) = ke.kernel + ke.data[:,k] +probs(ke::KernelEstimate) = probs(ke.prior) + + +# creating KernelEstimate instance + +# make kernel density given bandwidth +function kernel_estimate(data::Vector{<:Real}, bandwidth::Real, kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data))) + kernel = kernel_dist(kernelType, bandwidth) + KernelEstimate(reshape(data, 1, length(data)), prior, kernel, Float64(bandwidth), nothing) +end + +# find bandwidth, then make kernel density +function kernel_estimate(data::Vector{<:Real}, method::BandwidthMethod = Silverman(), kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data))) + bandwidth, kde = get_bandwidth(method, data, kernelType, prior) + kernel = kernel_dist(kernelType, bandwidth) + KernelEstimate(reshape(data,1,length(data)), prior, kernel, bandwidth, kde) +end + +# Can add kernel_estimate which takes prior as a vector. + +# construct kernel from bandwidth +kernel_dist(::Type{Normal}, bandwidth::Real) = Normal(0.0, bandwidth) +kernel_dist(::Type{Uniform}, bandwidth::Real) = Uniform(-√3*bandwidth, √3*bandwidth) + +const LocationScale = Union{Laplace, Logistic, SymTriangularDist, Cosine, Epanechnikov} +kernel_dist(::Type{D},w::Real) where {D<:LocationScale} = D(0.0, w/std(D(0.0,1.0))) diff --git a/src/univariate.jl b/src/univariate.jl index 5f71a809..9d835f4a 100644 --- a/src/univariate.jl +++ b/src/univariate.jl @@ -21,11 +21,11 @@ mutable struct UnivariateKDE{R<:AbstractRange} <: AbstractKDE end # construct kernel from bandwidth -kernel_dist(::Type{Normal},w::Real) = Normal(0.0,w) -kernel_dist(::Type{Uniform},w::Real) = (s = 1.7320508075688772*w; Uniform(-s,s)) +#kernel_dist(::Type{Normal},w::Real) = Normal(0.0,w) +#kernel_dist(::Type{Uniform},w::Real) = (s = 1.7320508075688772*w; Uniform(-s,s)) -const LocationScale = Union{Laplace,Logistic,SymTriangularDist} -kernel_dist(::Type{D},w::Real) where {D} = (s = w/std(D(0.0,1.0)); D(0.0,s)) +#const LocationScale = Union{Laplace,Logistic,SymTriangularDist} +#kernel_dist(::Type{D},w::Real) where {D} = (s = w/std(D(0.0,1.0)); D(0.0,s)) # Silverman's rule of thumb for KDE bandwidth selection