Skip to content

Commit 3e94a5f

Browse files
committed
init
1 parent 8cd5a1c commit 3e94a5f

File tree

6 files changed

+285
-3
lines changed

6 files changed

+285
-3
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,22 @@
11
name = "KernelDensity"
22
uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
3-
authors = ["Simon Byrne and various contributors"]
43
version = "0.6.8"
4+
authors = ["Simon Byrne and various contributors"]
55

66
[deps]
77
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
88
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
99
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
1010
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
11+
IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
1112
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1213

1314
[compat]
1415
Distributions = "0.23, 0.24, 0.25"
1516
DocStringExtensions = "0.8, 0.9"
1617
FFTW = "1"
1718
Interpolations = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15"
19+
IrrationalConstants = "0.2.6"
1820
StatsBase = "0.33, 0.34"
1921
julia = "1"
2022

src/KernelDensity.jl

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,17 @@ using StatsBase
55
using Distributions
66
using Interpolations
77

8-
import Distributions: twoπ, pdf
8+
import IrrationalConstants: twoπ
9+
import Base: getproperty, propertynames
10+
11+
import Distributions: pdf
12+
import Distributions: ncomponents, component, probs
13+
914
import FFTW: rfft, irfft
1015

16+
export DiscretisedPDF, KernelEstimate, BandwidthMethod, Silverman, LSCV
17+
export kernel_estimate, precompute!
18+
1119
export kde, kde_lscv, UnivariateKDE, BivariateKDE, InterpKDE, pdf
1220

1321
abstract type AbstractKDE end
@@ -18,4 +26,7 @@ include("univariate.jl")
1826
include("bivariate.jl")
1927
include("interp.jl")
2028

21-
end # module
29+
include("initialisation.jl")
30+
include("bandwidth_selection.jl")
31+
32+
end

src/bandwidth_selection.jl

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
2+
3+
# Silverman's rule of thumb for KDE bandwidth selection
4+
function get_bandwidth(::Silverman, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)), alpha::Float64 = 0.9)
5+
# Determine length of data
6+
ndata = length(data)
7+
ndata <= 1 && return alpha
8+
9+
# Calculate width using variance and IQR
10+
var_width = std(data)
11+
q25, q75 = quantile(data, (0.25, 0.75))
12+
quantile_width = (q75 - q25) / 1.34
13+
14+
# Deal with edge cases with 0 IQR or variance
15+
width = min(var_width, quantile_width)
16+
if width == 0.0
17+
if var_width == 0.0
18+
width = 1.0
19+
else
20+
width = var_width
21+
end
22+
end
23+
24+
# Set bandwidth using Silverman's rule of thumb
25+
return alpha * width * ndata^(-0.2), nothing
26+
end
27+
28+
@kwdef struct LSCV <: BandwidthMethod
29+
nPoints::Int = 2048
30+
initBandwidth::Float64 = NaN
31+
end
32+
33+
# Select bandwidth using least-squares cross validation, from:
34+
# Density Estimation for Statistics and Data Analysis
35+
# B. W. Silverman (1986)
36+
# sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66)
37+
function get_bandwidth(lscv::LSCV, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)))
38+
K = lscv.nPoints
39+
initBandwidth::Float64 = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
40+
ndata = length(data)
41+
lo, hi = extrema(data)
42+
midpoints = range(lo - 4.0*initBandwidth, hi + 4.0*initBandwidth, K)
43+
initDen = tabulate(data, midpoints, prior).values
44+
45+
# the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
46+
ft = rfft(initDen)
47+
48+
ft2 = abs2.(ft)
49+
c = -twoπ/(step(midpoints)*K)
50+
hlb, hub = 0.25*initBandwidth, 1.5*initBandwidth
51+
52+
optimalBandwidth = optimize(hlb, hub) do h
53+
dist = kernel_dist(kernelType, h)
54+
ψ = 0.0
55+
for j in 1:length(ft2)-1
56+
ks = real(cf(dist, j*c))
57+
ψ += ft2[j+1]*(ks-2.0)*ks
58+
end
59+
ψ*step(midpoints)/K + pdf(dist,0.0)/ndata
60+
end
61+
62+
dist = kernel_dist(kernelType, optimalBandwidth)
63+
for j in 0:length(ft)-1
64+
ft[j+1] *= cf(dist, j*c)
65+
end
66+
67+
convolved = irfft(ft, K)
68+
69+
# fix rounding error.
70+
convolved .= max.(0., convolved)
71+
72+
# Invert the Fourier transform to get the KDE
73+
optimalBandwidth, DiscretisedPDF(convolved, midpoints)
74+
end

src/initialisation.jl

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
2+
3+
struct DiscretisedPDF{N,R,T}
4+
values::Array{T,N}
5+
labels::NTuple{N,R}
6+
DiscretisedPDF(values::Array{T,N}, labels::NTuple{N,R}) where {N,T<:Real,R<:AbstractRange} = new{N,R,T}(values, labels)
7+
end
8+
9+
DiscretisedPDF(values::Vector, label::AbstractRange) = DiscretisedPDF(values, (label,))
10+
11+
# provides d.xs, d.ys, d.zs for convenience
12+
function Base.getproperty(d::DiscretisedPDF, prop::Symbol)
13+
if prop == :xs
14+
return d.labels[1]
15+
elseif prop == :ys
16+
return d.labels[2]
17+
elseif prop == :zs
18+
return d.labels[3]
19+
else getfield(d, prop)
20+
end
21+
end
22+
Base.propertynames(::DiscretisedPDF) = (:values, :labels, :xs, :ys, :zs)
23+
24+
25+
mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Distribution, PriorDist<:UnivariateDistribution{Discrete}, PDF<:DiscretisedPDF} <: AbstractMixtureModel{VF, VS, KernelType}
26+
const data::Matrix{Float64}
27+
const prior::PriorDist
28+
const kernel::KernelType
29+
const bandwidth::Float64
30+
precomputedPDF::Union{Nothing, PDF}
31+
32+
# these guarantee type agreement and nothing more
33+
function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, precomputedPDF::PDF) where {
34+
PriorDist<:UnivariateDistribution{Discrete},
35+
KernelType <: Distribution,
36+
PDF <: DiscretisedPDF}
37+
VF, VS = supertype(KernelType).parameters
38+
new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, precomputedPDF)
39+
end
40+
function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, ::Nothing) where {
41+
PriorDist<:UnivariateDistribution{Discrete},
42+
KernelType <: Distribution}
43+
VF, VS = supertype(KernelType).parameters
44+
R = Base.return_types(range,(Float64,Float64,Int))[1]
45+
PDF = DiscretisedPDF{size(data)[1],R,eltype(data)}
46+
new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, nothing)
47+
end
48+
49+
end
50+
51+
UnivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Univariate, VS, K, P, PDF}
52+
MultivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Multivariate, VS, K, P, PDF}
53+
54+
abstract type BandwidthMethod end
55+
56+
# default algorithm
57+
struct Silverman<:BandwidthMethod end
58+
59+
# implementing common interface of AbstractMixtureModel
60+
ncomponents(ke::KernelEstimate) = size(ke.data)[2]
61+
component(ke::UnivariateKernelEstimate, k) = ke.kernel - ke.data[1,k]
62+
component(ke::MultivariateKernelEstimate, k) = ke.kernel - ke.data[:,k]
63+
probs(ke::KernelEstimate) = probs(ke.prior)
64+
65+
66+
# creating KernelEstimate instance
67+
68+
# make kernel density given bandwidth
69+
function kernel_estimate(data::Vector{<:Real}, bandwidth::Real, kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data)))
70+
kernel = kernel_dist(kernelType, bandwidth)
71+
KernelEstimate(reshape(data, 1, length(data)), prior, kernel, Float64(bandwidth), nothing)
72+
end
73+
74+
# find bandwidth, then make kernel density
75+
function kernel_estimate(data::Vector{<:Real}, method::BandwidthMethod = Silverman(), kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data)))
76+
bandwidth, kde = get_bandwidth(method, data, kernelType, prior)
77+
kernel = kernel_dist(kernelType, bandwidth)
78+
KernelEstimate(reshape(data,1,length(data)), prior, kernel, bandwidth, kde)
79+
end
80+
81+
82+
# construct kernel from bandwidth
83+
kernel_dist(::Type{Normal}, bandwidth::Real) = Normal(0.0, bandwidth)
84+
kernel_dist(::Type{Uniform}, bandwidth::Real) = Uniform(-√3*bandwidth, 3*bandwidth)
85+
86+
const LocationScale = Union{Laplace, Logistic, SymTriangularDist, Cosine, Epanechnikov}
87+
kernel_dist(::Type{D},w::Real) where {D<:LocationScale} = D(0.0, w/std(D(0.0,1.0)))
88+
89+
# 1D precomputation
90+
91+
function precompute!(ke::UnivariateKernelEstimate, nPoints::Int = 2048)
92+
lo, hi = extrema(ke.data)
93+
midpoints = range(lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth, nPoints)
94+
ke.precomputedPDF = conv(tabulate(vec(ke.data), midpoints, ke.prior), ke.kernel)
95+
end
96+
97+
function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
98+
npoints = length(midpoints)
99+
s = step(midpoints)
100+
101+
# Set up a grid for discretized data
102+
grid = zeros(Float64, npoints)
103+
ainc = 1.0 / (s*s)
104+
105+
# weighted discretization (cf. Jones and Lotwick)
106+
for (i,x) in enumerate(data)
107+
k = searchsortedfirst(midpoints,x)
108+
j = k-1
109+
if 1 <= j <= npoints-1
110+
grid[j] += (midpoints[k]-x)*ainc*pdf(prior,i)
111+
grid[k] += (x-midpoints[j])*ainc*pdf(prior,i)
112+
end
113+
end
114+
115+
return DiscretisedPDF(grid, midpoints)
116+
end
117+
118+
function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
119+
# Transform to Fourier basis
120+
K = length(den.values)
121+
ft = rfft(den.values)
122+
123+
# Convolve fft with characteristic function of kernel
124+
# empirical cf
125+
# = \sum_{n=1}^N e^{i*t*X_n} / N
126+
# = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
127+
# = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
128+
# = A * fft(N_k/N)[-t*s*K/2pi + 1]
129+
c = -twoπ/(step(den.xs)*K)
130+
for j in 0:length(ft)-1
131+
ft[j+1] *= cf(kernel,j*c)
132+
end
133+
134+
# Invert the Fourier transform to get the KDE
135+
convolved = irfft(ft, K)
136+
137+
# fix rounding error.
138+
convolved .= max.(0., convolved)
139+
140+
DiscretisedPDF(convolved, den.xs)
141+
end

src/pull.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
The current capabilities of `KernelDensity` are quite limited, especially if we compare them to what is avaiable e.g. in SciPy, and this supposed to be is the main source of kernel estimation tools in Julia statistical ecosystem. The current interface and type system were made for calculating discretised pdf using Fourier transform and not for much else. This is a proposition of completely new type system and interface.
2+
3+
The main principles of this proposition are:
4+
- The new `KernelEstimate` datatype contains full information about the kernel fit, so a statistician can infer all they possibly want from it.
5+
- `KernelEstimate` is a subtype of `MixtureDistribution` so it fits well the rest of `JuliaStats`, can be much more easily used in other tasks and has a lot of useful default methods already implemented in `Distributions`.
6+
- This type system can be used for kernel estimation in any dimension.
7+
- The discretised values of the pdf are distinct from `KernelEstimate`. However, they can be stored in it and if so, getting pdf from interpolation of it becomes avaiable. If it is not present, different methods for getting pdf can be used (e.g. from the naive sum formula, which for small samples is completely reasonable). It can be also added using `precompute!`.
8+
9+
The current approach does not allow for adaptive kde. You can allow for it with the same design, but for the sake of efficiency and code simplicity it might be better to add something like `AdaptiveKernelEstimate` if such a need arises.
10+
11+
I've curretly implemented methods only for 1D. Translating 2D case should be easy, but it might be best to forget about it and implement any dimension from the get go. But this is better to do that after getting some feedback.
12+
13+
Old interface may, or may not stay for backwards compability.
14+
15+
As it stands this proposition does not add any new numerical methods, but even simply linking it to `Distributions` adds a lot of capabilities to this library, which for example Scipy or Sklearn do not have, making it I hope quite more attractive.

src/test.jl

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# use scale instead of std
2+
# LocationScale ?
3+
# you can use support to detect
4+
# argument for boundary?
5+
# give any kernel?
6+
# fix cosine distribution
7+
8+
# add precompute!
9+
10+
using Distributions
11+
using Plots
12+
using .KernelDensity
13+
14+
#
15+
16+
points = [1. 2 3 4 5]
17+
cat = Categorical([1/5,1/5,1/5,1/5,1/5])
18+
19+
ds = DiscretisedPDF(fill(1/20,20),LinRange(0,1,20))
20+
21+
k = KernelEstimate(points,cat, Normal(0,1),1.0, nothing)
22+
23+
X = randn(10^5)
24+
25+
k1 = kernel_estimate(X,0.2,Normal)
26+
27+
k2 = kernel_estimate(X,Silverman(),Epanechnikov)
28+
29+
precompute!(k2)
30+
31+
32+
kde = k1.precomputedPDF
33+
plot(kde.xs,kde.values)
34+
35+
KernelEstimate(reshape(X,1,length(X)), cat, kernel_dist(Normal,1.0), 1.0)
36+
37+
ncompoments(k)
38+
component(k,3)
39+
probs(k)

0 commit comments

Comments
 (0)