Skip to content

Commit 53c04b3

Browse files
committed
feature_computation
1 parent 3e94a5f commit 53c04b3

File tree

6 files changed

+143
-85
lines changed

6 files changed

+143
-85
lines changed

src/KernelDensity.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,6 @@ include("interp.jl")
2828

2929
include("initialisation.jl")
3030
include("bandwidth_selection.jl")
31+
include("feature_computation.jl")
3132

3233
end

src/bandwidth_selection.jl

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ end
2828
@kwdef struct LSCV <: BandwidthMethod
2929
nPoints::Int = 2048
3030
initBandwidth::Float64 = NaN
31+
boundary::Tuple{Float64,Float64} = (NaN,NaN)
3132
end
3233

3334
# Select bandwidth using least-squares cross validation, from:
@@ -36,10 +37,18 @@ end
3637
# sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66)
3738
function get_bandwidth(lscv::LSCV, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)))
3839
K = lscv.nPoints
39-
initBandwidth::Float64 = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
40+
initBandwidth = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
41+
boundaryLow, boundaryHigh = lscv.boundary
42+
if isnan(boundaryLow) || isnan(boundaryHigh)
43+
lo, hi = extrema(data)
44+
boundaryLow = isnan(boundaryLow) ? lo - 4.0*initBandwidth : boundaryLow
45+
boundaryHigh = isnan(boundaryHigh) ? hi + 4.0*initBandwidth : boundaryHigh
46+
end
47+
4048
ndata = length(data)
41-
lo, hi = extrema(data)
42-
midpoints = range(lo - 4.0*initBandwidth, hi + 4.0*initBandwidth, K)
49+
50+
midpoints = range(boundaryLow, boundaryHigh, K)
51+
4352
initDen = tabulate(data, midpoints, prior).values
4453

4554
# the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book

src/test.jl renamed to src/exampleUse.jl

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,47 @@
55
# give any kernel?
66
# fix cosine distribution
77

8-
# add precompute!
8+
using Plots
99

1010
using Distributions
11-
using Plots
1211
using .KernelDensity
1312

14-
#
13+
##
1514

1615
points = [1. 2 3 4 5]
1716
cat = Categorical([1/5,1/5,1/5,1/5,1/5])
1817

1918
ds = DiscretisedPDF(fill(1/20,20),LinRange(0,1,20))
2019

2120
k = KernelEstimate(points,cat, Normal(0,1),1.0, nothing)
21+
ncomponents(k1)
22+
component(k1,3)
23+
probs(k1)
24+
25+
##
2226

2327
X = randn(10^5)
2428

25-
k1 = kernel_estimate(X,0.2,Normal)
29+
k1 = kernel_estimate(X, 0.2, Normal)
30+
31+
k2 = kernel_estimate(X, Silverman(), Epanechnikov)
32+
33+
k3 = kernel_estimate(X, LSCV(), Epanechnikov)
34+
35+
36+
precompute!(k2,2048,(-5,5))
2637

27-
k2 = kernel_estimate(X,Silverman(),Epanechnikov)
38+
pdf(k2, 1)
39+
pdf.(k2, [1, 2, 3])
40+
pdf(k2, 1, :precomputed)
41+
pdf.(k2, [1, 2, 3], :precomputed)
2842

29-
precompute!(k2)
43+
cdf(k2, 1)
44+
mean(k2), var(k2)
45+
quantile(k2, 0.9)
3046

3147

32-
kde = k1.precomputedPDF
48+
kde = k2.precomputedPDF
3349
plot(kde.xs,kde.values)
3450

35-
KernelEstimate(reshape(X,1,length(X)), cat, kernel_dist(Normal,1.0), 1.0)
3651

37-
ncompoments(k)
38-
component(k,3)
39-
probs(k)

src/feature_computation.jl

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
2+
# 1D pdf precomputation
3+
4+
function precompute!(ke::UnivariateKernelEstimate, nPoints::Integer = 2048,
5+
boundary::Tuple{<:Real,<:Real} =((lo, hi) = extrema(ke.data); (lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth)))
6+
7+
# find the element type of range in ke.precomputedPDF
8+
T = eltype(typeof(ke).parameters[5].parameters[2])
9+
midpoints = range(T(boundary[1]), T(boundary[2]), Int(nPoints))
10+
ke.precomputedPDF = conv(tabulate(vec(ke.data), midpoints, ke.prior), ke.kernel)
11+
end
12+
13+
function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
14+
npoints = length(midpoints)
15+
s = step(midpoints)
16+
17+
# Set up a grid for discretized data
18+
grid = zeros(Float64, npoints)
19+
ainc = 1.0 / (s*s)
20+
21+
# weighted discretization (cf. Jones and Lotwick)
22+
for (i,x) in enumerate(data)
23+
k = searchsortedfirst(midpoints,x)
24+
j = k-1
25+
if 1 <= j <= npoints-1
26+
grid[j] += (midpoints[k]-x)*ainc*pdf(prior,i)
27+
grid[k] += (x-midpoints[j])*ainc*pdf(prior,i)
28+
end
29+
end
30+
31+
return DiscretisedPDF(grid, midpoints)
32+
end
33+
34+
function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
35+
# Transform to Fourier basis
36+
K = length(den.values)
37+
ft = rfft(den.values)
38+
39+
# Convolve fft with characteristic function of kernel
40+
# empirical cf
41+
# = \sum_{n=1}^N e^{i*t*X_n} / N
42+
# = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
43+
# = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
44+
# = A * fft(N_k/N)[-t*s*K/2pi + 1]
45+
c = -twoπ/(step(den.xs)*K)
46+
for j in 0:length(ft)-1
47+
ft[j+1] *= cf(kernel,j*c)
48+
end
49+
50+
# Invert the Fourier transform to get the KDE
51+
convolved = irfft(ft, K)
52+
53+
# fix rounding error.
54+
convolved .= max.(0., convolved)
55+
56+
DiscretisedPDF(convolved, den.xs)
57+
end
58+
59+
# pdf methods
60+
61+
# pdf(ke, x) is implemented in Distributions.jl
62+
63+
function pdf(ke::UnivariateKernelEstimate, x::Real, method::Symbol)
64+
if method == :precomputed
65+
den = ke.precomputedPDF
66+
den === nothing || error("PDF must be first precomputed.")
67+
itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid()))))
68+
itp = scale(itp_u, den.xs)
69+
etp = extrapolate(itp, 0.)
70+
return etp.itp(x)
71+
elseif method == :naive
72+
return pdf(ke, x)
73+
else
74+
error("Method not available.")
75+
end
76+
end
77+
78+
# custom broadcast prepares for interpolation only once for all xs
79+
function Base.Broadcast.broadcasted(::typeof(pdf), ke::UnivariateKernelEstimate, xs, method::Symbol)
80+
if method == :precomputed
81+
den = ke.precomputedPDF
82+
den === nothing || error("PDF must be first precomputed.")
83+
itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid()))))
84+
itp = scale(itp_u, den.xs)
85+
etp = extrapolate(itp, 0.)
86+
return etp.itp.(xs)
87+
elseif method == :naive
88+
return pdf.(ke, x)
89+
else
90+
error("Method not available.")
91+
end
92+
end
93+
94+
95+
# it is possible to add cdf(ke, :precomputed)
96+
97+
# it is possibble to add cf(ke, t)

src/initialisation.jl

Lines changed: 10 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Dis
2929
const bandwidth::Float64
3030
precomputedPDF::Union{Nothing, PDF}
3131

32-
# these guarantee type agreement and nothing more
32+
# these constructors guarantee type agreement
3333
function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, precomputedPDF::PDF) where {
3434
PriorDist<:UnivariateDistribution{Discrete},
3535
KernelType <: Distribution,
@@ -41,8 +41,10 @@ mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Dis
4141
PriorDist<:UnivariateDistribution{Discrete},
4242
KernelType <: Distribution}
4343
VF, VS = supertype(KernelType).parameters
44+
45+
# the default PDF type is based on Float64 and Int
4446
R = Base.return_types(range,(Float64,Float64,Int))[1]
45-
PDF = DiscretisedPDF{size(data)[1],R,eltype(data)}
47+
PDF = DiscretisedPDF{size(data)[1],R,Float64}
4648
new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, nothing)
4749
end
4850

@@ -51,6 +53,11 @@ end
5153
UnivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Univariate, VS, K, P, PDF}
5254
MultivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Multivariate, VS, K, P, PDF}
5355

56+
# KernelEstimate is a scalar
57+
Base.broadcastable(ke::KernelEstimate) = Ref(ke)
58+
59+
# It is possible to add linear transformations a*ke + b
60+
5461
abstract type BandwidthMethod end
5562

5663
# default algorithm
@@ -78,64 +85,11 @@ function kernel_estimate(data::Vector{<:Real}, method::BandwidthMethod = Silverm
7885
KernelEstimate(reshape(data,1,length(data)), prior, kernel, bandwidth, kde)
7986
end
8087

88+
# Can add kernel_estimate which takes prior as a vector.
8189

8290
# construct kernel from bandwidth
8391
kernel_dist(::Type{Normal}, bandwidth::Real) = Normal(0.0, bandwidth)
8492
kernel_dist(::Type{Uniform}, bandwidth::Real) = Uniform(-√3*bandwidth, 3*bandwidth)
8593

8694
const LocationScale = Union{Laplace, Logistic, SymTriangularDist, Cosine, Epanechnikov}
8795
kernel_dist(::Type{D},w::Real) where {D<:LocationScale} = D(0.0, w/std(D(0.0,1.0)))
88-
89-
# 1D precomputation
90-
91-
function precompute!(ke::UnivariateKernelEstimate, nPoints::Int = 2048)
92-
lo, hi = extrema(ke.data)
93-
midpoints = range(lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth, nPoints)
94-
ke.precomputedPDF = conv(tabulate(vec(ke.data), midpoints, ke.prior), ke.kernel)
95-
end
96-
97-
function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
98-
npoints = length(midpoints)
99-
s = step(midpoints)
100-
101-
# Set up a grid for discretized data
102-
grid = zeros(Float64, npoints)
103-
ainc = 1.0 / (s*s)
104-
105-
# weighted discretization (cf. Jones and Lotwick)
106-
for (i,x) in enumerate(data)
107-
k = searchsortedfirst(midpoints,x)
108-
j = k-1
109-
if 1 <= j <= npoints-1
110-
grid[j] += (midpoints[k]-x)*ainc*pdf(prior,i)
111-
grid[k] += (x-midpoints[j])*ainc*pdf(prior,i)
112-
end
113-
end
114-
115-
return DiscretisedPDF(grid, midpoints)
116-
end
117-
118-
function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
119-
# Transform to Fourier basis
120-
K = length(den.values)
121-
ft = rfft(den.values)
122-
123-
# Convolve fft with characteristic function of kernel
124-
# empirical cf
125-
# = \sum_{n=1}^N e^{i*t*X_n} / N
126-
# = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
127-
# = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
128-
# = A * fft(N_k/N)[-t*s*K/2pi + 1]
129-
c = -twoπ/(step(den.xs)*K)
130-
for j in 0:length(ft)-1
131-
ft[j+1] *= cf(kernel,j*c)
132-
end
133-
134-
# Invert the Fourier transform to get the KDE
135-
convolved = irfft(ft, K)
136-
137-
# fix rounding error.
138-
convolved .= max.(0., convolved)
139-
140-
DiscretisedPDF(convolved, den.xs)
141-
end

src/pull.md

Lines changed: 0 additions & 15 deletions
This file was deleted.

0 commit comments

Comments
 (0)