feature_computation

jaksle · jaksle · commit 53c04b3c6367 · 2025-11-03T00:43:22.000+01:00
diff --git a/src/KernelDensity.jl b/src/KernelDensity.jl
@@ -28,5 +28,6 @@ include("interp.jl")
 
 include("initialisation.jl")
 include("bandwidth_selection.jl")
+include("feature_computation.jl")
 
 end
diff --git a/src/bandwidth_selection.jl b/src/bandwidth_selection.jl
@@ -28,6 +28,7 @@ end
 @kwdef struct LSCV <: BandwidthMethod
     nPoints::Int = 2048
     initBandwidth::Float64 = NaN
+    boundary::Tuple{Float64,Float64} = (NaN,NaN)  # grid end points; a NaN entry is replaced by data extremum -/+ 4*initBandwidth
 end
 
 # Select bandwidth using least-squares cross validation, from:
@@ -36,10 +37,18 @@ end
 #   sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66)
 function get_bandwidth(lscv::LSCV, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)))
     K = lscv.nPoints
-    initBandwidth::Float64 = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
+    initBandwidth = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
+    boundaryLow, boundaryHigh = lscv.boundary
+    if isnan(boundaryLow) || isnan(boundaryHigh)
+        lo, hi = extrema(data)
+        boundaryLow = isnan(boundaryLow) ? lo - 4.0*initBandwidth : boundaryLow
+        boundaryHigh = isnan(boundaryHigh) ? hi + 4.0*initBandwidth : boundaryHigh
+    end
+
     ndata = length(data)
-    lo, hi = extrema(data)
-    midpoints = range(lo - 4.0*initBandwidth, hi + 4.0*initBandwidth, K)
+    
+    midpoints = range(boundaryLow, boundaryHigh, K)
+
     initDen = tabulate(data, midpoints, prior).values
 
     # the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
diff --git a/src/exampleUse.jl b/src/exampleUse.jl
@@ -5,35 +5,47 @@
 # give any kernel?
 # fix cosine distribution
 
-# add precompute!
+using Plots
 
 using Distributions
-using Plots
 using .KernelDensity
 
-#
+##
 
 points = [1. 2 3 4 5]
 cat = Categorical([1/5,1/5,1/5,1/5,1/5])
 
 ds = DiscretisedPDF(fill(1/20,20),LinRange(0,1,20))
 
 k = KernelEstimate(points,cat, Normal(0,1),1.0, nothing)
+ncomponents(k)  # `k` is the estimate built just above; `k1` is only defined further down
+component(k,3)
+probs(k)
+
+##
 
 X = randn(10^5)
 
-k1 = kernel_estimate(X,0.2,Normal)
+k1 = kernel_estimate(X, 0.2, Normal)
+
+k2 = kernel_estimate(X, Silverman(), Epanechnikov)
+
+k3 = kernel_estimate(X, LSCV(), Epanechnikov)
+
+
+precompute!(k2,2048,(-5,5))
 
-k2 = kernel_estimate(X,Silverman(),Epanechnikov)
+pdf(k2, 1)
+pdf.(k2, [1, 2, 3])
+pdf(k2, 1, :precomputed)
+pdf.(k2, [1, 2, 3], :precomputed)
 
-precompute!(k2)
+cdf(k2, 1)
+mean(k2), var(k2)
+quantile(k2, 0.9)
 
 
-kde = k1.precomputedPDF
+kde = k2.precomputedPDF
 plot(kde.xs,kde.values)
 
-KernelEstimate(reshape(X,1,length(X)), cat, kernel_dist(Normal,1.0), 1.0)
 
-ncompoments(k)
-component(k,3)
-probs(k)
diff --git a/src/feature_computation.jl b/src/feature_computation.jl
@@ -0,0 +1,97 @@
+
+# 1D pdf precomputation 
+
+function precompute!(ke::UnivariateKernelEstimate, nPoints::Integer = 2048, 
+        boundary::Tuple{<:Real,<:Real} =((lo, hi) = extrema(ke.data); (lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth)))
+    # grid end points use the element type of the range stored in ke's PDF type parameter
+    G = eltype(typeof(ke).parameters[5].parameters[2])
+    grid = range(G(first(boundary)), G(last(boundary)), Int(nPoints))
+    binned = tabulate(vec(ke.data), grid, ke.prior)
+    ke.precomputedPDF = conv(binned, ke.kernel)
+end
+
+function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
+    # Weighted discretization (cf. Jones and Lotwick): every observation adds its prior weight
+    # to the two neighbouring grid nodes, by distance to the opposite node, scaled by 1/step^2.
+    nbins = length(midpoints)
+    h = step(midpoints)
+    invh2 = 1.0 / (h*h)
+    weights = zeros(Float64, nbins)
+
+    for (n, obs) in enumerate(data)
+        right = searchsortedfirst(midpoints, obs)
+        left = right - 1
+        # observations without a grid node on both sides are skipped
+        (left < 1 || left > nbins - 1) && continue
+        mass = pdf(prior, n)
+        weights[left] += (midpoints[right] - obs)*invh2*mass
+        weights[right] += (obs - midpoints[left])*invh2*mass
+    end
+
+    return DiscretisedPDF(weights, midpoints)
+end
+
+function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
+    # Real FFT of the tabulated values (K is the number of grid points)
+    K = length(den.values)
+    spectrum = rfft(den.values)
+
+    # Multiply every Fourier coefficient by the characteristic function of the kernel.
+    # The frequency matched to each coefficient follows from the empirical cf:
+    #    \sum_{n=1}^N e^{i*t*X_n} / N
+    #  = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
+    #  = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
+    #  = A * fft(N_k/N)[-t*s*K/2pi + 1]
+    dω = -twoπ/(step(den.xs)*K)
+    for idx in eachindex(spectrum)
+        spectrum[idx] *= cf(kernel, (idx - 1)*dω)
+    end
+
+    # Inverse transform gives the KDE on the same grid
+    smoothed = irfft(spectrum, K)
+
+    # clip negative values (rounding error) at zero
+    smoothed .= max.(0., smoothed)
+
+    return DiscretisedPDF(smoothed, den.xs)
+end
+
+# pdf methods
+
+# pdf(ke, x) is implemented in Distributions.jl; pdf(ke, x, method) adds method = :naive
+# (same as pdf(ke, x)) and :precomputed (spline through the grid from precompute!, 0 outside).
+function pdf(ke::UnivariateKernelEstimate, x::Real, method::Symbol)
+    if method == :precomputed
+        den = ke.precomputedPDF
+        den === nothing && error("PDF must be first precomputed.")  # fail only when precompute! was not run
+        itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid()))))
+        itp = scale(itp_u, den.xs)
+        etp = extrapolate(itp, 0.)
+        return etp(x)  # call the extrapolation itself so that x outside the grid gives 0
+    elseif method == :naive
+        return pdf(ke, x)
+    else
+        error("Method not available.")
+    end
+end
+
+# custom broadcast builds the interpolant only once for all xs (0 is returned outside the grid)
+function Base.Broadcast.broadcasted(::typeof(pdf), ke::UnivariateKernelEstimate, xs, method::Symbol)
+    if method == :precomputed
+        den = ke.precomputedPDF
+        den === nothing && error("PDF must be first precomputed.")  # fail only when precompute! was not run
+        itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid()))))
+        itp = scale(itp_u, den.xs)
+        etp = extrapolate(itp, 0.)
+        return etp.(xs)  # broadcast the extrapolation itself, not the bounds-checked inner interpolant
+    elseif method == :naive
+        return pdf.(ke, xs)
+    else
+        error("Method not available.")
+    end
+end
+
+
+# it is possible to add cdf(ke, :precomputed)
+
+# it is possible to add cf(ke, t)
diff --git a/src/initialisation.jl b/src/initialisation.jl
@@ -29,7 +29,7 @@ mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Dis
     const bandwidth::Float64
     precomputedPDF::Union{Nothing, PDF}
 
-    # these guarantee type agreement and nothing more
+    # these constructors guarantee type agreement
     function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, precomputedPDF::PDF) where {
             PriorDist<:UnivariateDistribution{Discrete},
             KernelType <: Distribution,
@@ -41,8 +41,10 @@ mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Dis
             PriorDist<:UnivariateDistribution{Discrete},
             KernelType <: Distribution}
         VF, VS = supertype(KernelType).parameters
+
+        # the default PDF type is based on Float64 and Int
         R = Base.return_types(range,(Float64,Float64,Int))[1] 
-        PDF = DiscretisedPDF{size(data)[1],R,eltype(data)}
+        PDF = DiscretisedPDF{size(data)[1],R,Float64}
         new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, nothing)
     end
 
@@ -51,6 +53,11 @@ end
 UnivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Univariate, VS, K, P, PDF}
 MultivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Multivariate, VS, K, P, PDF}
 
+# KernelEstimate is treated as a scalar in broadcasting (wrapped in Ref), e.g. in pdf.(ke, xs)
+Base.broadcastable(ke::KernelEstimate) = Ref(ke)
+
+# It is possible to add linear transformations a*ke + b
+
 abstract type BandwidthMethod end
 
 # default algorithm
@@ -78,64 +85,11 @@ function kernel_estimate(data::Vector{<:Real}, method::BandwidthMethod = Silverm
     KernelEstimate(reshape(data,1,length(data)), prior, kernel, bandwidth, kde)
 end
 
+# Can add kernel_estimate which takes prior as a vector. 
 
 # construct kernel from bandwidth
 kernel_dist(::Type{Normal}, bandwidth::Real) = Normal(0.0, bandwidth)
 kernel_dist(::Type{Uniform}, bandwidth::Real) = Uniform(-√3*bandwidth, √3*bandwidth)
 
 const LocationScale = Union{Laplace, Logistic, SymTriangularDist, Cosine, Epanechnikov}
 kernel_dist(::Type{D},w::Real) where {D<:LocationScale} = D(0.0, w/std(D(0.0,1.0)))
-
-# 1D precomputation 
-
-function precompute!(ke::UnivariateKernelEstimate, nPoints::Int = 2048)
-    lo, hi = extrema(ke.data)
-    midpoints = range(lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth, nPoints)
-    ke.precomputedPDF = conv(tabulate(vec(ke.data), midpoints, ke.prior), ke.kernel)
-end
-
-function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
-    npoints = length(midpoints)
-    s = step(midpoints)
-
-    # Set up a grid for discretized data
-    grid = zeros(Float64, npoints)
-    ainc = 1.0 / (s*s)
-
-    # weighted discretization (cf. Jones and Lotwick)
-    for (i,x) in enumerate(data)
-        k = searchsortedfirst(midpoints,x)
-        j = k-1
-        if 1 <= j <= npoints-1
-            grid[j] += (midpoints[k]-x)*ainc*pdf(prior,i)
-            grid[k] += (x-midpoints[j])*ainc*pdf(prior,i)
-        end
-    end
-
-    return DiscretisedPDF(grid, midpoints)
-end
-
-function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
-    # Transform to Fourier basis
-    K = length(den.values)
-    ft = rfft(den.values)
-
-    # Convolve fft with characteristic function of kernel
-    # empirical cf
-    #  = \sum_{n=1}^N e^{i*t*X_n} / N
-    #  = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
-    #  = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
-    #  = A * fft(N_k/N)[-t*s*K/2pi + 1]
-    c = -twoπ/(step(den.xs)*K)
-    for j in 0:length(ft)-1
-        ft[j+1] *= cf(kernel,j*c)
-    end
-
-    # Invert the Fourier transform to get the KDE
-    convolved = irfft(ft, K)
-
-    # fix rounding error.
-    convolved .= max.(0., convolved)
-
-    DiscretisedPDF(convolved, den.xs)
-end
diff --git a/src/pull.md b/src/pull.md