{-# LANGUAGE BangPatterns, FlexibleContexts, UnboxedTuples #-} -- | -- Module : Statistics.Sample.KernelDensity -- Copyright : (c) 2011 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Kernel density estimation. This module provides a fast, robust, -- non-parametric way to estimate the probability density function of -- a sample. -- -- This estimator does not use the commonly employed \"Gaussian rule -- of thumb\". As a result, it outperforms many plug-in methods on -- multimodal samples with widely separated modes. module Statistics.Sample.KernelDensity ( -- * Estimation functions kde , kde_ -- * References -- $references ) where import Numeric.MathFunctions.Constants (m_sqrt_2_pi) import Prelude hiding (const, min, max, sum) import Statistics.Function (minMax, nextHighestPowerOfTwo) import Statistics.Math.RootFinding (fromRoot, ridders) import Statistics.Sample.Histogram (histogram_) import Statistics.Sample.Internal (sum) import Statistics.Transform (CD, dct, idct) import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import qualified Data.Vector as V -- | Gaussian kernel density estimator for one-dimensional data, using -- the method of Botev et al. -- -- The result is a pair of vectors, containing: -- -- * The coordinates of each mesh point. The mesh interval is chosen -- to be 20% larger than the range of the sample. (To specify the -- mesh interval, use 'kde_'.) -- -- * Density estimates at each mesh point. kde :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => Int -- ^ The number of mesh points to use in the uniform discretization -- of the interval @(min,max)@. If this value is not a power of -- two, then it is rounded up to the next power of two. -> v Double -> (v Double, v Double) kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs where (lo,hi) = minMax xs range | G.length xs <= 1 = 1 -- Unreasonable guess | lo == hi = 1 -- All elements are equal | otherwise = hi - lo {-# INLINABLE kde #-} {-# SPECIAlIZE kde :: Int -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-} {-# SPECIAlIZE kde :: Int -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-} -- | Gaussian kernel density estimator for one-dimensional data, using -- the method of Botev et al. -- -- The result is a pair of vectors, containing: -- -- * The coordinates of each mesh point. -- -- * Density estimates at each mesh point. kde_ :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => Int -- ^ The number of mesh points to use in the uniform discretization -- of the interval @(min,max)@. If this value is not a power of -- two, then it is rounded up to the next power of two. -> Double -- ^ Lower bound (@min@) of the mesh range. -> Double -- ^ Upper bound (@max@) of the mesh range. -> v Double -> (v Double, v Double) kde_ n0 min max xs | G.null xs = error "Statistics.KernelDensity.kde: empty sample" | n0 <= 1 = error "Statistics.KernelDensity.kde: invalid number of points" | otherwise = (mesh, density) where mesh = G.generate ni $ \z -> min + (d * fromIntegral z) where d = r / (n-1) density = G.map (/(2 * r)) . idct $ G.zipWith f a (G.enumFromTo 0 (n-1)) where f b z = b * exp (sqr z * sqr pi * t_star * (-0.5)) !n = fromIntegral ni !ni = nextHighestPowerOfTwo n0 !r = max - min a = dct . G.map (/ sum h) $ h where h = G.map (/ len) $ histogram_ ni min max xs !len = fromIntegral (G.length xs) !t_star = fromRoot (0.28 * len ** (-0.4)) . ridders 1e-14 (0,0.1) $ \x -> x - (len * (2 * sqrt pi) * go 6 (f 7 x)) ** (-0.4) where f q t = 2 * pi ** (q*2) * sum (G.zipWith g iv a2v) where g i a2 = i ** q * a2 * exp ((-i) * sqr pi * t) a2v = G.map (sqr . (*0.5)) $ G.tail a iv = G.map sqr $ G.enumFromTo 1 (n-1) go s !h | s == 1 = h | otherwise = go (s-1) (f s time) where time = (2 * const * k0 / len / h) ** (2 / (3 + 2 * s)) const = (1 + 0.5 ** (s+0.5)) / 3 k0 = U.product (G.enumFromThenTo 1 3 (2*s-1)) / m_sqrt_2_pi sqr x = x * x {-# INLINABLE kde_ #-} {-# SPECIAlIZE kde_ :: Int -> Double -> Double -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-} {-# SPECIAlIZE kde_ :: Int -> Double -> Double -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-} -- $references -- -- Botev. Z.I., Grotowski J.F., Kroese D.P. (2010). Kernel density -- estimation via diffusion. /Annals of Statistics/ -- 38(5):2916–2957. <http://arxiv.org/pdf/1011.2602>