{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module Normalize
( logTransform
, normalize
, normalizeBySample
) where
import Data.Ord
import Data.List
import qualified Data.Map.Strict as Map
import qualified Data.Sequence as Seq
import qualified Data.Foldable as F
import Data.Function (on)
import qualified Data.Vector as V
import qualified Data.Text as T
import Statistics.Quantile
import qualified Statistics.Sample as Stat
import Control.Lens
import Types
-- | Log-transform the value of every entity in every sample.
--
-- The logarithm is taken in the base carried by the 'Base' argument (the
-- previous implementation bound that base but always used base 2).
--
-- NOTE(review): assumes 'Base' wraps the same floating-point type as the
-- @value@ field of 'Entity' -- confirm against the "Types" module.
logTransform :: Base -> Map.Map Sample (V.Vector Entity) -> Map.Map Sample (V.Vector Entity)
logTransform (Base base) = (fmap . fmap) (over value (logBase base))
-- | Rescale the values of one sample to standard scores: each value has the
-- sample mean subtracted and is then divided by the sample's standard
-- deviation (as computed by 'Stat.stdDev').
--
-- A zero deviation is not special-cased; the division is performed as is.
standardScore :: V.Vector Entity -> V.Vector Entity
standardScore xs = V.map rescale xs
  where
    -- Plain numeric values of the sample, extracted once for both statistics.
    vals    = V.map _value xs
    mu      = Stat.mean vals
    sigma   = Stat.stdDev vals
    rescale = over value (\v -> (v - mu) / sigma)
-- | Apply the chosen normalization 'Method' independently to every sample.
--
-- 'StandardScore' and 'UpperQuartile' rewrite each sample's vector with the
-- corresponding helper; 'None' returns the map untouched.
normalize :: Method
          -> Map.Map Sample (V.Vector Entity)
          -> Map.Map Sample (V.Vector Entity)
normalize method = case method of
    StandardScore -> Map.map standardScore
    UpperQuartile -> Map.map upperQuartileNormalize
    None          -> id
-- | Normalize each sample against its matching divisor sample.
--
-- The pipeline, applied right to left:
--
-- 1. Tag every entity with its grouping name and whether its sample name
--    contains the normalization string ('tagDivisors').
-- 2. Strip the normalization string from the sample keys, concatenating the
--    vectors of samples whose stripped names coincide.
-- 3. Group the tagged entities by name ('groupDivisors'), order each group
--    in descending order and hand it to 'divideBySample'.
--
-- The 'SynonymFlag' is forwarded to 'divideBySample'; the optional
-- 'EntitySep' is forwarded to 'tagDivisors'.
normalizeBySample :: SynonymFlag
                  -> Maybe EntitySep
                  -> NormSampleString
                  -> Map.Map Sample (V.Vector Entity)
                  -> Map.Map Sample (V.Vector Entity)
normalizeBySample synonymFlag entitySep normSampleString =
    Map.map (divideGroups . groupDivisors)
      . Map.mapKeysWith (V.++) stripMarker
      . Map.mapWithKey (tagDivisors entitySep normSampleString)
  where
    -- Sample name with every occurrence of the normalization string removed.
    stripMarker =
        Sample
          . T.replace (unNormSampleString normSampleString) ""
          . unSample
    -- Descending sort of each group, then divide and flatten the results.
    divideGroups =
        V.fromList
          . concatMap (divideBySample synonymFlag . reverse . sort)
-- | Collect the tagged entities that share an 'EntityName' into one list per
-- name. The resulting groups appear in ascending order of their name.
groupDivisors :: V.Vector (EntityName, (Divisor, Entity))
              -> V.Vector [(Divisor, Entity)]
groupDivisors tagged = V.fromList (fmap F.toList (Map.elems grouped))
  where
    -- Each entry starts as a singleton sequence; entries with equal names
    -- are merged by sequence concatenation.
    grouped =
        Map.fromListWith
          (Seq.><)
          [ (name, Seq.singleton entry) | (name, entry) <- V.toList tagged ]
-- | Normalize one group of tagged entities by the group's divisor.
--
-- The equations inspect only the front of the list, so divisor entries are
-- expected to come first (the caller sorts each group in descending order).
--
-- * An empty group is a programming error and aborts.
-- * A lone divisor, or a group whose first entry is not a divisor, yields
--   nothing.
-- * With several leading divisors: when synonyms are allowed, only the
--   divisor with the largest value is kept and the group is re-processed;
--   otherwise the function aborts, naming the first two divisors.
-- * Otherwise the divisor's value is subtracted from every remaining
--   entity's value.
--
-- NOTE(review): subtraction presumably stands for division of
-- log-transformed values -- confirm with the callers.
divideBySample :: SynonymFlag -> [(Divisor, Entity)] -> [Entity]
divideBySample _ [] =
    error "Empty division in divideBySample."
divideBySample _ [(Divisor True, _)] = []
divideBySample _ ((Divisor False, _) : _) = []
divideBySample (SynonymFlag True) entries@((Divisor True, _) : (Divisor True, _) : _) =
    divideBySample (SynonymFlag False) (strongest : nonDivisors)
  where
    divisors    = filter (unDivisor . fst) entries
    nonDivisors = filter (not . unDivisor . fst) entries
    strongest   = maximumBy (comparing (_value . snd)) divisors
divideBySample (SynonymFlag False) ((Divisor True, x) : (Divisor True, y) : _) =
    error
      ( "Too many divisors found including: "
          ++ show x
          ++ " and "
          ++ show y
      )
divideBySample _ ((Divisor True, divisor) : rest) =
    fmap (over value (subtract divisorValue) . snd) rest
  where
    divisorValue = _value divisor
-- | Tag every entity of one sample via 'tagDivisor', passing along the
-- optional entity separator, the normalization string and the sample's name.
tagDivisors :: Maybe EntitySep
            -> NormSampleString
            -> Sample
            -> V.Vector Entity
            -> V.Vector (EntityName, (Divisor, Entity))
tagDivisors entitySep needle haystack entities =
    V.map tagOne entities
  where
    tagOne = tagDivisor entitySep needle haystack
-- | Tag a single entity with its grouping name and divisor status.
--
-- * The 'Divisor' flag is set when the normalization string occurs anywhere
--   in the sample name.
-- * The entity's own @sample@ field has every occurrence of the
--   normalization string removed.
-- * The grouping name is the entity's name; when a separator is supplied it
--   is only the part of the name before the first occurrence of that
--   separator.
--
-- The prefix is taken with 'T.breakOn' rather than @head . T.splitOn@, which
-- yields the same text without relying on a partial function.
tagDivisor :: Maybe EntitySep
           -> NormSampleString
           -> Sample
           -> Entity
           -> (EntityName, (Divisor, Entity))
tagDivisor sep (NormSampleString needle) (Sample haystack) !e =
    (entityName sep, (isDivisor, stripped))
  where
    isDivisor = Divisor (needle `T.isInfixOf` haystack)
    stripped  = over sample (T.replace needle "") e

    entityName :: Maybe EntitySep -> EntityName
    entityName Nothing = EntityName (_entity e)
    entityName (Just (EntitySep s)) =
        EntityName . fst . T.breakOn s $ _entity e
-- | Upper-quartile normalization of one sample.
--
-- Entities whose value is not strictly positive are discarded; every
-- remaining value is divided by the third of four quantiles (estimated with
-- @ContParam 1 1@) of those remaining values.
upperQuartileNormalize :: V.Vector Entity -> V.Vector Entity
upperQuartileNormalize xs =
    V.map (over value (/ upperQuartile)) positives
  where
    positives     = V.filter ((> 0) . _value) xs
    -- Only demanded when some surviving value is evaluated, so a sample with
    -- no positive values never reaches the quantile computation.
    upperQuartile = continuousBy (ContParam 1 1) 3 4 (V.map _value positives)