module Bio.Util ( wilson, invnormcdf, choose, estimateComplexity, showNum, showOOM, float2mini, mini2float, log1p, expm1, phredplus, phredminus, phredsum, (<#>), phredconverse ) where import Data.Bits import Data.Char ( intToDigit ) import Data.List ( foldl' ) import Data.Word ( Word8 ) -- ^ Random useful stuff I didn't know where to put. -- | calculates the Wilson Score interval. -- If @(l,m,h) = wilson c x n@, then @m@ is the binary proportion and -- @(l,h)@ it's @c@-confidence interval for @x@ positive examples out of -- @n@ observations. @c@ is typically something like 0.05. wilson :: Double -> Int -> Int -> (Double, Double, Double) wilson c x n = ( (m - h) / d, p, (m + h) / d ) where nn = fromIntegral n p = fromIntegral x / nn z = invnormcdf (1-c*0.5) h = z * sqrt (( p * (1-p) + 0.25*z*z / nn ) / nn) m = p + 0.5 * z * z / nn d = 1 + z * z / nn showNum :: Show a => a -> String showNum = triplets [] . reverse . show where triplets acc [] = acc triplets acc (a:[]) = a:acc triplets acc (a:b:[]) = b:a:acc triplets acc (a:b:c:[]) = c:b:a:acc triplets acc (a:b:c:s) = triplets (',':c:b:a:acc) s showOOM :: Double -> String showOOM x | x < 0 = '-' : showOOM (negate x) | otherwise = findSuffix (x*10) ".kMGTPEZY" where findSuffix _ [] = "many" findSuffix y (s:ss) | y < 100 = intToDigit (round y `div` 10) : case (round y `mod` 10, s) of (0,'.') -> [] ; (0,_) -> [s] ; (d,_) -> [s, intToDigit d] | y < 1000 = intToDigit (round y `div` 100) : intToDigit ((round y `mod` 100) `div` 10) : if s == '.' then [] else [s] | y < 10000 = intToDigit (round y `div` 1000) : intToDigit ((round y `mod` 1000) `div` 100) : '0' : if s == '.' then [] else [s] | otherwise = findSuffix (y*0.001) ss -- Stolen from Lennart Augustsson's erf package, who in turn took it rom -- http://home.online.no/~pjacklam/notes/invnorm/ Accurate to about 1e-9. invnormcdf :: (Ord a, Floating a) => a -> a invnormcdf p = let a1 = -3.969683028665376e+01 a2 = 2.209460984245205e+02 a3 = -2.759285104469687e+02 a4 = 1.383577518672690e+02 a5 = -3.066479806614716e+01 a6 = 2.506628277459239e+00 b1 = -5.447609879822406e+01 b2 = 1.615858368580409e+02 b3 = -1.556989798598866e+02 b4 = 6.680131188771972e+01 b5 = -1.328068155288572e+01 c1 = -7.784894002430293e-03 c2 = -3.223964580411365e-01 c3 = -2.400758277161838e+00 c4 = -2.549732539343734e+00 c5 = 4.374664141464968e+00 c6 = 2.938163982698783e+00 d1 = 7.784695709041462e-03 d2 = 3.224671290700398e-01 d3 = 2.445134137142996e+00 d4 = 3.754408661907416e+00 pLow = 0.02425 nan = 0/0 in if p < 0 then nan else if p == 0 then -1/0 else if p < pLow then let q = sqrt(-2 * log p) in (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q+c6) / ((((d1*q+d2)*q+d3)*q+d4)*q+1) else if p < 1 - pLow then let q = p - 0.5 r = q*q in (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r+a6)*q / (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r+1) else if p <= 1 then - invnormcdf (1 - p) else nan -- | Try to estimate complexity of a whole from a sample. Suppose we -- sampled @total@ things and among those @singles@ occured only once. -- How many different things are there? -- -- Let the total number be @m@. The copy number follows a Poisson -- distribution with paramter @\lambda@. Let @z := e^{\lambda}@, then -- we have: -- -- P( 0 ) = e^{-\lambda} = 1/z -- P( 1 ) = \lambda e^{-\lambda} = ln z / z -- P(>=1) = 1 - e^{-\lambda} = 1 - 1/z -- -- singles = m ln z / z -- total = m (1 - 1/z) -- -- D := total/singles = (1 - 1/z) * z / ln z -- f := z - 1 - D ln z = 0 -- -- To get @z@, we solve using Newton iteration and then substitute to -- get @m@: -- -- df/dz = 1 - D/z -- z' := z - z (z - 1 - D ln z) / (z - D) -- m = singles * z /log z -- -- It converges as long as the initial @z@ is large enough, and @10D@ -- (in the line for @zz@ below) appears to work well. estimateComplexity :: (Integral a, Floating b, Ord b) => a -> a -> Maybe b estimateComplexity total singles | total <= singles = Nothing | singles <= 0 = Nothing | otherwise = Just m where d = fromIntegral total / fromIntegral singles step z = z * (z - 1 - d * log z) / (z - d) iter z = case step z of zd | abs zd < 1e-12 -> z | otherwise -> iter $! z-zd zz = iter $! 10*d m = fromIntegral singles * zz / log zz -- | Computes @-10 * log_10 (10 ** (-x\/10) + 10 ** (-y\/10))@ without -- losing precision. Used to add numbers on "the Phred scale", -- otherwise known as (deci-)bans. {-# INLINE phredplus #-} phredplus :: Double -> Double -> Double phredplus x y = if x < y then pp x y else pp y x where pp u v = u - 10 / log 10 * log1p (exp ((u-v) * log 10 / 10)) -- | Computes @-10 * log_10 (10 ** (-x\/10) - 10 ** (-y\/10))@ without -- losing precision. Used to subtract numbers on "the Phred scale", -- otherwise known as (deci-)bans. {-# INLINE phredminus #-} phredminus :: Double -> Double -> Double phredminus x y = if x < y then pm x y else pm y x where pm u v = u - 10 / log 10 * log1p (- exp ((u-v) * log 10 / 10)) -- | Computes @-10 * log_10 (sum [10 ** (-x\/10) | x <- xs])@ without losing -- precision. {-# INLINE phredsum #-} phredsum :: [Double] -> Double phredsum = foldl' (<#>) (1/0) infixl 3 <#>, `phredminus`, `phredplus` {-# INLINE (<#>) #-} (<#>) :: Double -> Double -> Double (<#>) = phredplus -- | Computes @1-p@ without leaving the "Phred scale" phredconverse :: Double -> Double phredconverse v = - 10 / log 10 * log1p (- exp ((-v) * log 10 / 10)) -- | Computes @log (1+x)@ to a relative precision of @10^-8@ even for -- very small @x@. Stolen from http://www.johndcook.com/cpp_log_one_plus_x.html {-# INLINE log1p #-} log1p :: (Floating a, Ord a) => a -> a log1p x | x < -1 = error "log1p: argument must be greater than -1" -- x is large enough that the obvious evaluation is OK: | x > 0.0001 || x < -0.0001 = log $ 1 + x -- Use Taylor approx. log(1 + x) = x - x^2/2 with error roughly x^3/3 -- Since |x| < 10^-4, |x|^3 < 10^-12, relative error less than 10^-8: | otherwise = (1 - 0.5*x) * x -- | Computes @exp x - 1@ to a relative precision of @10^-10@ even for -- very small @x@. Stolen from http://www.johndcook.com/cpp_expm1.html expm1 :: (Floating a, Ord a) => a -> a expm1 x | x > -0.00001 && x < 0.00001 = (1 + 0.5 * x) * x -- Taylor approx | otherwise = exp x - 1 -- direct eval -- | Binomial coefficient: @n `choose` k == n! / ((n-k)! k!)@ {-# INLINE choose #-} choose :: Integral a => a -> a -> a n `choose` k = product [n-k+1 .. n] `div` product [2..k] -- | Conversion to 0.4.4 format minifloat: This minifloat fits into a -- byte. It has no sign, four bits of precision, and the range is from -- 0 to 63488, initially in steps of 1/8. Nice to store quality scores -- with reasonable precision and range. float2mini :: RealFloat a => a -> Word8 float2mini f | f' < 0 = error "no negative minifloats" -- negative zero is fine! | f < 2 = f' | e >= 17 = 0xff | s < 16 = error $ "oops: " ++ show (e,s) | s < 32 = (e-1) `shiftL` 4 .|. (s .&. 0xf) | s == 32 = e `shiftL` 4 | otherwise = error $ "oops: " ++ show (e,s) where f' = round (8*f) e = fromIntegral $ exponent f s = round $ 32 * significand f -- | Conversion from 0.4.4 format minifloat, see 'float2mini'. mini2float :: Fractional a => Word8 -> a mini2float w | e == 0 = fromIntegral w / 8.0 | otherwise = 2^e * fromIntegral m / 16.0 where m = (w .&. 0xF) .|. 0x10 e = w `shiftR` 4