{-# LANGUAGE BangPatterns #-}

module Bio.ChIPSeq.FragLen
    ( fragLength
    , naiveCCWithSmooth
    ) where

import           Bio.Data.Bed
import           Lens.Micro                ((^.))
import           Control.Parallel.Strategies (parMap, rpar)
import qualified Data.ByteString.Char8       as B
import qualified Data.HashMap.Strict         as M
import qualified Data.HashSet                as S
import           Data.List                   (foldl', maximumBy)
import           Data.Ord                    (comparing)

-- | estimate fragment length for a ChIP-seq experiment
fragLength :: (Int, Int) -> [BED] -> Int
fragLength :: (Int, Int) -> [BED] -> Int
fragLength (Int
start, Int
end) [BED]
beds = (Int, Int) -> Int
forall a b. (a, b) -> a
fst ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ ((Int, Int) -> (Int, Int) -> Ordering)
-> [(Int, Int)] -> (Int, Int)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (((Int, Int) -> Int) -> (Int, Int) -> (Int, Int) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Int, Int) -> Int
forall a b. (a, b) -> b
snd) ([(Int, Int)] -> (Int, Int)) -> [(Int, Int)] -> (Int, Int)
forall a b. (a -> b) -> a -> b
$
    Int -> [BED] -> [Int] -> [(Int, Int)]
naiveCCWithSmooth Int
4 [BED]
beds [Int
start, Int
startInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
2 .. Int
end]
{-# INLINE fragLength #-}

-- sizeDistribution :: (Int, Int) -> [BED] -> (Int, Int)

fromBED :: [BED] -> [(B.ByteString, (S.HashSet Int, S.HashSet Int))]
fromBED :: [BED] -> [(ByteString, (HashSet Int, HashSet Int))]
fromBED = ((ByteString, ([Int], [Int]))
 -> (ByteString, (HashSet Int, HashSet Int)))
-> [(ByteString, ([Int], [Int]))]
-> [(ByteString, (HashSet Int, HashSet Int))]
forall a b. (a -> b) -> [a] -> [b]
map (ByteString, ([Int], [Int]))
-> (ByteString, (HashSet Int, HashSet Int))
forall a a a.
(Hashable a, Hashable a) =>
(a, ([a], [a])) -> (a, (HashSet a, HashSet a))
toSet ([(ByteString, ([Int], [Int]))]
 -> [(ByteString, (HashSet Int, HashSet Int))])
-> ([BED] -> [(ByteString, ([Int], [Int]))])
-> [BED]
-> [(ByteString, (HashSet Int, HashSet Int))]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HashMap ByteString ([Int], [Int]) -> [(ByteString, ([Int], [Int]))]
forall k v. HashMap k v -> [(k, v)]
M.toList (HashMap ByteString ([Int], [Int])
 -> [(ByteString, ([Int], [Int]))])
-> ([BED] -> HashMap ByteString ([Int], [Int]))
-> [BED]
-> [(ByteString, ([Int], [Int]))]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([Int], [Int]) -> ([Int], [Int]) -> ([Int], [Int]))
-> [(ByteString, ([Int], [Int]))]
-> HashMap ByteString ([Int], [Int])
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> [(k, v)] -> HashMap k v
M.fromListWith ([Int], [Int]) -> ([Int], [Int]) -> ([Int], [Int])
forall a a. ([a], [a]) -> ([a], [a]) -> ([a], [a])
f ([(ByteString, ([Int], [Int]))]
 -> HashMap ByteString ([Int], [Int]))
-> ([BED] -> [(ByteString, ([Int], [Int]))])
-> [BED]
-> HashMap ByteString ([Int], [Int])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (BED -> (ByteString, ([Int], [Int])))
-> [BED] -> [(ByteString, ([Int], [Int]))]
forall a b. (a -> b) -> [a] -> [b]
map BED -> (ByteString, ([Int], [Int]))
parseLine
    where
        parseLine :: BED -> (B.ByteString, ([Int], [Int]))
        parseLine :: BED -> (ByteString, ([Int], [Int]))
parseLine BED
x = case BED
xBED -> Getting (Maybe Bool) BED (Maybe Bool) -> Maybe Bool
forall s a. s -> Getting a s a -> a
^.Getting (Maybe Bool) BED (Maybe Bool)
forall b. BEDLike b => Lens' b (Maybe Bool)
strand of
            Just Bool
True  -> (BED
xBED -> Getting ByteString BED ByteString -> ByteString
forall s a. s -> Getting a s a -> a
^.Getting ByteString BED ByteString
forall b. BEDLike b => Lens' b ByteString
chrom, ([BED
xBED -> Getting Int BED Int -> Int
forall s a. s -> Getting a s a -> a
^.Getting Int BED Int
forall b. BEDLike b => Lens' b Int
chromStart], []))
            Just Bool
False -> (BED
xBED -> Getting ByteString BED ByteString -> ByteString
forall s a. s -> Getting a s a -> a
^.Getting ByteString BED ByteString
forall b. BEDLike b => Lens' b ByteString
chrom, ([], [BED
xBED -> Getting Int BED Int -> Int
forall s a. s -> Getting a s a -> a
^.Getting Int BED Int
forall b. BEDLike b => Lens' b Int
chromEnd]))
            Maybe Bool
_          -> [Char] -> (ByteString, ([Int], [Int]))
forall a. HasCallStack => [Char] -> a
error [Char]
"Unknown Strand!"
        f :: ([a], [a]) -> ([a], [a]) -> ([a], [a])
f ([a]
a,[a]
b) ([a]
a',[a]
b') = ([a]
a[a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++[a]
a', [a]
b[a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++[a]
b')
        toSet :: (a, ([a], [a])) -> (a, (HashSet a, HashSet a))
toSet (a
chr, ([a]
forwd, [a]
rev)) = (a
chr, ([a] -> HashSet a
forall a. (Eq a, Hashable a) => [a] -> HashSet a
S.fromList [a]
forwd, [a] -> HashSet a
forall a. (Eq a, Hashable a) => [a] -> HashSet a
S.fromList [a]
rev))
{-# INLINE fromBED #-}

-- | fast relative cross-correlation with smoothing
apprxCorr :: S.HashSet Int -> S.HashSet Int -> Int -> Int -> Int
apprxCorr :: HashSet Int -> HashSet Int -> Int -> Int -> Int
apprxCorr HashSet Int
forwd HashSet Int
rev Int
smooth Int
d = (Int -> Int -> Int) -> Int -> HashSet Int -> Int
forall a b. (a -> b -> a) -> a -> HashSet b -> a
S.foldl' Int -> Int -> Int
f Int
0 HashSet Int
rev
    where
        f :: Int -> Int -> Int
        f :: Int -> Int -> Int
f !Int
acc Int
r | (Int -> Bool) -> [Int] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Int -> HashSet Int -> Bool
forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` HashSet Int
forwd) [Int
rInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
smooth..Int
rInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
smooth] = Int
acc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
                 | Bool
otherwise = Int
acc
{-# INLINE apprxCorr #-}

-- | calcuate cross corrlation with different shifts
naiveCCWithSmooth :: Int -> [BED] -> [Int] -> [(Int, Int)]
naiveCCWithSmooth :: Int -> [BED] -> [Int] -> [(Int, Int)]
naiveCCWithSmooth Int
smooth [BED]
input [Int]
range = [(ByteString, (HashSet Int, HashSet Int))] -> [(Int, Int)]
f ([(ByteString, (HashSet Int, HashSet Int))] -> [(Int, Int)])
-> ([BED] -> [(ByteString, (HashSet Int, HashSet Int))])
-> [BED]
-> [(Int, Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [BED] -> [(ByteString, (HashSet Int, HashSet Int))]
fromBED ([BED] -> [(Int, Int)]) -> [BED] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$ [BED]
input
    where
        cc :: [(B.ByteString, (S.HashSet Int, S.HashSet Int))] -> Int -> Int
        cc :: [(ByteString, (HashSet Int, HashSet Int))] -> Int -> Int
cc [(ByteString, (HashSet Int, HashSet Int))]
xs Int
d = (Int -> Int -> Int) -> Int -> [Int] -> Int
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) Int
0 ([Int] -> Int)
-> ([(ByteString, (HashSet Int, HashSet Int))] -> [Int])
-> [(ByteString, (HashSet Int, HashSet Int))]
-> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((ByteString, (HashSet Int, HashSet Int)) -> Int)
-> [(ByteString, (HashSet Int, HashSet Int))] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map ((\(HashSet Int
forwd, HashSet Int
rev) -> HashSet Int -> HashSet Int -> Int -> Int -> Int
apprxCorr HashSet Int
forwd HashSet Int
rev Int
smooth Int
d)((HashSet Int, HashSet Int) -> Int)
-> ((ByteString, (HashSet Int, HashSet Int))
    -> (HashSet Int, HashSet Int))
-> (ByteString, (HashSet Int, HashSet Int))
-> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
.(ByteString, (HashSet Int, HashSet Int))
-> (HashSet Int, HashSet Int)
forall a b. (a, b) -> b
snd) ([(ByteString, (HashSet Int, HashSet Int))] -> Int)
-> [(ByteString, (HashSet Int, HashSet Int))] -> Int
forall a b. (a -> b) -> a -> b
$ [(ByteString, (HashSet Int, HashSet Int))]
xs
        f :: [(ByteString, (HashSet Int, HashSet Int))] -> [(Int, Int)]
f [(ByteString, (HashSet Int, HashSet Int))]
xs = [Int] -> [Int] -> [(Int, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int]
range ([Int] -> [(Int, Int)]) -> [Int] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$ Strategy Int -> (Int -> Int) -> [Int] -> [Int]
forall b a. Strategy b -> (a -> b) -> [a] -> [b]
parMap Strategy Int
forall a. Strategy a
rpar ([(ByteString, (HashSet Int, HashSet Int))] -> Int -> Int
cc [(ByteString, (HashSet Int, HashSet Int))]
xs) [Int]
range
{-# INLINE naiveCCWithSmooth #-}