{-# LANGUAGE OverloadedStrings, BangPatterns, RecordWildCards #-}
{-# OPTIONS_GHC -Wall #-}
module SimpleSeed where

import Bio.Base
import Bio.Bam.Rec

import Data.Bits
import Data.List
import Data.Maybe

import qualified Data.IntMap as IM
import qualified Data.Vector.Generic as V
import qualified Data.Vector.Unboxed as U

-- | Discontiguous template "12 of 16", stolen from MegaBLAST:
-- 1,110,110,110,110,111, with two bits per base gives 0xFCF3CF3F

template :: Int
template = 0xFCF3CF3F

create_seed_words :: [Nucleotides] -> [(Int, Int)]
create_seed_words = drop 32 . go 0x0 (-16) 0x0 0
  where
    go !accf !i !accr !ir s =
        (accf .&. template, i) : (accr .&. template, ir) : case s of
            [    ] -> []
            (Ns n:ns) -> go (accf `shiftR` 2 .|. (codef U.! fromIntegral n) `shiftL` 30) (i+1)
                            (accr `shiftL` 2 .|. (coder U.! fromIntegral n)) (ir-1) ns

    -- These codes are chosen so that ambiguity codes result in zeroes.
    -- The seed word 0, which would otherwise be the low-complexity and
    -- useless poly-A, is later ignored.
    codef, coder :: U.Vector Int
    codef = U.fromList [0,0,1,0,2,0,0,0,3,0,0,0,0,0,0,0]
    coder = U.fromList [0,3,2,0,1,0,0,0,0,0,0,0,0,0,0,0]

-- Turns a list of seed words into a map.  Only the first entry is used,
-- duplicates are discarded silenty.

data I2 = I2 !Int !Int

newtype SeedMap = SM { unSM :: IM.IntMap Int }
  deriving Show

create_seed_map ::  [Nucleotides] -> SeedMap
create_seed_map = SM . cleanup . foldl' (\m (k,v) -> IM.insertWith' add k v m) IM.empty .
                  map (\(x,y) -> (x,(I2 1 y))) . create_seed_words . pad
  where pad ns = ns ++ take 15 ns
        add (I2 x i) (I2 y _) = I2 (x+y) i
        cleanup = IM.mapMaybe $ \(I2 n j) -> if n < 8 then Just j else Nothing

create_seed_maps :: [[Nucleotides]] -> SeedMap
create_seed_maps = SM . IM.unionsWith const . map (unSM . create_seed_map)

-- | Actual seeding.  We take every hit and guesstimate an alignment
-- region from it (by adding the overhanging sequence parts and rounding
-- a bit up).  Regions are overlapped into larger ones, counting votes.
-- The region with the most votes is used as seed region.  (This will
-- occasionally result in a very long initial alignment.  We can afford
-- that.)
--
-- If we have PE data where only one read is seeded, we can either
-- discard the pair or align the second mate very expensively.  While
-- possible, that sounds rather expensive and should probably depend on
-- the quality of the first mates' alignment.  Generally, we may want to
-- check the quality of the initial alignment anyway.
--
-- For proper overlapping, we need to normalize each region to strictly
-- positive or strictly negative coordinates.  After sorting and
-- overlapping, we only need to check if the last region overlaps the
-- first---there can be only one such overlap per strand.  We should
-- probably discard overly long regions.

do_seed :: Int -> SeedMap -> BamRec -> Maybe (Int,Int)
do_seed ln (SM sm) BamRec{..} = -- do S.hPut stdout $ S.concat [ b_qname, key, ":  ", S.pack (shows b_seq "\n") ]
                   --    mapM_ (\x -> hPutStrLn stdout $ "  " ++ show x) rgns
                   case rgns of
                           [         ] -> Nothing -- putStrLn "discard"
                           {- (a,b,_) : _ | a > 20000 || a < (-20000) -> error $ concat [
                                    "Weird region: ",
                                    shows (a,b,ln) "; ",
                                    "Primitive regions: ",
                                    shows (rgns_fwd ++ rgns_rev) "; ",
                                    "Resulting regions: ",
                                    show rgns ] -}
                           (a,b,_) : _ -> Just (a,b) -- putStrLn $ "seed to " ++ shows a ".." ++ shows b " ("
                                                         --     ++ shows (b-a) "/" ++ shows (V.length b_seq) ")"

  where
    seeds = filter ((/= 0) . fst) $ filter ((/= template) . fst) $
            filter ((>= 0) . snd) $ create_seed_words $ V.toList b_seq

    more x = (x * 9) `div` 8 + 16

    rgns = sortBy (\(_,_,c) (_,_,z) -> compare z c) $ filter reasonably_short $
                (wrap_with        id $ overlap $ sort $ map norm_right rgns_fwd) ++
                (wrap_with norm_left $ overlap $ sort $ map norm_left  rgns_rev)

    (rgns_fwd, rgns_rev) = let put (f,r) (i,j) | j >= 0    = (rgn:f, r)
                                               | otherwise = (f, rgn:r)
                                where rgn = (j - more i, j + more (V.length b_seq - i), 1::Int)
                           in foldl put ([],[]) [ (i,j) | (k,i) <- seeds, j <- maybeToList $ IM.lookup k sm ]

    norm_right (a,b,n) = if a  < 0 then (a+ln, b+ln, n) else (a,b,n)
    norm_left  (a,b,n) = if b >= 0 then (a-ln, b-ln, n) else (a,b,n)

    wrap_with _    [           ] = []
    wrap_with _    [     r     ] = [r]
    wrap_with f rs@((x,y,n):rs')
        | i <= y+ln && x+ln <= j = f (min (x+ln) i, max (y+ln) j, n+m) : init rs'
        | otherwise              = rs
      where
        (i,j,m) = last rs

    overlap ( (x,y,n) : (i,j,m) : rs ) | i <= y = overlap ( (x,max y j,n+m) : rs )
    overlap ( (x,y,n) : rs ) = (x,y,n) : overlap rs
    overlap [] = []

    -- First cut:  reasonable is less than the whole MT.  Tuning can
    -- come later.
    reasonably_short (x,y,_) = y-x < ln