{-# LANGUAGE OverloadedStrings, BangPatterns, RecordWildCards #-} {-# OPTIONS_GHC -Wall #-} module Align where import Bio.Base import Bio.Bam import Control.Applicative import Control.Monad import Control.Monad.ST (runST) import Data.Bits import Data.List (group) import Data.Sequence ( (<|), (><), ViewL((:<)) ) import qualified Data.Foldable as F import qualified Data.Sequence as Z import qualified Data.Vector.Generic as V import qualified Data.Vector.Storable as S import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Unboxed.Mutable as UM data Base = A | C | G | T | None deriving (Eq, Ord, Enum, Show) -- | For a reference sequence, we store five(!) probabilities for each -- base in phred format. The fifth is the probability of a gap. newtype RefSeq = RS (U.Vector Word8) deriving Show refseq_len :: RefSeq -> Int refseq_len (RS v) = U.length v `div` 5 prob_of :: Base -> Int -> RefSeq -> Word8 prob_of b i (RS v) = indexV "prob_of" v ( 5*i + fromEnum b ) -- | Turns a sequence into probabilities. @Right n@ is an ordinary -- 'Nucleotide', @Left n@ is one we think might be absent (e.g. because -- it was soft masked in the input). prep_reference :: [Either Nucleotides Nucleotides] -> RefSeq prep_reference = RS . U.concat . map (either (to probG) (to probB)) where to ps n = U.slice (5 * fromIntegral (unNs n)) 5 ps -- XXX we should probably add some noise here, so the placement of -- gaps isn't completely random, but merely unpredictable probB = U.fromListN 80 $ concatMap (\l -> l ++ [255]) raw_probs probG = U.fromListN 80 $ concatMap (\l -> map (+3) l ++ [3]) raw_probs raw_probs = [[ 25, 25, 25, 25 ] -- 0 ,[ 0, 25, 25, 25 ] -- A ,[ 25, 0, 25, 25 ] -- C ,[ 3, 3, 25, 25 ] -- M ,[ 25, 25, 0, 25 ] -- G ,[ 3, 25, 3, 25 ] -- R ,[ 25, 3, 3, 25 ] -- S ,[ 5, 5, 5, 25 ] -- V ,[ 25, 25, 25, 0 ] -- T ,[ 3, 25, 25, 3 ] -- W ,[ 25, 3, 25, 3 ] -- Y ,[ 5, 5, 25, 5 ] -- H ,[ 25, 25, 3, 3 ] -- K ,[ 5, 25, 5, 5 ] -- D ,[ 25, 5, 5, 5 ] -- B ,[ 6, 6, 6, 6 ]] -- N -- | Encoding of the query: one word per position, the two lowest bits -- encode the base, the rest is the quality score (shifted left by 2). newtype QuerySeq = QS { unQS :: U.Vector Word8 } deriving Show -- | Prepare query for subsequent alignment to the forward strand. prep_query_fwd :: BamRec -> QuerySeq prep_query_fwd BamRec{..} = QS $ U.fromListN len $ zipWith pair (V.toList b_seq) (V.toList b_qual) where pair b (Q q) = q `shiftL` 2 .|. indexV "prep_query_fwd" code (fromIntegral $ unNs b) code = U.fromListN 16 [0,0,1,0,2,0,0,0,3,0,0,0,0,0,0,0] len = V.length b_seq prep_query_rev :: BamRec -> QuerySeq prep_query_rev = revcompl_query . prep_query_fwd where revcompl_query (QS v) = QS $ U.map (xor 3) $ U.reverse v qseqToBamSeq :: QuerySeq -> Vector_Nucs_half Nucleotides qseqToBamSeq = V.fromList . U.toList . U.map (\x -> Ns $ 1 `shiftL` fromIntegral (x .&. 3)) . unQS qseqToBamQual :: QuerySeq -> S.Vector Qual qseqToBamQual = S.convert . U.map (Q . (`shiftR` 2)) . unQS -- | Memoization matrix for dynamic programming. We understand it as a -- matrix B columns wide and L rows deep, where B is the bandwidth and L -- the query length. Successive rows are understood to be skewed to the -- right. (This means all operations need the bandwidth as an -- argument.) newtype MemoMat = MemoMat (U.Vector Float) deriving Show newtype Bandwidth = BW Int deriving Show newtype RefPosn = RP Int deriving Show data AlignResult = AlignResult { viterbi_forward :: MemoMat -- DP matrix from running Viterbi , viterbi_score :: Float -- alignment score (log scale, vs. radom alignment) , viterbi_position :: Int -- position (start of the most probable alignment) , viterbi_backtrace :: S.Vector Cigar } -- backtrace (most probable alignment) deriving Show data Traced = Tr { tr_op :: CigOp, tr_score :: Float } instance Eq Traced where Tr _ a == Tr _ b = a == b instance Ord Traced where Tr _ a `compare` Tr _ b = a `compare` b -- | All sorts of alignment shit collected in one place, mostly so I can -- reuse the scoring functions. align :: Float -> RefSeq -> QuerySeq -> RefPosn -> Bandwidth -> AlignResult align gp (RS rs) (QS qs) (RP p0) (BW bw_) = runST (do let bw = abs bw_ v <- UM.unsafeNew $ bw * U.length qs + bw let readV row col | row < 0 || col < 0 || col >= bw || row > U.length qs = error $ "Read from memo: " ++ show (row,col) | ix < 0 || ix >= UM.length v = error $ "Read from memo: " ++ show ix | otherwise = UM.read v ix where ix = bw*row + col let score qpos _ | qpos < 0 || qpos >= U.length qs = error $ "Read from QS: " ++ show qpos score qpos rpos = let base = (indexV "align/score/base" qs qpos) .&. 3 :: Word8 qual = (indexV "align/score/qual" qs qpos) `shiftR` 2 :: Word8 prob = let ix = 5*rpos + fromIntegral base in if ix < 0 then error ("Huh? " ++ show ix) else if ix < U.length rs then indexV "align/score/prob/A" rs ix else if ix - U.length rs < U.length rs then indexV "align/score/prob/B" rs (ix - U.length rs) :: Word8 else error ("Huh? " ++ show (ix,qpos,rpos,p0,base)) -- Improbability of a mismatch, it's the -- probability of the reference not being -- correct or the query not being correct, -- whichever is higher. mismatch = fromIntegral (min qual prob) -- Improbability of a random match. It's -- 6 if we have a good base, corresponding -- to randomness. If we have a bad base, -- it's lower, because we aren't doing -- better than random. randmatch = fromIntegral (min qual 6) -- Score is our mismatch probability vs. -- random sequences. Note that this ends -- up being 0 for low quality bases, -6 -- for high quality matches, and 30+ for -- high quality mismatches. in mismatch - randmatch let gscore rpos = let prob = let ix = 5*rpos + 4 in if ix < 0 then error ("Huh? " ++ show ix) else if ix < U.length rs then indexV "align/gscore/prob/A" rs ix else if ix - U.length rs < U.length rs then indexV "align/gscore/prob/B" rs (ix - U.length rs) :: Word8 else error ("Huh? " ++ show (ix,rpos,p0)) in min gp $ fromIntegral prob let match row col = Tr Mat . (+ score (row-1) (p0+row+col-1)) <$> readV (row-1) (col+0) let gapH row col = Tr Del . (+ gscore (p0+row+col-1)) <$> readV (row+0) (col-1) let gapV row col = Tr Ins . (+ gp) <$> readV (row-1) (col+1) let cell row col = do x <- if row == 0 then return (Tr Nop 0) else match row col y <- if col == 0 then return x else min x <$> gapH row col z <- if row == 0 || col == bw-1 then return y else min y <$> gapV row col return z -- Fill the DP matrix. XXX: there's got to be way to express this -- using 'Vector's bulk operations. Would that be more readable? -- Faster? forM_ [0 .. U.length qs] $ \row -> forM_ [0 .. bw-1] $ \col -> UM.write v (bw*row + col) . tr_score =<< cell row col let pack_cigar = S.fromList . map (\x -> head x :* length x) . group let traceback acc row col = do op <- tr_op <$> cell row col case op of Mat -> traceback (Mat:acc) (row-1) (col+0) Ins -> traceback (Ins:acc) (row-1) (col+1) Del -> traceback (Del:acc) (row+0) (col-1) Nop | row == 0 -> return (p0+col, pack_cigar acc) viterbi_forward <- MemoMat <$> U.unsafeFreeze v (viterbi_score, mincol) <- minimum . flip zip [0..] <$> mapM (readV (U.length qs)) [0..bw-1] (viterbi_position, viterbi_backtrace) <- traceback [] (U.length qs) mincol return $ AlignResult{..}) -- For each position, a vector of pseudocounts in the same order as in -- 'RefSeq', followed by the same for based inserted after the current -- one. newtype NewRefSeq = NRS (Z.Seq NewColumn) -- Inserts come (conceptually) before the base whose coordinate they -- bear. So every column has inserts first, then the single aligned -- base. data NewColumn = NC { nc_inserts :: !(U.Vector Float) , nc_base :: !(U.Vector Float) } new_ref_seq :: RefSeq -> NewRefSeq new_ref_seq rs = NRS $ Z.replicate (refseq_len rs) (NC (U.replicate 0 0) (U.replicate 5 0)) mkNC :: U.Vector Float -> U.Vector Float -> NewColumn mkNC !i !b | U.length b /= 5 = error "mkNC" | otherwise = NC i b -- Add an alignment to the new reference. We compute the quality of the -- alignment (probability that it belongs vs. probability that it's -- random), that's how many votes we're going to cast. (A perfect -- alignment gives a whole vote, a random one gives none. Call this -- with an alignment that's worse than random at your own peril.) -- If we're voting for a base, we vote for the called one according to -- its quality and for all others with the error probability. -- A deletion is a vote against all bases, an insert is a vote for how -- ever many bases. The first five values sum up to the total votes so -- far, and they all count as votes against any further extension to an -- insert. We start with five pseudo-votes to get the numerics under -- control (or to have a uniform Dirichlet-prior, if you prefer). -- -- Note that this logic was arrived at by "thinking hard". A clean way -- to do it is to maximize the alignment score expected in the next -- round, assuming the alignments do not change. It might work out to -- the same thing... who knows? add_to_refseq :: NewRefSeq -> QuerySeq -> AlignResult -> NewRefSeq add_to_refseq (NRS nrs0) (QS qs0) AlignResult{..} = NRS $ rotateZ (Z.length nrs0 - viterbi_position) $ mat here back qs0 $ S.toList viterbi_backtrace where here :< back = Z.viewl $ rotateZ viterbi_position nrs0 rotateZ n = uncurry (flip (><)) . Z.splitAt n !odds = 10 ** (-viterbi_score / 10) -- often huge, !votes = 1 - recip (1+odds) -- often exactly 1 -- Grrr, this isn't going to work. We'll split it: -- One function deals with inserts. As long as we get inserted -- bases, we vote for them. Then we vote against the remainder and -- pass the buck. -- The other deals with a base. We vote for it if we matched it, -- against it if we deleted it. Then we recurse. ins !nc@(NC is b) !nrs !nins !qs cigs = case cigs of [ ] -> nc <| nrs ( _ :* 0 :cs) -> ins nc nrs nins qs cs (Ins :* n :cs) -> let is' = vote_for_at votes (U.sum b) nins (U.head qs) is in ins (mkNC is' b) nrs (nins+1) (U.tail qs) (Ins :* (n-1) : cs) _ -> let is' = vote_against_from votes nins is in mat (mkNC is' b) nrs qs cigs mat !nc@(NC is b) !nrs !qs cigs = case cigs of [ ] -> nc <| nrs ( _ :* 0 :cs) -> mat nc nrs qs cs (Del :* n :cs) -> let nc2 :< rest = Z.viewl nrs b' = vote_against votes b in mkNC is b' let nc2 :< rest = Z.viewl nrs b' = vote_for votes (U.head qs) b in mkNC is b' ins nc nrs (0::Int) qs cigs ( Int -> U.Vector Float -> U.Vector Float -- vote_against_from votes ix ps | trace ("vote_against_from " ++ show (ix, U.length ps)) False = undefined vote_against_from votes ix ps = U.accum (+) ps [(i,votes) | i <- [ix+4, ix+9 .. U.length ps-1]] vote_against :: Float -> U.Vector Float -> U.Vector Float -- vote_against votes ps | trace ("vote_against " ++ show (U.length ps)) False = undefined vote_against votes ps = U.accum (+) ps [(4,votes)] vote_for :: Float -> Word8 -> U.Vector Float -> U.Vector Float vote_for votes = vote_for_at votes 0 0 vote_for_at :: Float -> Float -> Int -> Word8 -> U.Vector Float -> U.Vector Float vote_for_at votes v0 idx bq ps = U.accum (+) ps' $ (base+5*idx,pt) : [(5*idx+i,pe)|i<-[0,1,2,3]] where base = fromIntegral $ bq .&. 3 qual = bq `shiftR` 2 perr = 10 ** (fromIntegral qual * (-0.1)) pe = votes * perr / 3 pt = votes * (1 - perr) - pe ps' | U.length ps >= 5*idx+5 = ps | otherwise = U.concat (ps : replicate (idx+1 - U.length ps `div` 5) (U.fromList [0,0,0,0,v0])) -- Back to compact representation. Every group of five votes gets -- converted to five probabilities, and those to quality scores. Then -- we concatenate. finalize_ref_seq :: NewRefSeq -> (RefSeq, XTab) finalize_ref_seq (NRS z) = ( RS $ U.concat $ F.foldr unpack [] z , Z.fromList $ scanl (+) 0 $ F.foldr tolen [] z) where unpack (NC ins bas) k = map5 call ins ++ call bas : k map5 f v = [ f (U.slice i 5 v) | i <- [0, 5 .. U.length v - 5] ] call v = U.map (\x -> round $ (-10) / log 10 * log ((x+1) / total)) v where total = U.sum v + 5 tolen (NC ins _bas) k = U.length ins `div` 5 + 1 : k -- Table for coordinate translation type XTab = Z.Seq Int {-# INLINE indexV #-} indexV :: String -> U.Vector Word8 -> Int -> Word8 -- indexV m v i | i < 0 = error $ m ++ ": index too large" -- | i >= U.length v = error $ m ++ ": negative index" -- | otherwise = v U.! i indexV _ = (U.!)