{- Merge Gregory W. Schwartz Collects the functions pertaining to the merging of headers with abundances. -} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE OverloadedStrings #-} module Merge ( mergeAbundance , mergeMates , bamToFasta ) where -- Standard import Data.Function (on) import Data.List import Data.Maybe import Data.Monoid import qualified Data.IntMap.Strict as IMap import qualified Data.Map.Strict as Map -- Cabal import Control.Lens import qualified Data.Text as T import qualified Data.Text.Read as T import Safe import TextShow -- Local import Types import Utility mergeAbundance :: FrequencyMap -> DuplicationRow -> DuplicationRow mergeAbundance (FrequencyMap m) (DuplicationRow row) = DuplicationRow . Map.insert "frequency" (showt . findWithError rowAccession $ m) $ row where rowAccession = flip (Safe.at) 1 . T.splitOn "|" . findWithError "fHeader" $ row -- | Get the position map of a sequence. getPositionMap :: Position -> Sequence -> PositionMap getPositionMap (Position p) = PositionMap . IMap.fromList . zip [p..] . T.unpack . unSequence -- | Fill the gaps in a position sequence list with a character. fillGapsWith :: Fill -> [(Position, Char)] -> [(Position, Char)] -> [(Position, Char)] fillGapsWith _ !acc [] = reverse acc fillGapsWith fill [] (x:xs) = fillGapsWith fill [x] xs fillGapsWith fill acc@((Position !previousPos, _):_) allNext@((Position !currentPos, !currentChar):xs) = if abs (currentPos - previousPos) > 1 then fillGapsWith fill ((Position (previousPos + 1), unFill fill) : acc) allNext else fillGapsWith fill ((Position currentPos, currentChar) : acc) xs -- | Merge two sequences by their positions. mergeSequences :: Fill -> (Position, Sequence) -> (Position, Sequence) -> Sequence mergeSequences fill left = Sequence . T.pack . fmap snd . fillGapsWith fill [] . fmap (over _1 Position) . IMap.toAscList . IMap.union (unPositionMap . uncurry getPositionMap $ left) . unPositionMap . uncurry getPositionMap -- | Merge a mate pair. Assumes that the lines are sorted by mate pair already -- (samtools sort -n). Also assumes that the left mate appears first. Removes -- supplementary alignments. mergePair :: Fill -> [BamRow] -> Maybe BamRow mergePair fill xs = case filter (not . isSupplementary) xs of [] -> Nothing [x] -> Just x [BamRow left, BamRow right] -> Just . BamRow . set (ix 9) ( unSequence $ mergeSequences fill (posSeq left) (posSeq right) ) $ left xs -> error ("Too many mate pairs (even after supplementary removal) for: " <> show xs) where posSeq xs = ( either error (Position . fst) . T.decimal $ xs !! 3 , Sequence $ xs !! 9 ) isSupplementary = (== "1") . drop 11 . decodeSAMFlag . read . T.unpack . (!! 1) . unBamRow -- | Merge all mate pairs. Assumes that the lines are sorted by mate pair already -- (samtools sort -n). mergeMates :: Fill -> [BamRow] -> [BamRow] mergeMates fill = catMaybes . fmap (mergePair fill) . groupBy ((==) `on` (head . unBamRow))