{-# LANGUAGE BangPatterns #-} -- Re-wrap alignments to obey the given length of the reference -- sequence. -- -- The idea is that a circular reference sequence has been extended -- artificially to facilitate alignment. Now the declared length in the -- header is wrong, and the alignments overhang the end. Here we split -- those alignments into two, one for the beginning, one for the end of -- the sequence, then soft-mask out the inappropriate parts. -- -- What's the best course of action, operationally? As usual, we need -- to decide whether to rely on sorted input and whether to produce -- sorted output, and how much to copy senselessly. -- -- In a sane world, this program runs precisely once, after alignment, -- and output is piped somewhere. So that's what we do: input is -- unsorted, so is output, output is piped (and hence uncompressed). -- We also fix the header while we're at it. -- -- We try to fix the map quality for the affected reads as follows: if -- a read has map quality 0 (meaning multiple equally good hits), we -- check the XA field. If it reports exactly one additional alignment, -- and it matches the primary alignment when transformed to canonical -- coordinates, we remove XA and set MAPQ to 37. import Bio.Bam import Bio.Bam.Rmdup import Control.Monad ( when ) import Data.Foldable ( toList ) import Data.Version ( showVersion ) import Paths_biohazard ( version ) import System.Environment ( getArgs, getProgName ) import System.Exit ( exitFailure ) import System.IO ( hPutStr ) import qualified Data.ByteString.Char8 as S import qualified Data.Map as M import qualified Data.Sequence as Z usage :: IO a usage = do pn <- getProgName hPutStr stderr $ pn ++ ", version " ++ showVersion version ++ "\nUsage: " ++ pn ++ " [chrom:length...]\n\ \Pipes a BAM file from stdin to stdout and for every 'chrom'\n\ \mentioned on the command line, wraps alignments to a new \n\ \target length of 'length'.\n" exitFailure main :: IO () main = getArgs >>= \args -> when (null args) usage >>= \_ -> enumHandle defaultBufSize stdin >=> run $ joinI $ decodeAnyBam $ \hdr -> do add_pg <- liftIO (addPG $ Just version) let (ltab, seqs') = parseArgs (meta_refs hdr) args joinI $ mapChunks (concatMap (rewrap (M.fromList ltab) . unpackBam)) $ protectTerm $ pipeBamOutput (add_pg hdr { meta_refs = seqs' }) parseArgs :: Refs -> [String] -> ([(Refseq,(Int,S.ByteString))], Refs) parseArgs refs | Z.null refs = error $ "no target sequences found (empty input?)" | otherwise = foldl parseArg ([],refs) where parseArg (sqs, h) arg = case break (==':') arg of (nm,':':r) -> case reads r of [(l,[])] | l > 0 -> case filter (S.isPrefixOf (S.pack nm) . sq_name . snd) $ zip [0..] $ toList h of [(k,a)] | sq_length a >= l -> ( (Refseq $ fromIntegral k,(l, sq_name a)):sqs, Z.update k (a { sq_length = l }) h ) | otherwise -> error $ "cannot wrap " ++ show nm ++ " to " ++ show l ++ ", which is more than the original " ++ show (sq_length a) [] -> error $ "no match for target sequence " ++ show nm _ -> error $ "target sequence " ++ show nm ++ " is ambiguous" _ -> error $ "couldn't parse length " ++ show r ++ " for " ++ show nm _ -> error $ "couldn't parse argument " ++ show arg -- | This runs both stages of the rewrapping: First normalize alignments -- (POS must be in the canonical interval) and fix XA, MPOS, MAPQ where -- appropriate, then duplicate the read and softmask the noncanonical -- parts. Rmdup fits in between the two, hence the split rewrap :: M.Map Refseq (Int,S.ByteString) -> BamRec -> [BamRec] rewrap m b = maybe [b] (\(l,nm) -> wrapTo l $ normalizeTo nm l b) $ M.lookup (b_rname b) m