module Bio.Alignment.ACE (readACE, writeACE, Assembly(..), ptest, reads ) where
import Prelude hiding (lines,words,readFile,unwords
,reads,pred
)
import Bio.Sequence.SeqData
import Bio.Alignment.AlignData (Dir(..), Gaps, Alignment, extractGaps)
import qualified Data.ByteString.Lazy.Char8 as B
import Data.ByteString.Lazy.Char8 (ByteString,words,pack,unpack,readFile,unwords)
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Pos (newPos)
import Control.Monad (liftM)
import Data.Char (chr)
data Assembly = Asm { contig :: (Sequence Nuc,Gaps), fragments :: Alignment Nuc}
deriving Show
reads :: Assembly -> Alignment Nuc
reads = fragments
type Str = ByteString
data ACE = AS Str Str | CO Str Str Str Str Str
| BQ | AF Str Str Str | BS [Str]
| RD Str Str Str Str
| QA Str Str Str Str Str
| DS [Str] | Other [Str] | Empty
deriving (Eq)
instance Show ACE where
show (AS x y) = "AS "++uw [x,y]
show (CO a b c d e) = "CO "++uw [a,b,c,d,e]
show BQ = "BQ"
show (AF a b c) = "AF "++uw [a,b,c]
show (RD a b c d) = "RD "++uw [a,b,c,d]
show (QA a b c d e) = "QA "++uw [a,b,c,d,e]
show (DS ss) = "DS "++uw ss
show (Other ss) = uw ss
show Empty = "(blank)"
show _ = "unknown ACE string"
uw :: [Str] -> String
uw = unpack . unwords
type AceParser a = GenParser (SourcePos,ACE) () a
parse1 :: (ACE -> Maybe a) -> AceParser a
parse1 p = token sho pos pred
where sho (_,t) = show t
pos (n,_) = n
pred (_,t) = p t
ptest :: Show a => String -> AceParser a -> [ACE] -> IO ()
ptest m p = parseTest p . source m
source :: String -> [ACE] -> [(SourcePos,ACE)]
source m = zip (iterate (\sp -> incSourceLine sp 1) (newPos m 1 0))
ace :: AceParser [[Assembly]]
ace = many1 ace1
ace1 :: AceParser [Assembly]
ace1 = do
as
many blank
many (do ctg >>= asm)
as :: AceParser (Int,Int)
as = parse1 (\t -> case t of AS cs rs -> do c <- B.readInt cs
r <- B.readInt rs
return (fst c,fst r)
_ -> Nothing) <?> "AS <int> <int>"
blank :: AceParser ()
blank = parse1 (\t -> case t of Empty -> Just ()
_ -> Nothing) <?> "empty line"
ctg :: AceParser (Sequence Nuc,Gaps)
ctg = do
name <- co
sd <- sdata
let (sd',gaps) = extractGaps sd
many blank
msq <- do bq
sq <- qdata
return (Just sq)
<|> return Nothing
many blank
return (Seq name sd' msq,gaps)
co, sdata, qdata :: AceParser Str
co = parse1 (\t -> case t of CO name a b c _compl -> do
_bs <- B.readInt a
_rds <- B.readInt b
_seg <- B.readInt c
return name
_ -> Nothing) <?> "CO name <int> <int> <int> bool"
sdata = do return . B.concat =<< many1 sdata1
where sdata1 = parse1 (\t -> case t of Other sd -> Just (unwords sd)
_ -> Nothing) <?> "sequence data"
qdata = do return . B.concat =<< many1 qdata1
where qdata1 = parse1 (\t -> case t of Other sd -> liftM (pack . map chr) (readInts sd)
_ -> Nothing) <?> "sequence data"
readInts :: [ByteString] -> Maybe [Int]
readInts [] = Just []
readInts (x:xs) = do (i,_) <- B.readInt x
is <- readInts xs
return $ (i:is)
bq :: AceParser ()
bq = parse1 (\t -> case t of BQ -> Just (); _ -> Nothing) <?> "BQ"
asm :: (Sequence Nuc,Gaps) -> AceParser Assembly
asm cg = do
many blank
afs <- many1 af
_bss <- many bs
many blank
rds cg afs
af :: AceParser (Str,Dir,Offset)
af = parse1 (\t -> case t of AF a b c -> mkAF a b c
_ -> Nothing) <?> "AF name (U|C) pad_start"
where mkAF a b c = do
b' <- case unpack b of "U" -> Just Fwd; "C" -> Just Rev; _ -> Nothing
c' <- liftM (fromIntegral . fst) (B.readInt c)
return (a,b',c')
bs :: AceParser (Int,Int,Str)
bs = parse1 (\t -> case t of BS [x,y,n] -> do
x' <- readInt' x
y' <- readInt' y
return (x',y',n)
_ -> Nothing) <?> "BS x y name"
readInt' :: Str -> Maybe Int
readInt' = liftM (fromIntegral . fst) . B.readInt
rds :: (Sequence Nuc,Gaps) -> [(Str,Dir,Offset)] -> AceParser Assembly
rds cg xs = do
r <- many1 rseq
let f (_name,d,off) (s,gs) = (off,d,s,gs)
return $ Asm { contig = cg, fragments = zipWith f xs r }
rseq :: AceParser (Sequence Nuc,Gaps)
rseq = do
(rn,_len,_,_) <- rd
(s,gaps) <- return . extractGaps =<< sdata
many1 blank
qa
ds
many blank
return (Seq rn s Nothing,gaps)
rd :: AceParser (Str,Int,Int,Int)
rd = parse1 (\t -> case t of RD a b c d -> do [x,y,z] <- readInts [b,c,d]
return (a,x,y,z)
_ -> Nothing) <?> "RD <string> <int> <int> <int>"
qa :: AceParser ()
qa = parse1 (\_ -> Just ())
ds :: AceParser ()
ds = parse1 (\t -> case t of DS _ -> Just (); _ -> Nothing) <?> "DS"
tokenize :: ByteString -> [ACE]
tokenize = map tokenize1 . B.lines
tokenize1 :: ByteString -> ACE
tokenize1 l = case words l of
[] -> Empty
(h:ws) -> case (unpack h,ws) of
("AS",[cs,rs]) -> AS cs rs
("CO",[nm,bss,rs,segs,comp]) -> CO nm bss rs segs comp
("BQ",[]) -> BQ
("AF",[a,b,c]) -> AF a b c
("BS",_) -> BS ws
("RD",[a,b,c,d]) -> RD a b c d
("QA",[a,b,c,d,e]) -> QA a b c d e
("DS",_) -> DS ws
_ -> Other (h:ws)
readACE :: FilePath -> IO [[Assembly]]
readACE f = parseit =<< B.readFile f
where parseit = \s -> case (parse ace f . source f . tokenize) s of
Left e -> fail (show e)
Right a -> return a
writeACE :: FilePath -> [Assembly] -> IO ()
writeACE = undefined