{-# LANGUAGE OverloadedStrings #-}
module SequenceFormats.Eigenstrat (EigenstratSnpEntry(..), EigenstratIndEntry(..),
readEigenstratInd, GenoEntry(..), GenoLine, Sex(..),
readEigenstratSnpStdIn, readEigenstratSnpFile, readBimStdIn, readBimFile,
readEigenstrat, writeEigenstrat, writeEigenstratIndFile, writeEigenstratSnp, writeBim,
writeEigenstratGeno) where
import SequenceFormats.Utils (consumeProducer, SeqFormatException(..), Chrom(..), readFileProd)
import Control.Applicative ((<|>))
import Control.Exception (throw)
import Control.Monad (void, forM_)
import Control.Monad.Catch (MonadThrow)
import Control.Monad.IO.Class (MonadIO, liftIO)
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.Char (isSpace)
import Data.Vector (Vector, fromList, toList)
import qualified Data.ByteString.Char8 as B
import Pipes (Producer, Pipe, (>->), for, cat, yield, Consumer)
import Pipes.Safe (MonadSafe)
import qualified Pipes.Safe.Prelude as PS
import qualified Pipes.Prelude as P
import qualified Pipes.ByteString as PB
import System.IO (withFile, IOMode(..), Handle, hPutStrLn)
data EigenstratSnpEntry = EigenstratSnpEntry {
snpChrom :: Chrom,
snpPos :: Int,
snpGeneticPos :: Double,
snpId :: B.ByteString,
snpRef :: Char,
snpAlt :: Char
} deriving (Eq, Show)
data EigenstratIndEntry = EigenstratIndEntry String Sex String deriving (Eq, Show)
data Sex = Male | Female | Unknown deriving (Eq, Show)
data GenoEntry = HomRef | Het | HomAlt | Missing deriving (Eq, Show)
type GenoLine = Vector GenoEntry
eigenstratSnpParser :: A.Parser EigenstratSnpEntry
eigenstratSnpParser = do
snpId_ <- A.skipMany A.space >> word
chrom <- A.skipMany1 A.space >> word
geneticPos <- A.skipMany1 A.space >> A.double
pos <- A.skipMany1 A.space >> A.decimal
ref <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGN")
alt <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGX")
void A.endOfLine
return $ EigenstratSnpEntry (Chrom (B.unpack chrom)) pos geneticPos snpId_ ref alt
bimParser :: A.Parser EigenstratSnpEntry
bimParser = do
chrom <- word
snpId_ <- A.skipMany1 A.space >> word
geneticPos <- A.skipMany1 A.space >> A.double
pos <- A.skipMany1 A.space >> A.decimal
ref <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGN")
alt <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGX")
void A.endOfLine
return $ EigenstratSnpEntry (Chrom (B.unpack chrom)) pos geneticPos snpId_ ref alt
word :: A.Parser B.ByteString
word = A.takeTill isSpace
eigenstratIndParser :: A.Parser EigenstratIndEntry
eigenstratIndParser = do
A.skipMany A.space
name <- word
A.skipMany1 A.space
sex <- parseSex
A.skipMany1 A.space
popName <- word
void A.endOfLine
return $ EigenstratIndEntry (B.unpack name) sex (B.unpack popName)
parseSex :: A.Parser Sex
parseSex = parseMale <|> parseFemale <|> parseUnknown
where
parseMale = A.char 'M' >> return Male
parseFemale = A.char 'F' >> return Female
parseUnknown = A.char 'U' >> return Unknown
readEigenstratInd :: (MonadIO m) => FilePath -> m [EigenstratIndEntry]
readEigenstratInd fn =
liftIO . withFile fn ReadMode $ \handle ->
P.toListM $ consumeProducer eigenstratIndParser (PB.fromHandle handle)
eigenstratGenoParser :: A.Parser GenoLine
eigenstratGenoParser = do
line <- A.takeWhile1 isValidNum
void A.endOfLine
return . fromList $ do
l <- B.unpack line
case l of
'0' -> return HomAlt
'1' -> return Het
'2' -> return HomRef
'9' -> return Missing
_ -> error "this should never happen"
where
isValidNum c = c == '0' || c == '1' || c == '2' || c == '9'
readEigenstratSnpStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m ()
readEigenstratSnpStdIn = consumeProducer eigenstratSnpParser PB.stdin
readEigenstratSnpFile :: (MonadSafe m) => FilePath -> Producer EigenstratSnpEntry m ()
readEigenstratSnpFile = consumeProducer eigenstratSnpParser . readFileProd
readBimStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m ()
readBimStdIn = consumeProducer bimParser PB.stdin
readBimFile :: (MonadSafe m) => FilePath -> Producer EigenstratSnpEntry m ()
readBimFile = consumeProducer bimParser . readFileProd
readEigenstrat :: (MonadSafe m) => FilePath
-> FilePath
-> FilePath
-> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ())
readEigenstrat genoFile snpFile indFile = do
indEntries <- readEigenstratInd indFile
let snpProd = readEigenstratSnpFile snpFile
genoProd = consumeProducer eigenstratGenoParser (readFileProd genoFile) >->
validateEigenstratEntries (length indEntries)
return (indEntries, P.zip snpProd genoProd)
validateEigenstratEntries :: (MonadThrow m) => Int -> Pipe GenoLine GenoLine m ()
validateEigenstratEntries nr = for cat $ \line ->
if length line /= nr
then do
let msg = "inconsistent nr of genotypes (" <> show (length line) <> ", but should be " <> show nr <> ") in \
\genotype line " <> show line
throw (SeqFormatException msg)
else
yield line
writeEigenstratIndFile :: (MonadIO m) => FilePath -> [EigenstratIndEntry] -> m ()
writeEigenstratIndFile f indEntries =
liftIO . withFile f WriteMode $ \h ->
forM_ indEntries $ \(EigenstratIndEntry name sex popName) ->
hPutStrLn h $ name <> "\t" <> sexToStr sex <> "\t" <> popName
where
sexToStr sex = case sex of
Male -> "M"
Female -> "F"
Unknown -> "U"
writeEigenstratSnp :: (MonadIO m) => Handle
-> Consumer EigenstratSnpEntry m ()
writeEigenstratSnp snpFileH =
let snpOutTextConsumer = PB.toHandle snpFileH
toTextPipe = P.map (\(EigenstratSnpEntry chrom pos gpos gid ref alt) ->
let snpLine = B.intercalate "\t" [gid, B.pack (unChrom chrom), B.pack (show gpos),
B.pack (show pos), B.singleton ref, B.singleton alt]
in snpLine <> "\n")
in toTextPipe >-> snpOutTextConsumer
writeBim :: (MonadIO m) => Handle
-> Consumer EigenstratSnpEntry m ()
writeBim snpFileH =
let snpOutTextConsumer = PB.toHandle snpFileH
toTextPipe = P.map (\(EigenstratSnpEntry chrom pos gpos gid ref alt) ->
B.intercalate "\t" [B.pack (unChrom chrom), gid, B.pack (show gpos), B.pack (show pos), B.singleton ref, B.singleton alt])
in toTextPipe >-> snpOutTextConsumer
writeEigenstratGeno :: (MonadIO m) => Handle
-> Consumer GenoLine m ()
writeEigenstratGeno genoFileH =
let genoOutTextConsumer = PB.toHandle genoFileH
toTextPipe = P.map (\genoLine ->
let genoLineStr = B.concat . map (B.pack . show . toEigenStratNum) . toList $ genoLine
in genoLineStr <> "\n")
in toTextPipe >-> genoOutTextConsumer
where
toEigenStratNum c = case c of
HomRef -> 2 :: Int
Het -> 1
HomAlt -> 0
Missing -> 9
writeEigenstrat :: (MonadSafe m) => FilePath
-> FilePath
-> FilePath
-> [EigenstratIndEntry]
-> Consumer (EigenstratSnpEntry, GenoLine) m ()
writeEigenstrat genoFile snpFile indFile indEntries = do
liftIO $ writeEigenstratIndFile indFile indEntries
let snpOutConsumer = PS.withFile snpFile WriteMode writeEigenstratSnp
genoOutConsumer = PS.withFile genoFile WriteMode writeEigenstratGeno
P.tee (P.map fst >-> snpOutConsumer) >-> P.map snd >-> genoOutConsumer