{-# LANGUAGE BinaryLiterals #-}
{-# LANGUAGE OverloadedStrings #-}
module SequenceFormats.Plink (readBimStdIn, readBimFile, writeBim, readFamFile, readPlinkBedFile) where
import SequenceFormats.Eigenstrat (EigenstratIndEntry (..),
EigenstratSnpEntry (..),
GenoEntry (..), GenoLine,
Sex (..))
import SequenceFormats.Utils (Chrom (..), consumeProducer,
readFileProd, word)
import Control.Applicative ((<|>))
import Control.Monad (void)
import Control.Monad.Catch (MonadThrow, throwM)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Trans.State.Strict (runStateT)
import qualified Data.Attoparsec.ByteString as AB
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.Bits (shiftR, (.&.))
import qualified Data.ByteString as BB
import qualified Data.ByteString.Char8 as B
import Data.Vector (fromList)
import Pipes (Consumer, Producer, (>->))
import Pipes.Attoparsec (ParsingError (..), parse)
import qualified Pipes.ByteString as PB
import qualified Pipes.Prelude as P
import Pipes.Safe (MonadSafe, runSafeT)
import qualified Pipes.Safe.Prelude
import System.IO (Handle, IOMode (..),
withFile)
bimParser :: A.Parser EigenstratSnpEntry
bimParser = do
chrom <- word
snpId_ <- A.skipMany1 A.space >> word
geneticPos <- A.skipMany1 A.space >> A.double
pos <- A.skipMany1 A.space >> A.decimal
ref <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGN")
alt <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGX")
void A.endOfLine
return $ EigenstratSnpEntry (Chrom chrom) pos geneticPos snpId_ ref alt
famParser :: A.Parser EigenstratIndEntry
famParser = do
A.skipMany A.space
pop <- word
ind <- A.skipMany1 A.space >> word
_ <- A.skipMany1 A.space >> A.decimal
_ <- A.skipMany1 A.space >> A.decimal
sex <- A.skipMany1 A.space >> parseSex
_ <- A.skipMany1 A.space >> word
void A.endOfLine
return $ EigenstratIndEntry (B.unpack ind) sex (B.unpack pop)
where
parseSex = parseMale <|> parseFemale <|> parseUnknown
parseMale = A.char '1' >> return Male
parseFemale = A.char '2' >> return Female
parseUnknown = A.anyChar >> return Unknown
bedHeaderParser :: AB.Parser ()
bedHeaderParser = do
void $ AB.word8 0b01101100
void $ AB.word8 0b00011011
void $ AB.word8 0b00000001
bedGenotypeParser :: Int -> AB.Parser GenoLine
bedGenotypeParser nrInds = do
let nrBytes = if nrInds `rem` 4 == 0 then nrInds `quot` 4 else (nrInds `quot` 4) + 1
bytes <- BB.unpack <$> AB.take nrBytes
let indBitPairs = concatMap getBitPairs bytes
return . fromList . take nrInds . map bitPairToGenotype $ indBitPairs
where
getBitPairs byte = map (0b00000011 .&.) [byte, shiftR byte 2, shiftR byte 4, shiftR byte 6]
bitPairToGenotype 0b00000000 = HomRef
bitPairToGenotype 0b00000010 = Het
bitPairToGenotype 0b00000011 = HomAlt
bitPairToGenotype 0b00000001 = Missing
bitPairToGenotype _ = error "This should never happen"
readPlinkBedProd :: (MonadThrow m) => Int -> Producer B.ByteString m () -> m (Producer GenoLine m ())
readPlinkBedProd nrInds prod = do
(res, rest) <- runStateT (parse bedHeaderParser) prod
_ <- case res of
Nothing -> throwM $ ParsingError [] "Bed file exhausted prematurely"
Just (Left e) -> throwM e
Just (Right h) -> return h
return $ consumeProducer (bedGenotypeParser nrInds) rest
readPlinkBedFile :: (MonadSafe m) => FilePath -> Int -> m (Producer GenoLine m ())
readPlinkBedFile file nrInds = readPlinkBedProd nrInds (Pipes.Safe.Prelude.withFile file ReadMode PB.fromHandle)
readBimStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m ()
readBimStdIn = consumeProducer bimParser PB.stdin
readBimFile :: (MonadSafe m) => FilePath -> Producer EigenstratSnpEntry m ()
readBimFile = consumeProducer bimParser . readFileProd
readFamFile :: (MonadIO m) => FilePath -> m [EigenstratIndEntry]
readFamFile fn =
liftIO . withFile fn ReadMode $ \handle ->
P.toListM $ consumeProducer famParser (PB.fromHandle handle)
readPlink :: (MonadSafe m) => FilePath
-> FilePath
-> FilePath
-> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ())
readPlink bedFile bimFile famFile = do
indEntries <- readFamFile famFile
let nrInds = length indEntries
snpProd = readBimFile bimFile
genoProd <- readPlinkBedFile bedFile nrInds
return (indEntries, P.zip snpProd genoProd)
writeBim :: (MonadIO m) => Handle
-> Consumer EigenstratSnpEntry m ()
writeBim snpFileH =
let snpOutTextConsumer = PB.toHandle snpFileH
toTextPipe = P.map (\(EigenstratSnpEntry chrom pos gpos gid ref alt) ->
B.intercalate "\t" [unChrom chrom, gid, B.pack (show gpos), B.pack (show pos), B.singleton ref, B.singleton alt])
in toTextPipe >-> snpOutTextConsumer