{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module Hyrax.Abif.Generate
( generateAb1s
, generateAb1
, readWeightedFasta
, iupac
, unIupac
) where
import Protolude
import qualified Data.Text as Txt
import qualified Data.Text.Encoding as TxtE
import qualified Data.List as Lst
import qualified Data.Binary.Put as B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BSL
import qualified System.FilePath as FP
import System.FilePath ((</>))
import qualified System.Directory as Dir
import Hyrax.Abif
import Hyrax.Abif.Write
import Hyrax.Abif.Fasta
data TraceData = TraceData { trData09G :: ![Int16]
, trData10A :: ![Int16]
, trData11T :: ![Int16]
, trData12C :: ![Int16]
, trValsPerBase :: !Int
, trFasta :: !Text
} deriving (Show)
generateAb1s :: FilePath -> FilePath -> IO ()
generateAb1s source dest = do
Dir.createDirectoryIfMissing True dest
weighted <- readWeightedFastas source
case weighted of
Left e -> putText e
Right rs -> do
let ab1s = (\(n, r) -> (n, generateAb1 (n, r))) <$> rs
traverse_ (\(name, ab1) -> BS.writeFile (dest </> Txt.unpack name <> ".ab1") $ BSL.toStrict ab1) ab1s
generateAb1 :: (Text, [(Double, Text)]) -> BSL.ByteString
generateAb1 (fName, sourceFasta) =
let
tr = generateTraceData sourceFasta
valsPerBase = trValsPerBase tr
generatedFastaLen = (Txt.length $ trFasta tr)
midPeek = valsPerBase `div` 2
peakLocations = take generatedFastaLen [midPeek, valsPerBase + midPeek..]
sampleName = fst . Txt.breakOn "_" $ fName
dirs = [ mkData 9 $ trData09G tr
, mkData 10 $ trData10A tr
, mkData 11 $ trData11T tr
, mkData 12 $ trData12C tr
, mkBaseOrder BaseG BaseA BaseT BaseC
, mkLane 1
, mkCalledBases $ trFasta tr
, mkMobilityFileName 1 "KB_3500_POP7_BDTv3.mob"
, mkMobilityFileName 2 "KB_3500_POP7_BDTv3.mob"
, mkPeakLocations $ fromIntegral <$> peakLocations
, mkDyeSignalStrength 53 75 79 48
, mkSampleName sampleName
, mkComment "Generated by HyraxBio AB1 generator"
]
abif = Abif { aHeader = mkHeader
, aRootDir = mkRoot
, aDirs = dirs
}
in
B.runPut (putAbif abif)
generateTraceData :: [(Double, Text)] -> TraceData
generateTraceData weighted =
let
weightedNucs' = (\(w, ns) -> (w,) . unIupac <$> Txt.unpack ns) <$> weighted
weightedNucs = Lst.transpose weightedNucs'
curve = [0, 0, 128, 512, 1024, 1024, 512, 128, 0, 0]
valsPerBase = length curve
data09G = concat $ getWeightedTrace curve 'G' <$> weightedNucs
data10A = concat $ getWeightedTrace curve 'A' <$> weightedNucs
data11T = concat $ getWeightedTrace curve 'T' <$> weightedNucs
data12C = concat $ getWeightedTrace curve 'C' <$> weightedNucs
fastaSeq = concat <$> (snd <<$>> weightedNucs)
fasta = Txt.pack $ iupac fastaSeq
in
TraceData { trData09G = data09G
, trData10A = data10A
, trData11T = data11T
, trData12C = data12C
, trFasta = fasta
, trValsPerBase = valsPerBase
}
where
getWeightedTrace :: [Int] -> Char -> [(Double, [Char])] -> [Int16]
getWeightedTrace curve nuc ws =
let
found = filter ((nuc `elem`) . snd) ws
score' = foldl' (+) 0 $ fst <$> found
score = min 1 . max 0 $ score'
wave = floor . (score *) . fromIntegral <$> curve
in
wave
readWeightedFasta :: ByteString -> Either Text [(Double, Text)]
readWeightedFasta fastaData =
case parseFasta $ TxtE.decodeUtf8 fastaData of
Left e -> Left e
Right fs -> getWeightedFasta fs
where
getWeightedFasta :: [Fasta] -> Either Text [(Double, Text)]
getWeightedFasta fs =
case sequenceA $ readWeighted <$> fs of
Left e -> Left e
Right r -> Right r
readWeighted :: Fasta -> Either Text (Double, Text)
readWeighted (Fasta hdr dta) =
case (readMaybe . Txt.unpack $ hdr :: Maybe Double) of
Just weight -> Right (min 1 . max 0 $ weight, Txt.strip dta)
Nothing -> Left $ "Invalid header reading, expecting numeric weight, got: " <> hdr
readWeightedFastas :: FilePath -> IO (Either Text [(Text, [(Double, Text)])])
readWeightedFastas source = do
files <- filter (Txt.isSuffixOf ".fasta" . Txt.pack) <$> getFiles source
let names = Txt.pack . FP.takeBaseName <$> files
contents <- traverse BS.readFile files
case sequenceA $ readWeightedFasta <$> contents of
Left e -> pure . Left $ e
Right rs -> pure . Right $ zip names rs
getFiles :: FilePath -> IO [FilePath]
getFiles p = do
entries <- (p </>) <<$>> Dir.listDirectory p
filterM Dir.doesFileExist entries
unIupac :: Char -> [Char]
unIupac c =
case c of
'T' -> "T"
'C' -> "C"
'A' -> "A"
'G' -> "G"
'U' -> "T"
'M' -> "AC"
'R' -> "AG"
'W' -> "AT"
'S' -> "CG"
'Y' -> "CT"
'K' -> "GT"
'V' -> "ACG"
'H' -> "ACT"
'D' -> "AGT"
'B' -> "CGT"
'N' -> "GATC"
'X' -> "GATC"
_ -> ""
iupac :: [[Char]] -> [Char]
iupac ns =
go <$> ns
where
go cs =
let
a = 'A' `elem` cs
c = 'C' `elem` cs
g = 'G' `elem` cs
t = 'T' `elem` cs
in
case (a, c, g, t) of
(True, False, False, False) -> 'A'
(False, True, False, False) -> 'C'
(False, False, True, False) -> 'G'
(False, False, False, True ) -> 'T'
(True, True, False, False) -> 'M'
(True, False, True, False) -> 'R'
(True, False, False, True ) -> 'W'
(False, True, True, False) -> 'S'
(False, True, False, True ) -> 'Y'
(False, False, True, True ) -> 'K'
(True, True, True, False) -> 'V'
(True, True, False, True ) -> 'H'
(True, False, True, True ) -> 'D'
(False, True, True, True ) -> 'B'
(True, True, True, True ) -> 'N'
_ -> '_'