{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
module Parse
( getAbundanceMap
, getFrequencyMap
, getAccSet
, parseDuplications
, parseBAM
) where
import Data.Char
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import Control.Lens
import Data.Csv
import Data.Text.Read
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Text as T
import qualified Data.Vector as V
import Safe
import Types
import Utility
-- | Build an 'AbundanceMap' from tab-delimited text that starts with a
-- header row.
--
-- Every record contributes a single entry: the key is the record's
-- @target_id@ field and the value is its @est_counts@ field parsed as a
-- 'Double'.  The per-record singleton maps are merged with 'Map.unions'.
--
-- Calls 'error' with the underlying message when the input cannot be
-- decoded or when an @est_counts@ field does not parse as a number.
getAbundanceMap :: B.ByteString -> AbundanceMap
getAbundanceMap contents =
  AbundanceMap (Map.unions (toEntry <$> V.toList records))
  where
    -- Same as the default decoding options, but with tab as the delimiter.
    tabDelimited =
      defaultDecodeOptions { decDelimiter = fromIntegral (ord '\t') }

    -- The decoded records; the header returned by the decoder is dropped.
    records = case decodeByNameWith tabDelimited contents of
      Left err        -> error err
      Right (_, rows) -> rows

    -- One singleton map per record, forcing the record before it is used.
    toEntry !record =
      Map.singleton
        (findWithError ("target_id" :: T.Text) record)
        (parseCount (findWithError ("est_counts" :: T.Text) record))

    -- Keep the parsed number and ignore whatever text follows it.
    parseCount field = case double field of
      Left err         -> error err
      Right (value, _) -> value
-- | Turn abundances into relative frequencies by dividing every value in the
-- map by the sum of all values in the map.
--
-- An empty map yields an empty map, since there is nothing to divide.
--
-- NOTE(review): if the values sum to zero, each entry becomes a division by
-- zero (NaN for 0/0 on 'Double') -- confirm callers never pass all-zero
-- counts.
getFrequencyMap :: AbundanceMap -> FrequencyMap
getFrequencyMap (AbundanceMap m) = FrequencyMap (Map.map (/ totalCount) m)
  where
    -- Strict left fold: the running total is forced at every step instead of
    -- piling up a chain of suspended additions the way the lazy 'Map.foldl'
    -- does.  The summation order, and therefore the result, is unchanged.
    totalCount = Map.foldl' (+) 0 m
-- | Collect, into an 'AccSet', the second @|@-separated field of every row's
-- @fHeader@ value.
--
-- The field is taken by position with 'Safe.at', which is partial: a row
-- whose @fHeader@ value contains no @|@ has no field at index 1 and raises
-- an error when the set is built.
getAccSet :: [DuplicationRow] -> AccSet
getAccSet rows = AccSet (Set.fromList (accession <$> rows))
  where
    -- Field at index 1 (zero-based) of the row's split @fHeader@ value.
    accession row = headerFields `Safe.at` 1
      where
        headerFields =
          T.splitOn "|" (findWithError "fHeader" (unDuplicationRow row))
-- | Decode delimited text with a header row into the header and the list of
-- records, each wrapped in 'DuplicationRow'.
--
-- Calls 'error' with the decoder's message if the input cannot be decoded.
parseDuplications :: B.ByteString -> (Header, [DuplicationRow])
parseDuplications contents = (header, rows)
  where
    (header, decoded) = either error id (decodeByName contents)
    rows = DuplicationRow <$> V.toList decoded
-- | Break the input into lines and split each line on tab characters,
-- wrapping every resulting field list in a 'BamRow'.
parseBAM :: T.Text -> [BamRow]
parseBAM contents =
  [ BamRow (T.splitOn "\t" line) | line <- T.lines contents ]