module Bio.RealWorld.UCSC
( UCSCGene(..)
, getTSS
, getJunction
, readUCSCGenes
, readUCSCGenes'
) where
import qualified Data.ByteString.Char8 as B
import Conduit
import qualified Data.Vector.Unboxed as U
import System.IO
import Bio.RealWorld.ID
import Bio.Utils.Misc (readInt)
-- | One gene record, as parsed from a single tab-separated line by
-- 'readGeneFromLine'. Column numbers below are 1-based positions in that line.
data UCSCGene = UCSCGene
    { -- | Column 1, kept verbatim.
      _geneName :: !B.ByteString
      -- | Column 2, kept verbatim.
    , _chrom :: !B.ByteString
      -- | 'True' when column 3 is exactly @+@, 'False' for anything else.
    , _strand :: !Bool
      -- | Columns 4 and 5 parsed as integers (presumably transcript start
      -- and end -- confirm against the table schema).
    , _transcript :: !(Int, Int)
      -- | Columns 6 and 7 parsed as integers (presumably coding-region start
      -- and end -- confirm against the table schema).
    , _cds :: !(Int, Int)
      -- | Values of column 9 paired position-wise with values of column 10.
    , _exons :: !(U.Vector (Int, Int))
      -- | Each column-10 value paired with the next column-9 value, i.e. the
      -- gap between one exon pair and the following one.
    , _introns :: !(U.Vector (Int, Int))
      -- | Column 11 wrapped in 'UniprotID'.
    , _proteinId :: !UniprotID
      -- | Column 12 wrapped in 'UCSCID'.
    , _alignId :: !UCSCID
    } deriving (Show)
-- | Pair a gene's chromosome with the first coordinate of its transcript
-- interval.
--
-- NOTE(review): the first coordinate is returned whatever '_strand' says;
-- confirm this is what callers expect for genes whose strand is 'False'.
getTSS :: UCSCGene -> (B.ByteString, Int)
getTSS g = (_chrom g, txStart)
  where
    (txStart, _) = _transcript g
-- | Pair a gene's chromosome with the first component of every entry in
-- '_introns' (the left edge of each intron pair).
getJunction :: UCSCGene -> (B.ByteString, U.Vector Int)
getJunction g = (_chrom g, intronStarts)
  where
    intronStarts = U.map fst (_introns g)
-- | Stream the gene records stored in a tab-separated gene table file.
--
-- The first line of the file is treated as a header and discarded. Every
-- following line is parsed with 'readGeneFromLine' and yielded downstream.
-- An empty file yields an empty stream.
--
-- The file handle is closed when end of file is reached.
-- NOTE(review): if the consumer stops pulling before end of file, or a read
-- throws, the handle is not closed by this function.
readUCSCGenes :: FilePath -> Source IO UCSCGene
readUCSCGenes fl = do
    handle <- liftIO $ openFile fl ReadMode
    skipHeader handle
    loop handle
  where
    -- Drop the header line only if there is one: 'B.hGetLine' raises an EOF
    -- error on an empty file, which would abort before the handle is closed.
    skipHeader h = liftIO $ do
        eof <- hIsEOF h
        if eof
            then return ()
            else B.hGetLine h >> return ()

    -- Yield one parsed record per remaining line; close the handle at EOF.
    loop h = do
        eof <- liftIO $ hIsEOF h
        if eof
            then liftIO $ hClose h
            else do
                l <- liftIO $ B.hGetLine h
                yield $ readGeneFromLine l
                loop h
-- | Read every gene record of a gene table file into a list.
--
-- Runs 'readUCSCGenes' on the file to completion and collects everything it
-- yields with 'sinkList'.
readUCSCGenes' :: FilePath -> IO [UCSCGene]
readUCSCGenes' fl = genes $$ sinkList
  where
    genes = readUCSCGenes fl
-- | Parse one tab-separated line of a gene table into a 'UCSCGene'.
--
-- The line must have exactly 12 tab-separated columns:
--
-- * columns 1 and 2 are stored verbatim as name and chromosome;
-- * column 3 is the strand, where @+@ maps to 'True' and anything else to 'False';
-- * columns 4-5 and 6-7 are integer pairs for the transcript and CDS intervals;
-- * column 8 is ignored;
-- * columns 9 and 10 are comma-separated integer lists (exon starts and ends);
-- * columns 11 and 12 are wrapped in 'UniprotID' and 'UCSCID'.
--
-- Introns are derived by pairing each exon end with the next exon start.
--
-- Calls 'error' with the offending line if the column count is not 12.
readGeneFromLine :: B.ByteString -> UCSCGene
readGeneFromLine xs = case B.split '\t' xs of
    [name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, _, starts, ends, protein, align] ->
        let exonStarts = parseCoords starts
            exonEnds = parseCoords ends
            exons = U.fromList $ zip exonStarts exonEnds
            -- 'drop 1' rather than 'tail': a gene with no exon starts simply
            -- has no introns instead of crashing.
            introns = U.fromList $ zip exonEnds $ drop 1 exonStarts
        in UCSCGene name chrom (strand == "+")
               (readInt txStart, readInt txEnd)
               (readInt cdsStart, readInt cdsEnd)
               exons introns (UniprotID protein) (UCSCID align)
    fields -> error $
        "readGeneFromLine: expected 12 tab-separated fields but found "
        ++ show (length fields) ++ " in line: " ++ B.unpack xs
  where
    -- Comma-separated integers. Empty pieces are skipped, so the list parses
    -- the same with or without a trailing comma, and an empty column gives [].
    parseCoords = map readInt . filter (not . B.null) . B.split ','