module Bio.Sequence.GeneOntology
(
GoTerm(..), GoDef(..)
, GoHierarchy, readObo
, readTerms
, Annotation(..), UniProtAcc, GoClass(..), EvidenceCode(..), readGOA, isCurated
, decomment
) where
import Data.ByteString.Lazy.Char8 (ByteString,pack,unpack,copy)
import qualified Data.ByteString.Lazy.Char8 as B
readObo :: FilePath -> IO GoHierarchy
readObo f = B.readFile f >>=
return . mkGoHier . decomment
readGOA :: FilePath -> IO [Annotation]
readGOA f = B.readFile f >>=
return . map mkAnn . decomment
readTerms :: FilePath -> IO [GoDef]
readTerms f = B.readFile f >>=
return . map mkGoDef . decomment
decomment :: ByteString -> [ByteString]
decomment = filter (\l -> not (B.null l) && B.head l /= '!') . B.lines
type GoHierarchy = [(GoDef,[GoTerm])]
mkGoHier :: [ByteString] -> [(GoDef,[GoTerm])]
mkGoHier ls = go $ dropWhile (not . termStart) ls
where termStart = (== B.pack "[Term]")
go [] = []
go (_:zs) = let (this,rest) = span (not . B.isPrefixOf (B.pack "[")) zs in
if null this then
if not (null rest) then error "Parse failure in mkGoHier/go"
else []
else (mk1 $ map ($ this) [getId, getName, getNamespace, getIsA]) : mkGoHier rest
mk1 xs@[i,n,ns,isa]
| or (map null [i,n,ns]) = error ("Failed to parse Go Term (missing field in entry):\n"
++unlines (map unpack $ concat xs))
| length i /= 1 || length n /= 1 || length ns /= 1
= error ("Failed to parse Go Term (incorrect field multiplicity):\n"
++unlines (map unpack $ concat xs))
| otherwise = (GoDef (getGo $ head i) (head n) (readNS $ head ns), map getGo isa)
mk1 _ = error "This shouldn't happen!"
getId = map ((!!1) . B.words) . filter (B.isPrefixOf (pack "id:"))
getName = map (B.unwords. tail . B.words) . filter (B.isPrefixOf (pack "name:"))
getNamespace = map ((!!1) . B.words) . filter (B.isPrefixOf (pack "namespace:"))
getIsA = map ((!!1) . B.words) . filter (B.isPrefixOf (pack "is_a:"))
readNS xs = case unpack xs of "biological_process" -> Proc
"molecular_function" -> Func
"cellular_component" -> Comp
_ -> error ("Unknown function: "++unpack xs)
newtype GoTerm = GO Int deriving (Eq,Ord)
data GoClass = Func | Proc | Comp
instance Read GoTerm where
readsPrec n ('G':'O':':':xs) = map (\(i,s)-> (GO i,s)) (readsPrec n xs)
readsPrec _ e = error ("couldn't parse GO term: "++show e)
instance Show GoTerm where show (GO x) = "GO:"++show x
getGo :: ByteString -> GoTerm
getGo bs = GO $ fst $ maybe e id (B.readInt $ B.drop 3 bs)
where e = error ("Unable to parse GO term"++unpack bs)
data GoDef = GoDef !GoTerm !ByteString !GoClass deriving (Show)
mkGoDef :: ByteString -> GoDef
mkGoDef = pick . B.split '\t'
where pick [go,desc,cls] = GoDef (read $ unpack go) (copy desc) (read $ unpack cls)
pick _xs = error ("Couldn't decipher GO definition from: "++show _xs)
instance Read GoClass where
readsPrec _ ('F':xs) = [(Func,xs)]
readsPrec _ ('P':xs) = [(Proc,xs)]
readsPrec _ ('C':xs) = [(Comp,xs)]
readsPrec _ _ = []
instance Show GoClass where
show Func = "F"
show Proc = "P"
show Comp = "C"
type UniProtAcc = ByteString
data Annotation = Ann !UniProtAcc !GoTerm !EvidenceCode deriving (Show)
mkAnn :: ByteString -> Annotation
mkAnn = pick . B.words
where pick (_db:up:rest) = pick' up $ findGo rest
pick _ = error "Internal error: mkAnn/pick"
pick' up' (go:_:ev:_) = Ann (copy up') (getGo go) (getEC ev)
pick' _ _ = error "Internal error: mkAnn/pick'"
findGo = dropWhile (not . B.isPrefixOf (pack "GO:"))
data EvidenceCode = IC
| IDA
| IEA
| IEP
| IGC
| IGI
| IMP
| IPI
| ISS
| NAS
| ND
| RCA
| TAS
| NR
deriving (Read,Show,Eq)
getEC :: ByteString -> EvidenceCode
getEC s = case B.uncons s of
Just ('I',s') -> case B.uncons s' of
Just ('C',_) -> IC
Just ('D',_) -> IDA
Just ('E',s'') -> case B.head s'' of 'A' -> IEA
'P' -> IEP
_ -> e 1
Just ('G',s'') -> case B.head s'' of 'C' -> IGC
'I' -> IGI
_ -> e 2
Just ('M',_) -> IMP
Just ('P',_) -> IPI
Just ('S',_) -> ISS
_ -> e 3
Just ('N',s') -> case B.head s' of 'A' -> NAS
'D' -> ND
'R' -> NR
_ -> e 4
Just ('R',_) -> RCA
Just ('T',_) -> TAS
_ -> e 5
where e :: Int -> a
e n = error ("Illegal GO evidence code ("++show n++"): "++unpack s)
isCurated :: EvidenceCode -> Bool
isCurated = not . (`elem` [ND,IEA])