module Bio.EntrezHTTP (
module Bio.EntrezHTTPData,
entrezHTTP,
retrieveGeneSymbolFasta,
fetchNucleotideString,
retrieveElementsEntrez,
portionListElements,
readEntrezTaxonSet,
readEntrezSimpleTaxons,
readEntrezParentIds,
readEntrezSummaries,
readEntrezSearch,
buildRegistration,
maybeBuildRegistration,
setStrand,
convertCoordinatesToStrand
) where
import Network.HTTP.Conduit
import qualified Data.ByteString.Lazy.Char8 as L8
import Text.XML.HXT.Core
import Network
import Data.Maybe
import Bio.EntrezHTTPData
import Bio.TaxonomyData --(Rank,SimpleTaxon)
import qualified Data.ByteString.Char8 as B
import Network.HTTP.Base
startSession :: String -> String -> String -> IO String
startSession program' database' query' = do
requestXml <- withSocketsDo
$ sendQuery program' database' query'
let requestXMLString = L8.unpack requestXml
return requestXMLString
sendQuery :: String -> String -> String -> IO L8.ByteString
sendQuery program' database' query' = simpleHttp ("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"++ program' ++ ".fcgi?" ++ "db=" ++ database' ++ "&" ++ query')
entrezHTTP :: EntrezHTTPQuery -> IO String
entrezHTTP (EntrezHTTPQuery program' database' query') = do
let defaultProgram = "summary"
let defaultDatabase = "nucleotide"
let selectedProgram = fromMaybe defaultProgram program'
let selectedDatabase = fromMaybe defaultDatabase database'
startSession selectedProgram selectedDatabase query'
retrieveElementsEntrez :: [a] -> ([a] -> IO b) -> IO [b]
retrieveElementsEntrez listElements retrievalfunction = do
let splits = portionListElements listElements 20
mapM retrievalfunction splits
portionListElements :: [a] -> Int -> [[a]]
portionListElements listElements elementsPerSplit
| not (null listElements) = filter (not . null) result
| otherwise = []
where (heads,xs) = splitAt elementsPerSplit listElements
result = heads:portionListElements xs elementsPerSplit
readEntrezTaxonSet :: String -> [Taxon]
readEntrezTaxonSet = runLA (xreadDoc >>> parseEntrezTaxonSet)
parseEntrezTaxonSet :: ArrowXml a => a XmlTree Taxon
parseEntrezTaxonSet = atTag "TaxaSet" >>> getChildren >>>
proc entrezTaxons -> do
_taxons <- parseEntrezTaxon -< entrezTaxons
returnA -< _taxons
parseEntrezTaxon :: ArrowXml a => a XmlTree Taxon
parseEntrezTaxon = (isElem >>> hasName "Taxon") >>>
proc entrezTaxon -> do
_taxonomyId <- getChildren >>> (isElem >>> hasName "TaxId") >>> getChildren >>> getText -< entrezTaxon
_scientificName <- getChildren >>> (isElem >>> hasName "ScientificName") >>> getChildren >>> getText -< entrezTaxon
_parentTaxonomyId <- getChildren >>> (isElem >>> hasName "ParentTaxId") >>> getChildren >>> getText -< entrezTaxon
_rank <- getChildren >>> (isElem >>> hasName "Rank") >>> getChildren >>> getText -< entrezTaxon
_divison <- getChildren >>> (isElem >>> hasName "Division") >>> getChildren >>> getText -< entrezTaxon
_geneticCode <- parseTaxonGeneticCode -< entrezTaxon
_mitoGeneticCode <- parseTaxonMitoGeneticCode -< entrezTaxon
_lineage <- getChildren >>> atTag "Lineage" >>> getChildren >>> getText -< entrezTaxon
_lineageEx <- parseTaxonLineageEx -< entrezTaxon
_createDate <- getChildren >>> (isElem >>> hasName "CreateDate") >>> getChildren >>> getText -< entrezTaxon
_updateDate <- getChildren >>> (isElem >>> hasName "UpdateDate") >>> getChildren >>> getText -< entrezTaxon
_pubDate <- getChildren >>> (isElem >>> hasName "PubDate") >>> getChildren >>> getText -< entrezTaxon
returnA -< Taxon {
taxonTaxId = read _taxonomyId :: Int,
taxonScientificName = _scientificName,
taxonParentTaxId = read _parentTaxonomyId :: Int,
taxonRank = read _rank :: Rank,
division = _divison,
geneticCode = _geneticCode,
mitoGeneticCode = _mitoGeneticCode,
lineage = _lineage,
lineageEx = _lineageEx,
createDate = _createDate,
updateDate = _updateDate,
pubDate = _pubDate
}
parseTaxonGeneticCode :: ArrowXml a => a XmlTree TaxGenCode
parseTaxonGeneticCode = getChildren >>> atTag "GeneticCode" >>>
proc geneticcode -> do
_gcId <- atTag "GCId" >>> getChildren >>> getText -< geneticcode
_gcName <- atTag "GCName" >>> getChildren >>> getText -< geneticcode
returnA -< TaxGenCode {
geneticCodeId = read _gcId :: Int,
abbreviation = Nothing,
geneCodeName = _gcName,
cde = [],
starts = []
}
parseTaxonMitoGeneticCode :: ArrowXml a => a XmlTree TaxGenCode
parseTaxonMitoGeneticCode = getChildren >>> atTag "MitoGeneticCode" >>>
proc mitogeneticcode -> do
_mgcId <- atTag "MGCId" >>> getChildren >>> getText -< mitogeneticcode
_mgcName <- atTag "MGCName" >>> getChildren >>> getText -< mitogeneticcode
returnA -< TaxGenCode {
geneticCodeId = read _mgcId :: Int,
abbreviation = Nothing,
geneCodeName = _mgcName,
cde = [],
starts = []
}
parseTaxonLineageEx :: ArrowXml a => a XmlTree [LineageTaxon]
parseTaxonLineageEx = getChildren >>> atTag "LineageEx" >>>
proc taxonLineageEx -> do
_lineageEx <- listA parseLineageTaxon -< taxonLineageEx
returnA -< _lineageEx
parseLineageTaxon :: ArrowXml a => a XmlTree LineageTaxon
parseLineageTaxon = getChildren >>> atTag "Taxon" >>>
proc lineageTaxon -> do
_lineageTaxId <- atTag "TaxId" >>> getChildren >>> getText -< lineageTaxon
_lineageScienticName <- atTag "ScientificName" >>> getChildren >>> getText -< lineageTaxon
_lineageRank <- atTag "Rank" >>> getChildren >>> getText -< lineageTaxon
returnA -< LineageTaxon {
lineageTaxId = read _lineageTaxId :: Int,
lineageScienticName = _lineageScienticName,
lineageRank = read _lineageRank :: Rank
}
readEntrezSimpleTaxons :: String -> [SimpleTaxon]
readEntrezSimpleTaxons = runLA (xreadDoc >>> parseEntrezSimpleTaxons)
parseEntrezSimpleTaxons :: ArrowXml a => a XmlTree SimpleTaxon
parseEntrezSimpleTaxons = getChildren >>> atTag "Taxon" >>>
proc entrezSimpleTaxon -> do
simple_TaxId <- atTag "TaxId" >>> getChildren >>> getText -< entrezSimpleTaxon
simple_ScientificName <- atTag "ScientificName" >>> getChildren >>> getText -< entrezSimpleTaxon
simple_ParentTaxId <- atTag "ParentTaxId" >>> getChildren >>> getText -< entrezSimpleTaxon
simple_Rank <- atTag "Rank" >>> getChildren >>> getText -< entrezSimpleTaxon
returnA -< SimpleTaxon {
simpleTaxId = read simple_TaxId :: Int,
simpleScientificName = B.pack simple_ScientificName,
simpleParentTaxId = read simple_ParentTaxId :: Int,
simpleRank = read simple_Rank :: Rank
}
readEntrezParentIds :: String -> [Int]
readEntrezParentIds = runLA (xreadDoc >>> parseEntrezParentTaxIds)
parseEntrezParentTaxIds :: ArrowXml a => a XmlTree Int
parseEntrezParentTaxIds = getChildren >>> atTag "Taxon" >>>
proc entrezSimpleTaxon -> do
simple_ParentTaxId <- atTag "ParentTaxId" >>> getChildren >>> getText -< entrezSimpleTaxon
returnA -< read simple_ParentTaxId :: Int
readEntrezSummaries :: String -> [EntrezSummary]
readEntrezSummaries = runLA (xreadDoc >>> parseEntrezSummaries)
parseEntrezSummaries :: ArrowXml a => a XmlTree EntrezSummary
parseEntrezSummaries = atTag "eSummaryResult" >>>
proc entrezSummary -> do
document_Summaries <- listA parseEntrezDocSums -< entrezSummary
returnA -< EntrezSummary {
documentSummaries = document_Summaries
}
parseEntrezDocSums :: ArrowXml a => a XmlTree EntrezDocSum
parseEntrezDocSums = atTag "DocSum" >>>
proc entrezDocSum -> do
summary_Id <- atTag "Id" >>> getChildren >>> getText -< entrezDocSum
summary_Items <- listA parseSummaryItems -< entrezDocSum
returnA -< EntrezDocSum {
summaryId = summary_Id,
summaryItems = summary_Items
}
parseSummaryItems :: ArrowXml a => a XmlTree SummaryItem
parseSummaryItems = atTag "Item" >>>
proc summaryItem -> do
item_Name <- getAttrValue "Name" -< summaryItem
item_Type <- getAttrValue "Type" -< summaryItem
item_Content <- getText <<< getChildren -< summaryItem
returnA -< SummaryItem {
itemName = item_Name,
itemType = item_Type,
itemContent = item_Content
}
readEntrezSearch :: String -> [EntrezSearch]
readEntrezSearch = runLA (xreadDoc >>> parseEntrezSearch)
parseEntrezSearch :: ArrowXml a => a XmlTree EntrezSearch
parseEntrezSearch = atTag "eSearchResult" >>>
proc entrezSearch -> do
_count <- atTag "Count" >>> getChildren >>> getText -< entrezSearch
_retMax <- atTag "RetMax" >>> getChildren >>> getText -< entrezSearch
_retStart <- atTag "RetStart" >>> getChildren >>> getText -< entrezSearch
_searchIds <- atTag "IdList" >>> listA parseSearchId -< entrezSearch
_translationStack <- listA parseTranslationStack -< entrezSearch
_queryTranslation <- atTag "QueryTranslation" >>> getChildren >>> getText -< entrezSearch
returnA -< EntrezSearch {
count = readInt _count,
retMax = readInt _retMax,
retStart = readInt _retStart,
searchIds = _searchIds,
translationStack = _translationStack,
queryTranslation = _queryTranslation
}
parseSearchId :: ArrowXml a => a XmlTree Int
parseSearchId = atTag "Id" >>>
proc entrezSearchId -> do
searchId <- getChildren >>> getText -< entrezSearchId
returnA -< (readInt searchId)
parseTranslationStack :: ArrowXml a => a XmlTree TranslationStack
parseTranslationStack = atTag "TranslationStack" >>>
proc entrezTranslationStack -> do
_termSets <- listA parseTermSet -< entrezTranslationStack
_operation <- atTag "OP" >>> getChildren >>> getText -< entrezTranslationStack
returnA -< TranslationStack {
termSets = _termSets,
operation = _operation
}
parseTermSet :: ArrowXml a => a XmlTree TermSet
parseTermSet = atTag "TermSet" >>>
proc entrezTermSet -> do
_term <- atTag "Term" >>> getChildren >>> getText -< entrezTermSet
_field <- atTag "Field" >>> getChildren >>> getText -< entrezTermSet
_termCount <- atTag "Count" >>> getChildren >>> getText -< entrezTermSet
_explode <- atTag "Explode" >>> getChildren >>> getText -< entrezTermSet
returnA -< TermSet {
term = _term,
field = _field,
termCount = readInt _termCount,
explode = _explode
}
readEntrezGeneSummaries :: String -> [EntrezGeneSummary]
readEntrezGeneSummaries = runLA (xreadDoc >>> parseEntrezGeneSummaries)
parseEntrezGeneSummaries :: ArrowXml a => a XmlTree EntrezGeneSummary
parseEntrezGeneSummaries = atTag "eSummaryResult" >>> getChildren >>> atTag "DocumentSummarySet" >>>
proc entrezSummary -> do
_geneSummaries <- listA parseEntrezGeneDocSums -< entrezSummary
returnA -< EntrezGeneSummary {
geneSummaries = _geneSummaries
}
parseEntrezGeneDocSums :: ArrowXml a => a XmlTree EntrezGeneDocSummary
parseEntrezGeneDocSums = atTag "DocumentSummary" >>>
proc entrezDocSum -> do
_geneId <- atTag "Name" >>> getChildren >>> getText -< entrezDocSum
_geneName <- atTag "Description" >>> getChildren >>> getText -< entrezDocSum
_geneStatus <- atTag "Status" >>> getChildren >>> getText -< entrezDocSum
_geneCurrentID <- atTag "CurrentID" >>> getChildren >>> getText -< entrezDocSum
_geneGeneticSource <- atTag "GeneticSource" >>> getChildren >>> getText -< entrezDocSum
_geneOtherAliases <- atTag "OtherAliases" >>> getChildren >>> getText -< entrezDocSum
_geneGenomicInfo <- parseEntrezGenomicInfo -< entrezDocSum
returnA -< EntrezGeneDocSummary {
geneId = _geneId,
geneName = _geneName,
geneStatus = _geneStatus,
geneCurrentID = _geneCurrentID,
geneGeneticSource = _geneGeneticSource,
geneOtherAliases = _geneOtherAliases,
geneGenomicInfo = _geneGenomicInfo
}
parseEntrezGenomicInfo :: ArrowXml a => a XmlTree EntrezGenomicInfo
parseEntrezGenomicInfo = atTag "GenomicInfo" >>> getChildren >>> atTag "GenomicInfoType" >>>
proc entrezGenomicInfo -> do
_chrAccVer <- atTag "ChrAccVer" >>> getChildren >>> getText -< entrezGenomicInfo
_chrStart <- atTag "ChrStart" >>> getChildren >>> getText -< entrezGenomicInfo
_chrStop <- atTag "ChrStop" >>> getChildren >>> getText -< entrezGenomicInfo
_exonCount <- atTag "ExonCount" >>> getChildren >>> getText -< entrezGenomicInfo
returnA -< EntrezGenomicInfo {
chrAccVer = _chrAccVer,
chrStart = readInt _chrStart,
chrStop = readInt _chrStop,
exonCount = readInt _exonCount
}
atTag :: ArrowXml a => String -> a XmlTree XmlTree
atTag tag = deep (isElem >>> hasName tag)
retrieveGeneSymbolFasta :: String -> String -> Maybe (String,String) -> IO String
retrieveGeneSymbolFasta genesymbol accession registrationInfo = do
let query1 = EntrezHTTPQuery (Just "esearch") (Just "gene") ("term=" ++ genesymbol ++ urlEncode ("[Gene Name] AND " ++ accession ++ "[Nucleotide Accession]"))
uniqueidresponse <- entrezHTTP query1
let uniqueid = head (searchIds (head (readEntrezSearch uniqueidresponse)))
let query2 = EntrezHTTPQuery (Just "esummary") (Just "gene") ("id=" ++ show uniqueid)
summaryresponse <- entrezHTTP query2
let parsedSummary = head (geneSummaries (head (readEntrezGeneSummaries summaryresponse)))
let accessionVersion = chrAccVer (geneGenomicInfo parsedSummary)
let seqStart = chrStart (geneGenomicInfo parsedSummary)
let seqStop = chrStop (geneGenomicInfo parsedSummary)
let strand = convertCoordinatesToStrand seqStart seqStop
fetchNucleotideString accessionVersion seqStart seqStop strand registrationInfo
fetchNucleotideString :: String -> Int -> Int -> Int -> Maybe (String,String) -> IO String
fetchNucleotideString nucleotideId seqStart seqStop strand maybeRegistrationInfo = do
let registrationInfo = maybeBuildRegistration maybeRegistrationInfo
let program' = Just "efetch"
let database' = Just "nucleotide"
let query' = "id=" ++ nucleotideId ++ "&seq_start=" ++ show seqStart ++ "&seq_stop=" ++ show seqStop ++ "&rettype=fasta" ++ "&strand=" ++ show strand ++ registrationInfo
let entrezQuery = EntrezHTTPQuery program' database' query'
entrezHTTP entrezQuery
convertCoordinatesToStrand :: Int -> Int -> Int
convertCoordinatesToStrand start end
| start <= end = 1
| otherwise = 2
setStrand :: String -> Int
setStrand strandString
| strandString == "+" = 1
| strandString == "-" = 2
| strandString == "forward" = 1
| strandString == "reverse" = 2
| strandString == "Forward" = 1
| strandString == "Reverse" = 2
| strandString == "plus" = 1
| strandString == "false" = 2
| strandString == "Plus" = 1
| strandString == "False" = 2
| otherwise = 1
readInt :: String -> Int
readInt = read
maybeBuildRegistration :: Maybe (String,String) -> String
maybeBuildRegistration maybeRegistration
| isJust maybeRegistration = buildRegistration toolname developeremail
| otherwise = ""
where registration = fromJust maybeRegistration
toolname = fst registration
developeremail = snd registration
buildRegistration :: String -> String -> String
buildRegistration toolname developeremail = "&tool=" ++ toolname ++ "&email=" ++ developeremail