----------------------------------------------------------------------------- -- | -- Module : Extract -- Copyright : (C) Peter Robinson 2010-2013 -- License : GPL-2 -- -- Maintainer : Peter Robinson -- Stability : experimental -- Portability : portable -- -- Functions for retrieving data from the mathematics genealogy project. -- ----------------------------------------------------------------------------- module Extract where import Network.HTTP import Text.HTML.TagSoup import Data.Char(isSpace) import Data.Text.Lazy(Text) import Data.Text.Lazy.Encoding import Control.Exception import qualified Data.Text.Lazy as T import qualified Data.ByteString.Lazy.Char8 as B import Control.Applicative import Safe(headMay,tailMay) import Entry openURL :: String -> IO Text openURL x = return . T.pack =<< getResponseBody =<< simpleHTTP (getRequest x) getTags :: Text -> IO [Tag Text] getTags url = parseTags <$> openURL (T.unpack url) parseEntry :: [Tag Text] -> Maybe Entry parseEntry tags = do sc <- scientist tags return $ Entry sc (graduationInfos tags) (advisors' tags) scientist :: [Tag Text] -> Maybe Text scientist tags = do let t = removeClutter <$> do r <- headMay (sections (~== "

[GraduationInfo] graduationInfos tags = let suffixes = sections (~== "") tags in map (graduationInfo tags) suffixes where graduationInfo _ suffix = let getTag i = notEmptyMaybeTagText $ removeClutter $ suffix !! i degree = getTag 1 univ = getTag 3 year = getTag 5 diss = getTag 18 in matchLen suffix degree univ year diss where matchLen s degree univ year diss | length s >= 19 = GraduationInfo degree univ year diss | length s >= 6 = GraduationInfo degree univ year Nothing | length s >= 4 = GraduationInfo degree univ Nothing Nothing | length s >= 2 = GraduationInfo degree Nothing Nothing Nothing | otherwise = throw $ AssertionFailed "Error - Could not parse downloaded data!" advisors' tags = let h1 = dropWhile (~/= "Advisor 1: ") tags h2 = dropWhile (~/= "Advisor 2: ") tags h3 = dropWhile (~/= "Advisor: ") tags parseAdvisor [] = [] parseAdvisor ts = [(advName1,advURL1)] where advName1 = maybeAdv $ notEmptyMaybeTagText $ removeClutter $ head $ tail $ tail ts advURL1 = maybeLink $ head $ tail ts in catMaybes' $ parseAdvisor h1 ++ parseAdvisor h2 ++ parseAdvisor h3 where catMaybes' :: [(Maybe a,Maybe b)] -> [(a,b)] catMaybes' [] = [] catMaybes' ((Just a,Just l):xs) = (a,l) : catMaybes' xs catMaybes' ((Just _,Nothing):xs)= catMaybes' xs catMaybes' ((Nothing,_):xs) = catMaybes' xs maybeAdv adv@(Just t) | T.unpack t == "Advisor: Unknown" = Nothing | otherwise = adv maybeLink t | isTagOpenName (T.pack "a") t = Just $ fromAttrib (T.pack "href") t | otherwise = Nothing notEmptyMaybeTagText :: Tag Text -> Maybe Text notEmptyMaybeTagText (TagText t) | all isSpace (T.unpack t) = Nothing | T.null t = Nothing | otherwise = Just t notEmptyMaybeTagText _ = Nothing -- | Removes preceding, trailing and multiple inter-word spaces: removeClutter :: Tag Text -> Tag Text removeClutter (TagText s) = TagText $ removeInterSpaces (T.strip s) where removeInterSpaces :: Text -> Text removeInterSpaces t = let (first,rest) = T.breakOn (T.pack " ") t in if T.null first then rest else if T.null rest then first else removeInterSpaces $ first `T.append` T.tail rest removeClutter t = t