module Lambdabot.Url (
getHtmlPage,
getHeader,
rawPageTitle,
urlPageTitle,
urlTitlePrompt,
runWebReq
) where
import Codec.Binary.UTF8.String
import Control.Monad.Reader
import Data.Char (toLower)
import Data.List
import Data.Maybe
import Lambdabot.MiniHTTP
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
-- | Prefix prepended to every page title before it is reported.
urlTitlePrompt :: String
urlTitlePrompt = "Title: "
-- | Titles longer than this many characters are truncated
--   (with a trailing " ...") before display.
maxTitleLength :: Int
maxTitleLength = 80
type WebReq a = ReaderT Proxy IO a
-- | Run a web request with the given proxy configuration.
runWebReq :: WebReq a -> Proxy -> IO a
runWebReq = runReaderT
-- | Fetch the title of a page and format it for display: prefixed
--   with 'urlTitlePrompt' and truncated to 'maxTitleLength'
--   characters.  Returns Nothing when no title could be determined.
urlPageTitle :: String -> WebReq (Maybe String)
urlPageTitle url = fmap prettyTitle <$> rawPageTitle url
  where
    prettyTitle = (urlTitlePrompt ++) . limitLength
    -- Truncate overlong titles, marking the cut with " ...".
    limitLength s
        | length s > maxTitleLength = take maxTitleLength s ++ " ..."
        | otherwise                 = s
-- | Fetch the raw (unformatted) title of a page, if any.  Fragment
--   identifiers ('#...') are stripped before the request.  PDF
--   documents are retried via Google's cache, which serves an HTML
--   view whose title can be extracted.  Returns Nothing for
--   unparsable URLs and unsupported content types.
rawPageTitle :: String -> WebReq (Maybe String)
rawPageTitle url
    | Just uri <- parseURI url' = do
        contents <- getHtmlPage uri
        case contentType contents of
            Just "text/html"       -> return (extractTitle contents)
            Just "application/pdf" -> rawPageTitle (googleCacheURL url)
            _                      -> return Nothing
    | otherwise = return Nothing
  where
    -- Request the URL without any fragment part.
    url' = takeWhile (/= '#') url
    googleCacheURL = (gURL ++) . escapeURIString (const False)
    gURL = "http://www.google.com/search?hl=en&q=cache:"
-- | Fetch the raw response lines for a URI, transparently following
--   HTTP 301/302 redirects.  Returns @[]@ after five redirects or on
--   any status other than 200/301/302.
getHtmlPage :: URI -> WebReq [String]
getHtmlPage u = getHtmlPage' u 5
  where
    -- The Int counts how many further redirects we will follow.
    getHtmlPage' :: URI -> Int -> WebReq [String]
    getHtmlPage' _ 0 = return []  -- redirect limit exhausted
    getHtmlPage' uri n = do
        contents <- getURIContents uri
        case responseStatus contents of
            -- BUG FIX: the counter was written "(n1)" (an undefined
            -- name); it must decrement, or redirects never terminate.
            301 -> getHtmlPage' (redirectedUrl contents) (n - 1)
            302 -> getHtmlPage' (redirectedUrl contents) (n - 1)
            200 -> return contents
            _   -> return []
      where
        -- Status code is the second word of the first response line,
        -- e.g. "HTTP/1.1 200 OK".  NOTE(review): still partial on a
        -- malformed or empty response; kept as-is to preserve behavior.
        responseStatus hdrs = (read . (!!1) . words . (!!0)) hdrs :: Int
        -- Target of a 3xx redirect, from the Location header.
        -- Relative locations are resolved against the current URI.
        redirectedUrl hdrs
            | Just loc <- getHeader "Location" hdrs =
                case parseURI loc of
                    Nothing   -> (fromJust . parseURI) (fullUrl loc)
                    Just uri' -> uri'
            | otherwise = error "No Location header found in 3xx response."
        -- Rebuild an absolute URL from a relative Location value.
        -- NOTE(review): drops any port number from the authority —
        -- confirm no redirect targets use non-default ports.
        fullUrl loc = let auth = fromJust (uriAuthority uri)
                      in uriScheme uri ++ "//" ++
                         uriRegName auth ++
                         loc
-- | Perform a single GET request for a URI (no redirect handling)
--   and return the raw response lines.
getURIContents :: URI -> WebReq [String]
getURIContents uri = do
    proxy <- ask
    liftIO (readNBytes 3048 proxy uri (request proxy) "")
  where
    -- Direct connection: origin-form request line plus Host header.
    request Nothing =
        [ "GET " ++ abs_path ++ " HTTP/1.1"
        , "host: " ++ host
        , "Connection: close"
        , ""
        ]
    -- Through a proxy: absolute-form request line, HTTP/1.0.
    request _ = ["GET " ++ show uri ++ " HTTP/1.0", ""]
    -- Request path, guaranteed to start with '/'.
    abs_path =
        let p = uriPath uri ++ uriQuery uri ++ uriFragment uri
        in case p of
               ('/':_) -> p
               _       -> '/' : p
    host = uriRegName (fromJust (uriAuthority uri))
-- | Pull the contents of the <title> element out of raw HTML lines.
--   Input is UTF-8 decoded before parsing and the result re-encoded;
--   whitespace inside the title is collapsed to single spaces.
extractTitle :: [String] -> Maybe String
extractTitle = content . titleTags . decodeString . unlines
  where
    -- Tags strictly between the opening and closing <title> tags.
    titleTags =
          takeWhile (not . tagCloseLit "title")
        . dropWhile (not . tagOpenLit "title" (const True))
        . canonicalizeTags
        . parseTags
    -- Collapse whitespace; an empty title yields Nothing.
    content ts =
        case unwords (words (innerText ts)) of
            "" -> Nothing
            t  -> Just (encodeString t)
-- | The media type of a response: the Content-Type header value with
--   any ";charset=..." parameters stripped.  Returns Nothing for an
--   empty response or when no Content-Type header is present.
--   (Previously the missing-header case called 'error' — with a
--   message naming the wrong function — crashing the bot on servers
--   that omit the header; callers already handle Nothing.)
contentType :: [String] -> Maybe String
contentType []       = Nothing
contentType contents = takeWhile (/= ';') <$> getHeader "Content-Type" contents
-- | Look up a header value in raw response lines.  The first line
--   (the status line) is skipped.  Header names are matched
--   case-insensitively, as RFC 7230 requires; leading spaces and the
--   trailing '\r' are stripped from the value.
--   (Previously matching was case-sensitive and exactly one character
--   after the colon was dropped, corrupting "Name:value" headers.)
getHeader :: String -> [String] -> Maybe String
getHeader _   []     = Nothing
getHeader hdr (_:hs) = lookup (lower hdr) (concatMap mkassoc hs)
  where
    lower = map toLower
    -- Split "Name: value\r" into ("name", "value"); lines without a
    -- colon contribute nothing.
    mkassoc s = case break (== ':') s of
        (name, ':':rest) -> [(lower name, trim rest)]
        _                -> []
    trim = takeWhile (/= '\r') . dropWhile (== ' ')