{-# LANGUAGE PatternGuards #-}
module Lambdabot.Util.Browser
( urlPageTitle
, browseLB
) where
import Codec.Binary.UTF8.String
import Control.Applicative
import Control.Monad.Trans
import Lambdabot.Config
import Lambdabot.Config.Reference
import Lambdabot.Monad
import Lambdabot.Util (limitStr)
import Network.Browser
import Network.HTTP
import Network.URI
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Data.Char (toLower)
import Data.List (isPrefixOf)
browseLB :: MonadLB m => BrowserAction conn a -> m a
browseLB act = lb $ do
proxy' <- getConfig proxy
liftIO . browse $ do
setOutHandler (const (return ()))
setErrHandler (const (return ()))
setAllowRedirects True
setMaxRedirects (Just 5)
setProxy proxy'
act
maxTitleLength :: Int
maxTitleLength = 80
urlPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String)
urlPageTitle = fmap (fmap (limitStr maxTitleLength)) . rawPageTitle
rawPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String)
rawPageTitle url = checkHTTPS $ do
(_, result) <- request (getRequest (takeWhile (/='#') url))
case rspCode result of
(2,0,0) -> do
case takeWhile (/= ';') <$> lookupHeader HdrContentType (rspHeaders result) of
Just "text/html" -> return $ extractTitle (rspBody result)
Just "application/pdf" -> rawPageTitle (googleCacheURL url)
_ -> return $ Nothing
_ -> return Nothing
where googleCacheURL = (gURL++) . escapeURIString (const False)
gURL = "http://www.google.com/search?hl=en&q=cache:"
checkHTTPS act | "https:" `isPrefixOf` map toLower url = return Nothing
| otherwise = act
extractTitle :: String -> Maybe String
extractTitle = content . tags . decodeString where
tags = closing . opening . canonicalizeTags . parseTags
opening = dropWhile (not . tagOpenLit "title" (const True))
closing = takeWhile (not . tagCloseLit "title")
content = maybeText . format . innerText
format = unwords . words
maybeText [] = Nothing
maybeText t = Just (encodeString t)