{-# LANGUAGE PatternGuards #-} -- | URL Utility Functions module Lambdabot.Util.Browser ( urlPageTitle , browseLB ) where import Codec.Binary.UTF8.String import Control.Applicative import Control.Monad.Trans import Lambdabot.Config import Lambdabot.Config.Reference import Lambdabot.Monad import Lambdabot.Util (limitStr) import Network.Browser import Network.HTTP import Network.URI import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Data.Char (toLower) import Data.List (isPrefixOf) -- | Run a browser action with some standardized settings browseLB :: MonadLB m => BrowserAction conn a -> m a browseLB act = lb $ do proxy' <- getConfig proxy liftIO . browse $ do setOutHandler (const (return ())) setErrHandler (const (return ())) setAllowRedirects True setMaxRedirects (Just 5) setProxy proxy' act -- | Limit the maximum title length to prevent jokers from spamming -- the channel with specially crafted HTML pages. maxTitleLength :: Int maxTitleLength = 80 -- | Fetches a page title suitable for display. Ideally, other -- plugins should make use of this function if the result is to be -- displayed in an IRC channel because it ensures that a consistent -- look is used (and also lets the URL plugin effectively ignore -- contextual URLs that might be generated by another instance of -- lambdabot; the URL plugin matches on 'urlTitlePrompt'). urlPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String) urlPageTitle = fmap (fmap (limitStr maxTitleLength)) . rawPageTitle -- | Fetches a page title for the specified URL. This function should -- only be used by other plugins if and only if the result is not to -- be displayed in an IRC channel. Instead, use 'urlPageTitle'. rawPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String) rawPageTitle url = checkHTTPS $ do (_, result) <- request (getRequest (takeWhile (/='#') url)) case rspCode result of (2,0,0) -> do case takeWhile (/= ';') <$> lookupHeader HdrContentType (rspHeaders result) of Just "text/html" -> return $ extractTitle (rspBody result) Just "application/pdf" -> rawPageTitle (googleCacheURL url) _ -> return $ Nothing _ -> return Nothing where googleCacheURL = (gURL++) . escapeURIString (const False) gURL = "http://www.google.com/search?hl=en&q=cache:" checkHTTPS act | "https:" `isPrefixOf` map toLower url = return Nothing | otherwise = act -- | Given a server response (list of Strings), return the text in -- between the title HTML element, only if it is text/html content. -- Now supports all(?) HTML entities thanks to TagSoup. extractTitle :: String -> Maybe String extractTitle = content . tags . decodeString where tags = closing . opening . canonicalizeTags . parseTags opening = dropWhile (not . tagOpenLit "title" (const True)) closing = takeWhile (not . tagCloseLit "title") content = maybeText . format . innerText format = unwords . words maybeText [] = Nothing maybeText t = Just (encodeString t)