{-# LANGUAGE OverloadedStrings, RecordWildCards #-} module Text.Feed.Crawl where import Text.Feed.Crawl.Common import Text.Feed.Crawl.DetectLink import Network.Connection (TLSSettings(..)) import Network.HTTP.Conduit import qualified Data.Conduit as C import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL import System.Environment import System.IO import Control.Monad.Trans.State.Lazy import Control.Monad.Trans.Class (lift) import Network.HTTP.Types.Status (statusCode) import Network.HTTP.Types.Header import Control.Monad.IO.Class import Data.Maybe (listToMaybe, catMaybes) import qualified Control.Exception as E import Network.URI import Control.Applicative import Data.Monoid import Control.Monad (join) -- | Spoof a Safari Browser because some sites don't even serve feeds to -- an http-conduit client userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A" -- |The main function crawlURL :: String -> IO CrawlResult crawlURL url = do request <- parseUrl url let request' = request { requestHeaders = ("User-Agent", userAgent):(requestHeaders request) } let settings = mkManagerSettings (TLSSettingsSimple True False False) Nothing (mkCrawlResult url =<< withRedirectTracking settings request') `E.catch` (\e -> return . Left . CrawlHttpError $ e) mkCrawlResult :: String -> (Response BL.ByteString, [Status]) -> IO CrawlResult mkCrawlResult firstUrl (resp, statuses) = do let ct = lookup hContentType . responseHeaders $ resp let loc = lookup hLocation . responseHeaders $ resp let urls = catMaybes [ sLocation | Status{..} <- statuses ] let lastUrl = head (urls ++ [B.pack firstUrl]) if isFeedContentType ct then return . Right $ CrawlSuccess { crawlLastContentType = ct , crawlLastUrl = lastUrl , crawlFeedContent = responseBody resp } else do links <- findFeedLinks (BL.unpack . responseBody $ resp) -- TODO this ignore any the tag in the html page return . Left . CrawlFoundFeedLinks (responseBody resp) . map (ensureLinkIsAbsolute lastUrl) $ links where ensureLinkIsAbsolute :: B.ByteString -> Link -> Link ensureLinkIsAbsolute linkBaseUrl x@Link{..} = let linkHref' = maybe linkHref id $ ensureAbsURL' (parseURI . B.unpack $ linkBaseUrl) linkHref in x { linkHref = linkHref' } -- |Returns a tuple of response and list of redirect locations. -- The first location is the last redirect. withRedirectTracking :: ManagerSettings -> Request -> IO (Response BL.ByteString, [Status]) withRedirectTracking settings request = do m <- newManager settings r <- runStateT (traceRedirects request m) [] return r traceRedirects :: Request -> Manager -> StateT [Status] IO (Response BL.ByteString) traceRedirects req' man = do let req = req' { checkStatus = \_ _ _ -> Nothing } res <- httpLbs req{redirectCount=0} man let req2 = getRedirectedRequest req (responseHeaders res) (responseCookieJar res) (statusCode (responseStatus res)) let location = lookup hLocation . responseHeaders $ res case (req2, location) of (Just req2', Just location') -> do let location' = ensureAbsURL req location let st = Status { sStatusCode = statusCode (responseStatus res) , sLocation = location' , sContentType = lookup hContentType . responseHeaders $ res } modify (st:) traceRedirects req2' man _ -> return res isFeed :: Status -> Bool isFeed Status{..} = isFeedContentType sContentType ensureAbsURL :: Request -> Maybe B.ByteString -> Maybe B.ByteString ensureAbsURL req url = let sUrl = B.unpack `fmap` url in case sUrl of Nothing -> Nothing Just sUrl' -> B.pack <$> ensureAbsURL' (baseURL req) sUrl' ensureAbsURL' :: Maybe URI -> String -> Maybe String ensureAbsURL' baseURI s = case parseURI s of Just _ -> Just s -- is a valid full URI Nothing -> -- url may be relative path, join to request info (\x -> uriToString id x "") <$> (nonStrictRelativeTo <$> (parseRelativeReference s) <*> baseURI) baseURL :: Request -> Maybe URI baseURL req = let protocol = if secure req then "https://" else "http://" in parseURI $ mconcat [ protocol , B.unpack . host $ req , case port req of 80 -> "" n -> ":" ++ show n ]