src/Text/HTML/Scalpel/Internal/Scrape/URL.hs

{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
    URL
,   scrapeURL
,   scrapeURLWithOpts
) where

import Text.HTML.Scalpel.Internal.Scrape

import Control.Applicative

import qualified Data.ByteString as BS
import qualified Network.Curl as Curl
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup


-- | A URL, represented as a plain 'String'. This is only an alias, so any
-- 'String' is accepted wherever a 'URL' is expected.
type URL = String

-- | Download the contents of the given 'URL' and run a 'Scraper' over them.
--
-- This is 'scrapeURLWithOpts' called with a single curl option,
-- @CurlFollowLocation True@; the result is whatever that call produces.
scrapeURL :: TagSoup.StringLike str => URL -> Scraper str a -> IO (Maybe a)
scrapeURL url scraper = scrapeURLWithOpts defaultOptions url scraper
    where
        -- The curl options used when the caller does not supply any.
        defaultOptions = [Curl.CurlFollowLocation True]

-- | The 'scrapeURLWithOpts' function takes a list of curl options, downloads
-- the contents of the given URL, and executes a 'Scraper' on it.
--
-- The result is 'Nothing' when the download yields no body (see
-- 'openURIWithOpts') or when running the scraper over the parsed tags yields
-- 'Nothing'; otherwise it is the scraper's result wrapped in 'Just'.
scrapeURLWithOpts :: TagSoup.StringLike str
                  => [Curl.CurlOption] -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithOpts options url scraper = do
    maybeBytes <- openURIWithOpts url options
    return (maybeBytes >>= scrapeBytes)
    where
        -- Convert the downloaded bytes to the scraper's string type, parse
        -- them into tags and run the scraper over those tags.
        -- NOTE(review): 'TagSoup.castString' presumably performs no charset
        -- decoding of the response body -- confirm for non-ASCII pages.
        scrapeBytes bytes =
            scrape scraper (TagSoup.parseTags (TagSoup.castString bytes))

-- | Fetch the given 'URL' with the supplied curl options via
-- 'curlGetResponse_'.
--
-- Returns 'Just' the response body when the response's curl code is
-- 'Curl.CurlOK', and 'Nothing' for any other curl code. Only the curl code is
-- inspected by this function.
openURIWithOpts :: URL -> [Curl.CurlOption] -> IO (Maybe BS.ByteString)
openURIWithOpts url opts = bodyIfOK <$> curlGetResponse_ url opts
    where
        -- Keep the body only for a successful transfer.
        bodyIfOK response
            | Curl.respCurlCode response /= Curl.CurlOK = Nothing
            | otherwise = Just (Curl.respBody response)

-- | 'Curl.curlGetResponse_' with its response type fixed by this signature:
-- headers are returned as a list of @(String, String)@ pairs and the body as
-- a 'BS.ByteString'.
curlGetResponse_ :: URL
                 -> [Curl.CurlOption]
                 -> IO (Curl.CurlResponse_ [(String, String)] BS.ByteString)
curlGetResponse_ url opts = Curl.curlGetResponse_ url opts