module Text.HTML.Scalpel.Internal.Scrape.URL (
URL
, Config (..)
, Decoder
, defaultDecoder
, utf8Decoder
, iso88591Decoder
, scrapeURL
, scrapeURLWithOpts
, scrapeURLWithConfig
) where
import Text.HTML.Scalpel.Internal.Scrape
import Control.Applicative ((<$>))
import Data.Char (toLower)
import Data.Default (def)
import Data.List (isInfixOf)
import Data.Maybe (listToMaybe)
import qualified Data.ByteString as BS
import qualified Data.Default as Default
import qualified Data.Text.Encoding as Text
import qualified Network.Curl as Curl
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
type URL = String
type CurlResponse = Curl.CurlResponse_ [(String, String)] BS.ByteString
type Decoder str = Curl.CurlResponse_ [(String, String)] BS.ByteString -> str
data Config str = Config {
curlOpts :: [Curl.CurlOption]
, decoder :: Decoder str
}
instance TagSoup.StringLike str => Default.Default (Config str) where
def = Config {
curlOpts = [Curl.CurlFollowLocation True]
, decoder = defaultDecoder
}
scrapeURL :: (Ord str, TagSoup.StringLike str)
=> URL -> Scraper str a -> IO (Maybe a)
scrapeURL = scrapeURLWithOpts [Curl.CurlFollowLocation True]
scrapeURLWithOpts :: (Ord str, TagSoup.StringLike str)
=> [Curl.CurlOption] -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithOpts options = scrapeURLWithConfig (def {curlOpts = options})
scrapeURLWithConfig :: (Ord str, TagSoup.StringLike str)
=> Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig config url scraper = do
maybeTags <- downloadAsTags (decoder config) url
return (maybeTags >>= scrape scraper)
where
downloadAsTags decoder url = do
maybeBytes <- openURIWithOpts url (curlOpts config)
return $ TagSoup.parseTags . decoder <$> maybeBytes
openURIWithOpts :: URL -> [Curl.CurlOption] -> IO (Maybe CurlResponse)
openURIWithOpts url opts = do
resp <- curlGetResponse_ url opts
return $ if Curl.respCurlCode resp /= Curl.CurlOK
then Nothing
else Just resp
curlGetResponse_ :: URL
-> [Curl.CurlOption]
-> IO (Curl.CurlResponse_ [(String, String)] BS.ByteString)
curlGetResponse_ = Curl.curlGetResponse_
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder response = TagSoup.castString
$ choosenDecoder body
where
body = Curl.respBody response
headers = Curl.respHeaders response
contentType = listToMaybe
$ map (map toLower . snd)
$ take 1
$ dropWhile ((/= "content-type") . map toLower . fst)
headers
isType t | Just ct <- contentType = ("charset=" ++ t) `isInfixOf` ct
| otherwise = False
choosenDecoder | isType "utf-8" = Text.decodeUtf8
| otherwise = Text.decodeLatin1
utf8Decoder :: TagSoup.StringLike str => Decoder str
utf8Decoder = TagSoup.castString . Text.decodeUtf8 . Curl.respBody
iso88591Decoder :: TagSoup.StringLike str => Decoder str
iso88591Decoder = TagSoup.castString . Text.decodeLatin1 . Curl.respBody