{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
URL
, Config (..)
, Decoder
, defaultDecoder
, utf8Decoder
, iso88591Decoder
, fetchTags
, fetchTagsWithConfig
, scrapeURL
, scrapeURLWithConfig
) where
import Text.HTML.Scalpel.Core
import Control.Monad
import Data.CaseInsensitive ()
import Data.Default (def)
import Data.Maybe (listToMaybe)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Default as Default
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.TLS as HTTP
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
type URL = String
type Decoder str = HTTP.Response LBS.ByteString -> str
data Config str = Config {
decoder :: Decoder str
, manager :: Maybe HTTP.Manager
}
instance TagSoup.StringLike str => Default.Default (Config str) where
def = Config {
decoder = defaultDecoder
, manager = Nothing
}
scrapeURL :: (TagSoup.StringLike str)
=> URL -> Scraper str a -> IO (Maybe a)
scrapeURL = scrapeURLWithConfig def
scrapeURLWithConfig :: (TagSoup.StringLike str)
=> Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig config url scraper = do
scrape scraper `liftM` fetchTagsWithConfig config url
fetchTags :: TagSoup.StringLike str
=> URL -> IO [TagSoup.Tag str]
fetchTags = fetchTagsWithConfig def
fetchTagsWithConfig :: TagSoup.StringLike str
=> Config str -> URL -> IO [TagSoup.Tag str]
fetchTagsWithConfig config url = do
manager <- maybe HTTP.getGlobalManager return (manager config)
response <- flip HTTP.httpLbs manager =<< HTTP.parseRequest url
return $ TagSoup.parseTags $ decoder config $ response
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder response = TagSoup.castString
$ choosenDecoder body
where
body = HTTP.responseBody response
headers = HTTP.responseHeaders response
contentType = listToMaybe
$ map (Text.decodeLatin1 . snd)
$ take 1
$ dropWhile ((/= "content-type") . fst)
headers
isType t | Just ct <- contentType = ("charset=" `Text.append` t) `Text.isInfixOf` ct
| otherwise = False
choosenDecoder | isType "utf-8" = Text.decodeUtf8 . LBS.toStrict
| otherwise = Text.decodeLatin1 . LBS.toStrict
utf8Decoder :: TagSoup.StringLike str => Decoder str
utf8Decoder = TagSoup.castString . Text.decodeUtf8 . LBS.toStrict . HTTP.responseBody
iso88591Decoder :: TagSoup.StringLike str => Decoder str
iso88591Decoder = TagSoup.castString . Text.decodeLatin1 . LBS.toStrict . HTTP.responseBody