{-# LANGUAGE OverloadedStrings, BangPatterns, RecordWildCards, ViewPatterns,
DoAndIfThenElse, PatternGuards, ScopedTypeVariables,
TupleSections #-}
{-# OPTIONS_GHC -fwarn-incomplete-patterns -fwarn-unused-imports #-}
module Network.HTTP.Conduit.Downloader
(
urlGetContents, urlGetContentsPost
, download, post, downloadG, rawDownload
, DownloadResult(..), RawDownloadResult(..), DownloadOptions
, DownloaderSettings(..)
, Downloader, withDownloader, withDownloaderSettings, newDownloader
, postRequest
) where
import qualified Data.Text as T
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Internal as B
import Control.Monad
import qualified Control.Exception as E
import Data.Default as C
import Data.String
import Data.Char
import Data.Maybe
import Data.List
import Foreign
import qualified Network.Socket as NS
import qualified Network.Socket.ByteString as NS
import qualified OpenSSL as SSL
import qualified OpenSSL.Session as SSL
import qualified Network.HTTP.Types as N
import qualified Network.HTTP.Client as C
import qualified Network.HTTP.Client.Internal as C
import System.Timeout
import Codec.Compression.Zlib.Raw as Deflate
import Network.URI
import System.IO.Unsafe
import Data.Time.Format
import Data.Time.Clock
import Data.Time.Clock.POSIX
data DownloadResult
= DROK B.ByteString DownloadOptions
| DRRedirect String
| DRError String
| DRNotModified
deriving (Show, Read, Eq)
data RawDownloadResult
= RawDownloadResult
{ rdrStatus :: N.Status
, rdrHttpVersion :: N.HttpVersion
, rdrHeaders :: N.ResponseHeaders
, rdrBody :: B.ByteString
, rdrCookieJar :: C.CookieJar
}
deriving Show
type DownloadOptions = [String]
data DownloaderSettings
= DownloaderSettings
{ dsUserAgent :: B.ByteString
, dsTimeout :: Int
, dsManagerSettings :: C.ManagerSettings
, dsMaxDownloadSize :: Int
}
instance Default DownloaderSettings where
def =
DownloaderSettings
{ dsUserAgent = "Mozilla/5.0 (compatible; HttpConduitDownloader/1.0; +http://hackage.haskell.org/package/http-conduit-downloader)"
, dsTimeout = 30
, dsManagerSettings =
C.defaultManagerSettings
{ C.managerTlsConnection = getOpenSSLConnection
, C.managerTlsProxyConnection = getOpenSSLProxyConnection
, C.managerProxyInsecure = C.proxyFromRequest
, C.managerProxySecure = C.proxyFromRequest
}
, dsMaxDownloadSize = 10*1024*1024
}
getOpenSSLConnection :: IO (Maybe NS.HostAddress -> String -> Int
-> IO C.Connection)
getOpenSSLConnection =
return $ \ mbha host port -> do
let c sock = makeSSLConnection sock host
case mbha of
Nothing -> openSocketByName host port c
Just ha -> openSocket ha port c
getOpenSSLProxyConnection :: IO (B.ByteString -> (C.Connection -> IO ()) -> String -> Maybe NS.HostAddress -> String -> Int -> IO C.Connection)
getOpenSSLProxyConnection =
return $ \ connstr checkConn serverName _mbha host port -> do
openSocketByName host port $ \ sock -> do
sc <- C.makeConnection
(NS.recv sock 8129)
(NS.sendAll sock)
(return ())
C.connectionWrite sc connstr
checkConn sc
makeSSLConnection sock serverName
globalSSLContext :: SSL.SSLContext
globalSSLContext = unsafePerformIO $ do
ctx <- SSL.context
return ctx
{-# NOINLINE globalSSLContext #-}
makeSSLConnection :: NS.Socket -> String -> IO C.Connection
makeSSLConnection sock host = do
ssl <- SSL.connection globalSSLContext sock
SSL.setTlsextHostName ssl host
SSL.connect ssl
C.makeConnection
(SSL.read ssl bufSize
`E.catch`
\ (_ :: SSL.ConnectionAbruptlyTerminated) -> return ""
)
(SSL.write ssl)
(NS.close sock `E.catch` \(_ :: E.IOException) -> return ())
openSocket :: NS.HostAddress
-> Int
-> (NS.Socket -> IO a)
-> IO a
openSocket ha port act =
openSocket'
NS.AddrInfo
{ NS.addrFlags = []
, NS.addrFamily = NS.AF_INET
, NS.addrSocketType = NS.Stream
, NS.addrProtocol = 6
, NS.addrAddress = NS.SockAddrInet (toEnum port) ha
, NS.addrCanonName = Nothing
}
act
openSocket' :: NS.AddrInfo -> (NS.Socket -> IO a) -> IO a
openSocket' addr act = do
E.bracketOnError
(NS.socket (NS.addrFamily addr) (NS.addrSocketType addr)
(NS.addrProtocol addr))
(NS.close)
(\sock -> do
NS.setSocketOption sock NS.NoDelay 1
NS.connect sock (NS.addrAddress addr)
act sock)
openSocketByName :: NS.HostName -> Int -> (NS.Socket -> IO a) -> IO a
openSocketByName host port act = do
let hints = NS.defaultHints
{ NS.addrFlags = []
, NS.addrFamily = NS.AF_INET
, NS.addrSocketType = NS.Stream
, NS.addrProtocol = 6
}
(addrInfo:_) <- NS.getAddrInfo (Just hints) (Just host) (Just $ show port)
openSocket' addrInfo act
data Downloader
= Downloader
{ manager :: C.Manager
, settings :: DownloaderSettings
}
newDownloader :: DownloaderSettings -> IO Downloader
newDownloader s = do
SSL.withOpenSSL $ return ()
m <- C.newManager $ dsManagerSettings s
return $ Downloader m s
withDownloader :: (Downloader -> IO a) -> IO a
withDownloader = withDownloaderSettings def
withDownloaderSettings :: DownloaderSettings -> (Downloader -> IO a) -> IO a
withDownloaderSettings s f = f =<< newDownloader s
parseUrl :: String -> Either E.SomeException C.Request
parseUrl = C.parseRequest . takeWhile (/= '#')
download :: Downloader
-> String
-> Maybe NS.HostAddress
-> DownloadOptions
-> IO DownloadResult
download = downloadG return
post :: Downloader -> String -> Maybe NS.HostAddress -> B.ByteString
-> IO DownloadResult
post d url ha dat =
downloadG (return . postRequest dat) d url ha []
postRequest :: B.ByteString -> C.Request -> C.Request
postRequest dat rq =
rq { C.method = N.methodPost
, C.requestBody = C.RequestBodyBS dat }
downloadG :: (C.Request -> IO C.Request)
-> Downloader
-> String
-> Maybe NS.HostAddress
-> DownloadOptions
-> IO (DownloadResult)
downloadG f d u h o = fmap fst $ rawDownload f d u h o
rawDownload :: (C.Request -> IO C.Request)
-> Downloader
-> String
-> Maybe NS.HostAddress
-> DownloadOptions
-> IO (DownloadResult, Maybe RawDownloadResult)
rawDownload f (Downloader {..}) url hostAddress opts =
case parseUrl url of
Left e ->
fmap (, Nothing) $
maybe (return $ DRError $ show e) (httpExceptionToDR url)
(E.fromException e)
Right rq -> do
let dl req firstTime = do
r <- (timeout (dsTimeout settings * 1000000) $ C.withResponse req manager $ \ r -> do
let s = C.responseStatus r
h = C.responseHeaders r
rdr d =
RawDownloadResult
{ rdrStatus = s
, rdrHttpVersion = C.responseVersion r
, rdrHeaders = h
, rdrBody = d
, rdrCookieJar = C.responseCookieJar r
}
readLen = B.foldl' (\ a d -> a * 10 + ord d - ord '0') 0
mbb <- case lookup "Content-Length" h of
Just l
| B.all (\ c -> c >= '0' && c <= '9') l
&& not (B.null l)
&& readLen l > dsMaxDownloadSize settings
-> do
return Nothing
_ ->
sinkByteString (C.brRead $ C.responseBody r)
(dsMaxDownloadSize settings)
case mbb of
Just b -> do
let d = tryDeflate h b
curTime <- getCurrentTime
return
(makeDownloadResultC curTime url s h d
, Just $ rdr d)
Nothing ->
return (DRError "Too much data", Just $ rdr ""))
`E.catch`
(fmap (Just . (, Nothing)) . httpExceptionToDR url)
`E.catch`
(return . (Just . (, Nothing)) . someException)
case r of
Just (DRError e, _)
| ("EOF reached" `isSuffixOf` e
|| e == "Invalid HTTP status line:\n"
|| e == "Incomplete headers"
) && firstTime ->
dl req False
| "ZlibException" `isPrefixOf` e && firstTime ->
dl (disableCompression req) False
_ ->
return $ fromMaybe (DRError "Timeout", Nothing) r
disableCompression req =
req { C.requestHeaders =
("Accept-Encoding", "") : C.requestHeaders req }
rq1 = rq { C.requestHeaders =
[("Accept", "*/*")
,("User-Agent", dsUserAgent settings)
]
++ map toHeader opts
++ C.requestHeaders rq
, C.redirectCount = 0
, C.responseTimeout = C.responseTimeoutNone
, C.hostAddress = hostAddress
, C.checkResponse = \ _ _ -> return ()
}
req <- f rq1
dl req True
where toHeader :: String -> N.Header
toHeader h = let (a,b) = break (== ':') h in
(fromString a, fromString (tail b))
someException :: E.SomeException -> DownloadResult
someException e = case show e of
"<<timeout>>" -> DRError "Timeout"
s -> DRError s
tryDeflate headers b
| Just d <- lookup "Content-Encoding" headers
, B.map toLower d == "deflate"
= BL.toStrict $ Deflate.decompress $ BL.fromStrict b
| otherwise = b
httpExceptionToDR :: Monad m => String -> C.HttpException -> m DownloadResult
httpExceptionToDR url exn = return $ case exn of
C.HttpExceptionRequest _ ec -> httpExceptionContentToDR url ec
C.InvalidUrlException _ e
| e == "Invalid URL" -> DRError e
| otherwise -> DRError $ "Invalid URL: " ++ e
httpExceptionContentToDR :: String -> C.HttpExceptionContent -> DownloadResult
httpExceptionContentToDR url ec = case ec of
C.StatusCodeException r b ->
makeDownloadResultC (posixSecondsToUTCTime 0) url
(C.responseStatus r) (C.responseHeaders r) b
C.TooManyRedirects _ -> DRError "Too many redirects"
C.OverlongHeaders -> DRError "Overlong HTTP headers"
C.ResponseTimeout -> DRError "Timeout"
C.ConnectionTimeout -> DRError "Connection timeout"
C.ConnectionFailure e -> DRError $ "Connection failed: " ++ show e
C.InvalidStatusLine l -> DRError $ "Invalid HTTP status line:\n" ++ B.unpack l
C.InvalidHeader h -> DRError $ "Invalid HTTP header:\n" ++ B.unpack h
C.InvalidRequestHeader h -> DRError $ "Invalid HTTP request header:\n" ++ B.unpack h
C.InternalException e ->
case show e of
"<<timeout>>" -> DRError "Timeout"
s -> DRError s
C.ProxyConnectException _ _ _ -> DRError "Can't connect to proxy"
C.NoResponseDataReceived -> DRError "No response data received"
C.TlsNotSupported -> DRError "TLS not supported"
C.WrongRequestBodyStreamSize e a ->
DRError $ "The request body provided did not match the expected size "
++ ea e a
C.ResponseBodyTooShort e a -> DRError $ "Response body too short " ++ ea e a
C.InvalidChunkHeaders -> DRError "Invalid chunk headers"
C.IncompleteHeaders -> DRError "Incomplete headers"
C.InvalidDestinationHost _ -> DRError "Invalid destination host"
C.HttpZlibException e -> DRError $ show e
C.InvalidProxyEnvironmentVariable n v ->
DRError $ "Invalid proxy environment variable "
++ show n ++ "=" ++ show v
C.InvalidProxySettings s -> DRError $ "Invalid proxy settings:\n" ++ T.unpack s
C.ConnectionClosed -> DRError "Connection closed"
where ea expected actual =
"(expected " ++ show expected ++ " bytes, actual is "
++ show actual ++ " bytes)"
bufSize :: Int
bufSize = 32 * 1024 - overhead
where overhead = 2 * sizeOf (undefined :: Int)
newBuf :: IO B.ByteString
newBuf = do
fp <- B.mallocByteString bufSize
return $ B.PS fp 0 0
addBs :: [B.ByteString] -> B.ByteString -> B.ByteString
-> IO ([B.ByteString], B.ByteString)
addBs acc (B.PS bfp _ bl) (B.PS sfp offs sl) = do
let cpSize = min (bufSize - bl) sl
bl' = bl + cpSize
withForeignPtr bfp $ \ dst -> withForeignPtr sfp $ \ src ->
B.memcpy (dst `plusPtr` bl) (src `plusPtr` offs) (toEnum cpSize)
if bl' == bufSize then do
buf' <- newBuf
addBs (B.PS bfp 0 bufSize : acc) buf'
(B.PS sfp (offs + cpSize) (sl - cpSize))
else do
return (acc, B.PS bfp 0 bl')
sinkByteString :: IO B.ByteString -> Int -> IO (Maybe B.ByteString)
sinkByteString readChunk limit = do
buf <- newBuf
go 0 [] buf
where go len acc buf = do
inp <- readChunk
if B.null inp then
return $ Just $ B.concat $ reverse (buf:acc)
else do
(acc', buf') <- addBs acc buf inp
let len' = len + B.length inp
if len' > limit then
return Nothing
else
go len' acc' buf'
makeDownloadResultC :: UTCTime -> String -> N.Status -> N.ResponseHeaders
-> B.ByteString -> DownloadResult
makeDownloadResultC curTime url c headers b = do
if N.statusCode c == 304 then
DRNotModified
else if N.statusCode c `elem`
[ 300
, 301
, 302
, 303
, 307
, 308
] then
case lookup "location" headers of
Just (B.unpack -> loc) ->
redirect $
relUri (takeWhile (/= '#') $ dropWhile (== ' ') loc)
_ ->
DRError $ "Redirect status, but no Location field\n"
++ B.unpack (N.statusMessage c) ++ "\n"
++ unlines (map show headers)
else if N.statusCode c >= 300 then
DRError $ "HTTP " ++ show (N.statusCode c) ++ " "
++ B.unpack (N.statusMessage c)
else
DROK b (redownloadOpts [] headers)
where redirect r
| otherwise = DRRedirect r
redownloadOpts acc [] = reverse acc
redownloadOpts _ (("Pragma", B.map toLower -> tag) : _)
| "no-cache" `B.isInfixOf` tag = []
redownloadOpts _ (("Cache-Control", B.map toLower -> tag) : _)
| any (`B.isInfixOf` tag)
["no-cache", "no-store", "must-revalidate", "max-age=0"] = []
redownloadOpts acc (("Expires", time):xs)
| ts <- B.unpack time
, Just t <- parseHttpTime ts
, t > curTime =
redownloadOpts acc xs
| otherwise = []
redownloadOpts acc (("ETag", tag):xs) =
redownloadOpts (("If-None-Match: " ++ B.unpack tag) : acc) xs
redownloadOpts acc (("Last-Modified", time):xs)
| ts <- B.unpack time
, Just t <- parseHttpTime ts
, t <= curTime =
redownloadOpts (("If-Modified-Since: " ++ B.unpack time) : acc) xs
redownloadOpts acc (_:xs) = redownloadOpts acc xs
fixNonAscii =
escapeURIString
(\ x -> ord x <= 0x7f && x `notElem` (" []{}|\"" :: String)) .
trimString
relUri (fixNonAscii -> r) =
fromMaybe r $
fmap (($ "") . uriToString id) $
liftM2 relativeTo
(parseURIReference r)
(parseURI $ fixNonAscii url)
tryParseTime :: [String] -> String -> Maybe UTCTime
tryParseTime formats string =
foldr mplus Nothing $
map (\ fmt -> parseTimeM True defaultTimeLocale fmt (trimString string))
formats
trimString :: String -> String
trimString = reverse . dropWhile isSpace . reverse . dropWhile isSpace
parseHttpTime :: String -> Maybe UTCTime
parseHttpTime =
tryParseTime
["%a, %e %b %Y %k:%M:%S %Z"
,"%A, %e-%b-%y %k:%M:%S %Z"
,"%a %b %e %k:%M:%S %Y"
]
globalDownloader :: Downloader
globalDownloader = unsafePerformIO $ newDownloader def
{-# NOINLINE globalDownloader #-}
urlGetContents :: String -> IO B.ByteString
urlGetContents url = do
r <- download globalDownloader url Nothing []
case r of
DROK c _ -> return c
e -> fail $ "urlGetContents " ++ show url ++ ": " ++ show e
urlGetContentsPost :: String -> B.ByteString -> IO B.ByteString
urlGetContentsPost url dat = do
r <- post globalDownloader url Nothing dat
case r of
DROK c _ -> return c
e -> fail $ "urlGetContentsPost " ++ show url ++ ": " ++ show e