Network/URI.hs

{-# LANGUAGE CPP #-}
--------------------------------------------------------------------------------
-- |
--  Module      :  Network.URI
--  Copyright   :  (c) 2004, Graham Klyne
--  License     :  BSD-style (see end of this file)
--
--  Maintainer  :  Graham Klyne <gk@ninebynine.org>
--  Stability   :  provisional
--  Portability :  portable
--
--  This module defines functions for handling URIs.  It presents substantially the
--  same interface as the older GHC Network.URI module, but is implemented using
--  Parsec rather than a Regex library that is not available with Hugs.  The internal
--  representation of URI has been changed so that URI strings are more
--  completely preserved when round-tripping to a URI value and back.
--
--  In addition, four methods are provided for parsing different
--  kinds of URI string (as noted in RFC3986):
--      'parseURI',
--      'parseURIReference',
--      'parseRelativeReference' and
--      'parseAbsoluteURI'.
--
--  Further, four methods are provided for classifying different
--  kinds of URI string (as noted in RFC3986):
--      'isURI',
--      'isURIReference',
--      'isRelativeReference' and
--      'isAbsoluteURI'.
--
--  The long-standing official reference for URI handling was RFC2396 [1],
--  as updated by RFC 2732 [2], but this was replaced by a new specification,
--  RFC3986 [3] in January 2005.  This latter specification has been used
--  as the primary reference for constructing the URI parser implemented
--  here, and it is intended that there is a direct relationship between
--  the syntax definition in that document and this parser implementation.
--
--  RFC 1808 [4] contains a number of test cases for relative URI handling.
--  Dan Connolly's Python module @uripath.py@ [5] also contains useful details
--  and test cases.
--
--  Some of the code has been copied from the previous GHC implementation,
--  but the parser is replaced with one that performs more complete
--  syntax checking of the URI itself, according to RFC3986 [3].
--
--  References
--
--  (1) <http://www.ietf.org/rfc/rfc2396.txt>
--
--  (2) <http://www.ietf.org/rfc/rfc2732.txt>
--
--  (3) <http://www.ietf.org/rfc/rfc3986.txt>
--
--  (4) <http://www.ietf.org/rfc/rfc1808.txt>
--
--  (5) <http://www.w3.org/2000/10/swap/uripath.py>
--
--------------------------------------------------------------------------------

module Network.URI
    (
    -- * The URI type
      URI(..)
    , URIAuth(..)
    , nullURI
      
    -- * Parsing
    , parseURI
    , parseURIReference
    , parseRelativeReference
    , parseAbsoluteURI
      
    -- * Test for strings containing various kinds of URI
    , isURI
    , isURIReference
    , isRelativeReference
    , isAbsoluteURI
    , isIPv6address
    , isIPv4address
      
    -- * Predicates
    , uriIsAbsolute
    , uriIsRelative
      
    -- * Relative URIs
    , relativeTo
    , nonStrictRelativeTo
    , relativeFrom
      
    -- * Operations on URI strings
    -- | Support for putting strings into URI-friendly
    --   escaped format and getting them back again.
    --   This can't be done transparently in all cases, because certain
    --   characters have different meanings in different kinds of URI.
    --   The URI spec [3], section 2.4, indicates that all URI components
    --   should be escaped before they are assembled as a URI:
    --   \"Once produced, a URI is always in its percent-encoded form\"
    , uriToString
    , isReserved, isUnreserved
    , isAllowedInURI, isUnescapedInURI
    , isUnescapedInURIComponent
    , escapeURIChar
    , escapeURIString
    , unEscapeString
      
    -- * URI Normalization functions
    , normalizeCase
    , normalizeEscape
    , normalizePathSegments
      
    -- * Deprecated functions
    , parseabsoluteURI
    , escapeString
    , reserved, unreserved
    , scheme, authority, path, query, fragment
    ) where

import Text.ParserCombinators.Parsec
    ( GenParser, ParseError
    , parse, (<|>), (<?>), try
    , option, many, many1, count, notFollowedBy
    , char, satisfy, oneOf, string, eof
    , unexpected
    )

import Control.Monad (MonadPlus(..))
import Data.Char (ord, chr, isHexDigit, toLower, toUpper, digitToInt)
import Data.Bits ((.|.),(.&.),shiftL,shiftR)
import Debug.Trace (trace)
import Numeric (showIntAtBase)

import Data.Typeable (Typeable)
#if MIN_VERSION_base(4,0,0)
import Data.Data (Data)
#else
import Data.Generics (Data)
#endif

------------------------------------------------------------
--  The URI datatype
------------------------------------------------------------

-- |Represents a general universal resource identifier using
--  its component parts.
--
--  For example, for the URI
--
--  >   foo://anonymous@www.haskell.org:42/ghc?query#frag
--
--  the components are:
--
data URI = URI
    { uriScheme     :: String           -- ^ @foo:@
    , uriAuthority  :: Maybe URIAuth    -- ^ @\/\/anonymous\@www.haskell.org:42@
    , uriPath       :: String           -- ^ @\/ghc@
    , uriQuery      :: String           -- ^ @?query@
    , uriFragment   :: String           -- ^ @#frag@
    } deriving (Eq, Ord, Typeable, Data)

-- |Type for authority value within a URI
data URIAuth = URIAuth
    { uriUserInfo   :: String           -- ^ @anonymous\@@
    , uriRegName    :: String           -- ^ @www.haskell.org@
    , uriPort       :: String           -- ^ @:42@
    } deriving (Eq, Ord, Show, Typeable, Data)

-- |Blank URI
nullURI :: URI
nullURI = URI
    { uriScheme     = ""
    , uriAuthority  = Nothing
    , uriPath       = ""
    , uriQuery      = ""
    , uriFragment   = ""
    }

--  URI as instance of Show.  Note that for security reasons, the default
--  behaviour is to suppress any userinfo field (see RFC3986, section 7.5).
--  This can be overridden by using uriToString directly with first
--  argument @id@ (noting that this returns a ShowS value rather than a string).
--
--  [[[Another design would be to embed the userinfo mapping function in
--  the URIAuth value, with the default value suppressing userinfo formatting,
--  but providing a function to return a new URI value with userinfo
--  data exposed by show.]]]
--
instance Show URI where
    showsPrec _ = uriToString defaultUserInfoMap

defaultUserInfoMap :: String -> String
defaultUserInfoMap uinf = user++newpass
    where
        (user,pass) = break (==':') uinf
        newpass     = if null pass || (pass == "@")
                                   || (pass == ":@")
                        then pass
                        else ":...@"

testDefaultUserInfoMap :: [Bool]
testDefaultUserInfoMap =
     [ defaultUserInfoMap ""                == ""
     , defaultUserInfoMap "@"               == "@"
     , defaultUserInfoMap "user@"           == "user@"
     , defaultUserInfoMap "user:@"          == "user:@"
     , defaultUserInfoMap "user:anonymous@" == "user:...@"
     , defaultUserInfoMap "user:pass@"      == "user:...@"
     , defaultUserInfoMap "user:pass"       == "user:...@"
     , defaultUserInfoMap "user:anonymous"  == "user:...@"
     ]

------------------------------------------------------------
--  Parse a URI
------------------------------------------------------------

-- |Turn a string containing a URI into a 'URI'.
--  Returns 'Nothing' if the string is not a valid URI;
--  (an absolute URI with optional fragment identifier).
--
--  NOTE: this is different from the previous network.URI,
--  whose @parseURI@ function works like 'parseURIReference'
--  in this module.
--
parseURI :: String -> Maybe URI
parseURI = parseURIAny uri

-- |Parse a URI reference to a 'URI' value.
--  Returns 'Nothing' if the string is not a valid URI reference.
--  (an absolute or relative URI with optional fragment identifier).
--
parseURIReference :: String -> Maybe URI
parseURIReference = parseURIAny uriReference

-- |Parse a relative URI to a 'URI' value.
--  Returns 'Nothing' if the string is not a valid relative URI.
--  (a relative URI with optional fragment identifier).
--
parseRelativeReference :: String -> Maybe URI
parseRelativeReference = parseURIAny relativeRef

-- |Parse an absolute URI to a 'URI' value.
--  Returns 'Nothing' if the string is not a valid absolute URI.
--  (an absolute URI without a fragment identifier).
--
parseAbsoluteURI :: String -> Maybe URI
parseAbsoluteURI = parseURIAny absoluteURI

-- |Test if string contains a valid URI
--  (an absolute URI with optional fragment identifier).
--
isURI :: String -> Bool
isURI = isValidParse uri

-- |Test if string contains a valid URI reference
--  (an absolute or relative URI with optional fragment identifier).
--
isURIReference :: String -> Bool
isURIReference = isValidParse uriReference

-- |Test if string contains a valid relative URI
--  (a relative URI with optional fragment identifier).
--
isRelativeReference :: String -> Bool
isRelativeReference = isValidParse relativeRef

-- |Test if string contains a valid absolute URI
--  (an absolute URI without a fragment identifier).
--
isAbsoluteURI :: String -> Bool
isAbsoluteURI = isValidParse absoluteURI

-- |Test if string contains a valid IPv6 address
--
isIPv6address :: String -> Bool
isIPv6address = isValidParse ipv6address

-- |Test if string contains a valid IPv4 address
--
isIPv4address :: String -> Bool
isIPv4address = isValidParse ipv4address

-- |Test function: parse and reconstruct a URI reference
--
testURIReference :: String -> String
testURIReference uristr = show (parseAll uriReference "" uristr)

--  Helper function for turning a string into a URI
--
parseURIAny :: URIParser URI -> String -> Maybe URI
parseURIAny parser uristr = case parseAll parser "" uristr of
        Left  _ -> Nothing
        Right u -> Just u

--  Helper function to test a string match to a parser
--
isValidParse :: URIParser a -> String -> Bool
isValidParse parser uristr = case parseAll parser "" uristr of
        -- Left  e -> error (show e)
        Left  _ -> False
        Right _ -> True

parseAll :: URIParser a -> String -> String -> Either ParseError a
parseAll parser filename uristr = parse newparser filename uristr
    where
        newparser =
            do  { res <- parser
                ; eof
                ; return res
                }

------------------------------------------------------------
--  Predicates
------------------------------------------------------------

uriIsAbsolute :: URI -> Bool
uriIsAbsolute (URI {uriScheme = scheme}) = scheme /= ""

uriIsRelative :: URI -> Bool
uriIsRelative = not . uriIsAbsolute

------------------------------------------------------------
--  URI parser body based on Parsec elements and combinators
------------------------------------------------------------

--  Parser parser type.
--  Currently
type URIParser a = GenParser Char () a

--  RFC3986, section 2.1
--
--  Parse and return a 'pct-encoded' sequence
--
escaped :: URIParser String
escaped =
    do  { char '%'
        ; h1 <- hexDigitChar
        ; h2 <- hexDigitChar
        ; return $ ['%',h1,h2]
        }

--  RFC3986, section 2.2
--
-- |Returns 'True' if the character is a \"reserved\" character in a
--  URI.  To include a literal instance of one of these characters in a
--  component of a URI, it must be escaped.
--
isReserved :: Char -> Bool
isReserved c = isGenDelims c || isSubDelims c

isGenDelims :: Char -> Bool
isGenDelims c = c `elem` ":/?#[]@"

isSubDelims :: Char -> Bool
isSubDelims c = c `elem` "!$&'()*+,;="

genDelims :: URIParser String
genDelims = do { c <- satisfy isGenDelims ; return [c] }

subDelims :: URIParser String
subDelims = do { c <- satisfy isSubDelims ; return [c] }

--  RFC3986, section 2.3
--
-- |Returns 'True' if the character is an \"unreserved\" character in
--  a URI.  These characters do not need to be escaped in a URI.  The
--  only characters allowed in a URI are either \"reserved\",
--  \"unreserved\", or an escape sequence (@%@ followed by two hex digits).
--
isUnreserved :: Char -> Bool
isUnreserved c = isAlphaNumChar c || (c `elem` "-_.~")

unreservedChar :: URIParser String
unreservedChar = do { c <- satisfy isUnreserved ; return [c] }

--  RFC3986, section 3
--
--   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
--
--   hier-part   = "//" authority path-abempty
--               / path-abs
--               / path-rootless
--               / path-empty

uri :: URIParser URI
uri =
    do  { us <- try uscheme
        -- ; ua <- option Nothing ( do { try (string "//") ; uauthority } )
        -- ; up <- upath
        ; (ua,up) <- hierPart
        ; uq <- option "" ( do { char '?' ; uquery    } )
        ; uf <- option "" ( do { char '#' ; ufragment } )
        ; return $ URI
            { uriScheme    = us
            , uriAuthority = ua
            , uriPath      = up
            , uriQuery     = uq
            , uriFragment  = uf
            }
        }

hierPart :: URIParser ((Maybe URIAuth),String)
hierPart =
        do  { try (string "//")
            ; ua <- uauthority
            ; up <- pathAbEmpty
            ; return (ua,up)
            }
    <|> do  { up <- pathAbs
            ; return (Nothing,up)
            }
    <|> do  { up <- pathRootLess
            ; return (Nothing,up)
            }
    <|> do  { return (Nothing,"")
            }

--  RFC3986, section 3.1

uscheme :: URIParser String
uscheme =
    do  { s <- oneThenMany alphaChar (satisfy isSchemeChar)
        ; char ':'
        ; return $ s++":"
        }

--  RFC3986, section 3.2

uauthority :: URIParser (Maybe URIAuth)
uauthority =
    do  { uu <- option "" (try userinfo)
        ; uh <- host
        ; up <- option "" port
        ; return $ Just $ URIAuth
            { uriUserInfo = uu
            , uriRegName  = uh
            , uriPort     = up
            }
        }

--  RFC3986, section 3.2.1

userinfo :: URIParser String
userinfo =
    do  { uu <- many (uchar ";:&=+$,")
        ; char '@'
        ; return (concat uu ++"@")
        }

--  RFC3986, section 3.2.2

host :: URIParser String
host = ipLiteral <|> try ipv4address <|> regName

ipLiteral :: URIParser String
ipLiteral =
    do  { char '['
        ; ua <- ( ipv6address <|> ipvFuture )
        ; char ']'
        ; return $ "[" ++ ua ++ "]"
        }
    <?> "IP address literal"

ipvFuture :: URIParser String
ipvFuture =
    do  { char 'v'
        ; h <- hexDigitChar
        ; char '.'
        ; a <- many1 (satisfy isIpvFutureChar)
        ; return $ 'v':h:'.':a
        }

isIpvFutureChar :: Char -> Bool
isIpvFutureChar c = isUnreserved c || isSubDelims c || (c==';')

ipv6address :: URIParser String
ipv6address =
        try ( do
                { a2 <- count 6 h4c
                ; a3 <- ls32
                ; return $ concat a2 ++ a3
                } )
    <|> try ( do
                { string "::"
                ; a2 <- count 5 h4c
                ; a3 <- ls32
                ; return $ "::" ++ concat a2 ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 0
                ; string "::"
                ; a2 <- count 4 h4c
                ; a3 <- ls32
                ; return $ a1 ++ "::" ++ concat a2 ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 1
                ; string "::"
                ; a2 <- count 3 h4c
                ; a3 <- ls32
                ; return $ a1 ++ "::" ++ concat a2 ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 2
                ; string "::"
                ; a2 <- count 2 h4c
                ; a3 <- ls32
                ; return $ a1 ++ "::" ++ concat a2 ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 3
                ; string "::"
                ; a2 <- h4c
                ; a3 <- ls32
                ; return $ a1 ++ "::" ++ a2 ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 4
                ; string "::"
                ; a3 <- ls32
                ; return $ a1 ++ "::" ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 5
                ; string "::"
                ; a3 <- h4
                ; return $ a1 ++ "::" ++ a3
                } )
    <|> try ( do
                { a1 <- opt_n_h4c_h4 6
                ; string "::"
                ; return $ a1 ++ "::"
                } )
    <?> "IPv6 address"

opt_n_h4c_h4 :: Int -> URIParser String
opt_n_h4c_h4 n = option "" $
    do  { a1 <- countMinMax 0 n h4c
        ; a2 <- h4
        ; return $ concat a1 ++ a2
        }

ls32 :: URIParser String
ls32 =  try ( do
                { a1 <- h4c
                ; a2 <- h4
                ; return (a1++a2)
                } )
    <|> ipv4address

h4c :: URIParser String
h4c = try $
    do  { a1 <- h4
        ; char ':'
        ; notFollowedBy (char ':')
        ; return $ a1 ++ ":"
        }

h4 :: URIParser String
h4 = countMinMax 1 4 hexDigitChar

ipv4address :: URIParser String
ipv4address =
    do  { a1 <- decOctet ; char '.'
        ; a2 <- decOctet ; char '.'
        ; a3 <- decOctet ; char '.'
        ; a4 <- decOctet
        ; notFollowedBy regName
        ; return $ a1++"."++a2++"."++a3++"."++a4
        }
    <?> "IPv4 Address"

decOctet :: URIParser String
decOctet =
    do  { a1 <- countMinMax 1 3 digitChar
        ; if (read a1 :: Integer) > 255 then
            fail "Decimal octet value too large"
          else
            return a1
        }

regName :: URIParser String
regName =
    do  { ss <- countMinMax 0 255 ( unreservedChar <|> escaped <|> subDelims )
        ; return $ concat ss
        }
    <?> "Registered name"

--  RFC3986, section 3.2.3

port :: URIParser String
port =
    do  { char ':'
        ; p <- many digitChar
        ; return (':':p)
        }

--
--  RFC3986, section 3.3
--
--   path          = path-abempty    ; begins with "/" or is empty
--                 / path-abs        ; begins with "/" but not "//"
--                 / path-noscheme   ; begins with a non-colon segment
--                 / path-rootless   ; begins with a segment
--                 / path-empty      ; zero characters
--
--   path-abempty  = *( "/" segment )
--   path-abs      = "/" [ segment-nz *( "/" segment ) ]
--   path-noscheme = segment-nzc *( "/" segment )
--   path-rootless = segment-nz *( "/" segment )
--   path-empty    = 0<pchar>
--
--   segment       = *pchar
--   segment-nz    = 1*pchar
--   segment-nzc   = 1*( unreserved / pct-encoded / sub-delims / "@" )
--
--   pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"

{-
upath :: URIParser String
upath = pathAbEmpty
    <|> pathAbs
    <|> pathNoScheme
    <|> pathRootLess
    <|> pathEmpty
-}

pathAbEmpty :: URIParser String
pathAbEmpty =
    do  { ss <- many slashSegment
        ; return $ concat ss
        }

pathAbs :: URIParser String
pathAbs =
    do  { char '/'
        ; ss <- option "" pathRootLess
        ; return $ '/':ss
        }

pathNoScheme :: URIParser String
pathNoScheme =
    do  { s1 <- segmentNzc
        ; ss <- many slashSegment
        ; return $ concat (s1:ss)
        }

pathRootLess :: URIParser String
pathRootLess =
    do  { s1 <- segmentNz
        ; ss <- many slashSegment
        ; return $ concat (s1:ss)
        }

slashSegment :: URIParser String
slashSegment =
    do  { char '/'
        ; s <- segment
        ; return ('/':s)
        }

segment :: URIParser String
segment =
    do  { ps <- many pchar
        ; return $ concat ps
        }

segmentNz :: URIParser String
segmentNz =
    do  { ps <- many1 pchar
        ; return $ concat ps
        }

segmentNzc :: URIParser String
segmentNzc =
    do  { ps <- many1 (uchar "@")
        ; return $ concat ps
        }

pchar :: URIParser String
pchar = uchar ":@"

-- helper function for pchar and friends
uchar :: String -> URIParser String
uchar extras =
        unreservedChar
    <|> escaped
    <|> subDelims
    <|> do { c <- oneOf extras ; return [c] }

--  RFC3986, section 3.4

uquery :: URIParser String
uquery =
    do  { ss <- many $ uchar (":@"++"/?")
        ; return $ '?':concat ss
        }

--  RFC3986, section 3.5

ufragment :: URIParser String
ufragment =
    do  { ss <- many $ uchar (":@"++"/?")
        ; return $ '#':concat ss
        }

--  Reference, Relative and Absolute URI forms
--
--  RFC3986, section 4.1

uriReference :: URIParser URI
uriReference = uri <|> relativeRef

--  RFC3986, section 4.2
--
--   relative-URI  = relative-part [ "?" query ] [ "#" fragment ]
--
--   relative-part = "//" authority path-abempty
--                 / path-abs
--                 / path-noscheme
--                 / path-empty

relativeRef :: URIParser URI
relativeRef =
    do  { notMatching uscheme
        -- ; ua <- option Nothing ( do { try (string "//") ; uauthority } )
        -- ; up <- upath
        ; (ua,up) <- relativePart
        ; uq <- option "" ( do { char '?' ; uquery    } )
        ; uf <- option "" ( do { char '#' ; ufragment } )
        ; return $ URI
            { uriScheme    = ""
            , uriAuthority = ua
            , uriPath      = up
            , uriQuery     = uq
            , uriFragment  = uf
            }
        }

relativePart :: URIParser ((Maybe URIAuth),String)
relativePart =
        do  { try (string "//")
            ; ua <- uauthority
            ; up <- pathAbEmpty
            ; return (ua,up)
            }
    <|> do  { up <- pathAbs
            ; return (Nothing,up)
            }
    <|> do  { up <- pathNoScheme
            ; return (Nothing,up)
            }
    <|> do  { return (Nothing,"")
            }

--  RFC3986, section 4.3

absoluteURI :: URIParser URI
absoluteURI =
    do  { us <- uscheme
        -- ; ua <- option Nothing ( do { try (string "//") ; uauthority } )
        -- ; up <- upath
        ; (ua,up) <- hierPart
        ; uq <- option "" ( do { char '?' ; uquery    } )
        ; return $ URI
            { uriScheme    = us
            , uriAuthority = ua
            , uriPath      = up
            , uriQuery     = uq
            , uriFragment  = ""
            }
        }

--  Imports from RFC 2234

    -- NOTE: can't use isAlphaNum etc. because these deal with ISO 8859
    -- (and possibly Unicode!) chars.
    -- [[[Above was a comment originally in GHC Network/URI.hs:
    --    when IRIs are introduced then most codepoints above 128(?) should
    --    be treated as unreserved, and higher codepoints for letters should
    --    certainly be allowed.
    -- ]]]

isAlphaChar :: Char -> Bool
isAlphaChar c    = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')

isDigitChar :: Char -> Bool
isDigitChar c    = (c >= '0' && c <= '9')

isAlphaNumChar :: Char -> Bool
isAlphaNumChar c = isAlphaChar c || isDigitChar c

isHexDigitChar :: Char -> Bool
isHexDigitChar c = isHexDigit c

isSchemeChar :: Char -> Bool
isSchemeChar c   = (isAlphaNumChar c) || (c `elem` "+-.")

alphaChar :: URIParser Char
alphaChar = satisfy isAlphaChar         -- or: Parsec.letter ?

digitChar :: URIParser Char
digitChar = satisfy isDigitChar         -- or: Parsec.digit ?

alphaNumChar :: URIParser Char
alphaNumChar = satisfy isAlphaNumChar

hexDigitChar :: URIParser Char
hexDigitChar = satisfy isHexDigitChar   -- or: Parsec.hexDigit ?

--  Additional parser combinators for common patterns

oneThenMany :: GenParser t s a -> GenParser t s a -> GenParser t s [a]
oneThenMany p1 pr =
    do  { a1 <- p1
        ; ar <- many pr
        ; return (a1:ar)
        }

countMinMax :: Int -> Int -> GenParser t s a -> GenParser t s [a]
countMinMax m n p | m > 0 =
    do  { a1 <- p
        ; ar <- countMinMax (m-1) (n-1) p
        ; return (a1:ar)
        }
countMinMax _ n _ | n <= 0 = return []
countMinMax _ n p = option [] $
    do  { a1 <- p
        ; ar <- countMinMax 0 (n-1) p
        ; return (a1:ar)
        }

notMatching :: Show a => GenParser tok st a -> GenParser tok st ()
notMatching p = do { a <- try p ; unexpected (show a) } <|> return ()

------------------------------------------------------------
--  Reconstruct a URI string
------------------------------------------------------------
--
-- |Turn a 'URI' into a string.
--
--  Uses a supplied function to map the userinfo part of the URI.
--
--  The Show instance for URI uses a mapping that hides any password
--  that may be present in the URI.  Use this function with argument @id@
--  to preserve the password in the formatted output.
--
uriToString :: (String->String) -> URI -> ShowS
uriToString userinfomap URI { uriScheme=myscheme
                            , uriAuthority=myauthority
                            , uriPath=mypath
                            , uriQuery=myquery
                            , uriFragment=myfragment
                            } =
    (myscheme++) . (uriAuthToString userinfomap myauthority)
               . (mypath++) . (myquery++) . (myfragment++)

uriAuthToString :: (String->String) -> (Maybe URIAuth) -> ShowS
uriAuthToString _           Nothing   = id          -- shows ""
uriAuthToString userinfomap
        (Just URIAuth { uriUserInfo = myuinfo
                      , uriRegName  = myregname
                      , uriPort     = myport
                      } ) =
    ("//"++) . (if null myuinfo then id else ((userinfomap myuinfo)++))
             . (myregname++)
             . (myport++)

------------------------------------------------------------
--  Character classes
------------------------------------------------------------

-- | Returns 'True' if the character is allowed in a URI.
--
isAllowedInURI :: Char -> Bool
isAllowedInURI c = isReserved c || isUnreserved c || c == '%' -- escape char

-- | Returns 'True' if the character is allowed unescaped in a URI.
--
isUnescapedInURI :: Char -> Bool
isUnescapedInURI c = isReserved c || isUnreserved c

-- | Returns 'True' if the character is allowed unescaped in a URI component.
--
isUnescapedInURIComponent :: Char -> Bool
isUnescapedInURIComponent c = not (isReserved c || not (isUnescapedInURI c))

------------------------------------------------------------
--  Escape sequence handling
------------------------------------------------------------

-- |Escape character if supplied predicate is not satisfied,
--  otherwise return character as singleton string.
--
escapeURIChar :: (Char->Bool) -> Char -> String
escapeURIChar p c
    | p c       = [c]
    | otherwise = concatMap (\i -> '%' : myShowHex i "") (utf8EncodeChar c)
    where
        myShowHex :: Int -> ShowS
        myShowHex n r =  case showIntAtBase 16 (toChrHex) n r of
            []  -> "00"
            [x] -> ['0',x]
            cs  -> cs
        toChrHex d
            | d < 10    = chr (ord '0' + fromIntegral d)
            | otherwise = chr (ord 'A' + fromIntegral (d - 10))

-- From http://hackage.haskell.org/package/utf8-string
-- by Eric Mertens, BSD3
-- Returns [Int] for use with showIntAtBase
utf8EncodeChar :: Char -> [Int]
utf8EncodeChar = map fromIntegral . go . ord
 where
  go oc
   | oc <= 0x7f       = [oc]

   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
                        , 0x80 + oc .&. 0x3f
                        ]

   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                        , 0x80 + oc .&. 0x3f
                        ]
   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                        , 0x80 + oc .&. 0x3f
                        ]

-- |Can be used to make a string valid for use in a URI.
--
escapeURIString
    :: (Char->Bool)     -- ^ a predicate which returns 'False'
                        --   if the character should be escaped
    -> String           -- ^ the string to process
    -> String           -- ^ the resulting URI string
escapeURIString p s = concatMap (escapeURIChar p) s

-- |Turns all instances of escaped characters in the string back
--  into literal characters.
--
unEscapeString :: String -> String
unEscapeString [] = ""
unEscapeString s@(c:cs) = case unEscapeByte s of
    Just (byte, rest) -> unEscapeUtf8 byte rest
    Nothing -> c : unEscapeString cs

unEscapeByte :: String -> Maybe (Int, String)
unEscapeByte ('%':x1:x2:s) | isHexDigit x1 && isHexDigit x2 =
    Just (digitToInt x1 * 16 + digitToInt x2, s)
unEscapeByte _ = Nothing

-- Adapted from http://hackage.haskell.org/package/utf8-string
-- by Eric Mertens, BSD3
unEscapeUtf8 :: Int -> String -> String
unEscapeUtf8 c rest
    | c < 0x80 = chr c : unEscapeString rest
    | c < 0xc0 = replacement_character : unEscapeString rest
    | c < 0xe0 = multi1
    | c < 0xf0 = multi_byte 2 0xf 0x800
    | c < 0xf8 = multi_byte 3 0x7 0x10000
    | c < 0xfc = multi_byte 4 0x3 0x200000
    | c < 0xfe = multi_byte 5 0x1 0x4000000
    | otherwise    = replacement_character : unEscapeString rest
    where
    replacement_character = '\xfffd'
    multi1 = case unEscapeByte rest of
      Just (c1, ds) | c1 .&. 0xc0 == 0x80 ->
        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
        in if d >= 0x000080 then toEnum d : unEscapeString ds
                            else replacement_character : unEscapeString ds
      _ -> replacement_character : unEscapeString rest

    multi_byte i mask overlong =
      aux i rest (unEscapeByte rest) (c .&. mask)
      where
        aux 0 rs _ acc
          | overlong <= acc && acc <= 0x10ffff &&
            (acc < 0xd800 || 0xdfff < acc)     &&
            (acc < 0xfffe || 0xffff < acc)      = chr acc : unEscapeString rs
          | otherwise = replacement_character : unEscapeString rs

        aux n _ (Just (r, rs)) acc
          | r .&. 0xc0 == 0x80 = aux (n-1) rs (unEscapeByte rs)
                               $! shiftL acc 6 .|. (r .&. 0x3f)

        aux _ rs _ _ = replacement_character : unEscapeString rs

------------------------------------------------------------
-- Resolving a relative URI relative to a base URI
------------------------------------------------------------

-- |Returns a new 'URI' which represents the value of the
--  first 'URI' interpreted as relative to the second 'URI'.
--  For example:
--
--  > "foo" `relativeTo` "http://bar.org/" = "http://bar.org/foo"
--  > "http:foo" `nonStrictRelativeTo` "http://bar.org/" = "http://bar.org/foo"
--
--  Algorithm from RFC3986 [3], section 5.2.2
--

nonStrictRelativeTo :: URI -> URI -> URI
nonStrictRelativeTo ref base = relativeTo ref' base
    where
        ref' = if uriScheme ref == uriScheme base
               then ref { uriScheme="" }
               else ref

isDefined :: ( MonadPlus m, Eq (m a) ) => m a -> Bool
isDefined a = a /= mzero

-- | Returns a new 'URI' which represents the value of the first 'URI'
-- interpreted as relative to the second 'URI'.
--
-- Algorithm from RFC3986 [3], section 5.2
relativeTo :: URI -> URI -> URI
relativeTo ref base
    | isDefined ( uriScheme ref ) =
        just_segments ref
    | isDefined ( uriAuthority ref ) =
        just_segments ref { uriScheme = uriScheme base }
    | isDefined ( uriPath ref ) =
        if (head (uriPath ref) == '/') then
            just_segments ref
                { uriScheme    = uriScheme base
                , uriAuthority = uriAuthority base
                }
        else
            just_segments ref
                { uriScheme    = uriScheme base
                , uriAuthority = uriAuthority base
                , uriPath      = mergePaths base ref
                }
    | isDefined ( uriQuery ref ) =
        just_segments ref
            { uriScheme    = uriScheme base
            , uriAuthority = uriAuthority base
            , uriPath      = uriPath base
            }
    | otherwise =
        just_segments ref
            { uriScheme    = uriScheme base
            , uriAuthority = uriAuthority base
            , uriPath      = uriPath base
            , uriQuery     = uriQuery base
            }
    where
        just_segments u =
            u { uriPath = removeDotSegments (uriPath u) }
        mergePaths b r
            | isDefined (uriAuthority b) && null pb = '/':pr
            | otherwise                             = dropLast pb ++ pr
            where
                pb = uriPath b
                pr = uriPath r
        dropLast = fst . splitLast -- reverse . dropWhile (/='/') . reverse

--  Remove dot segments, but protect leading '/' character
removeDotSegments :: String -> String
removeDotSegments ('/':ps) = '/':elimDots ps []
removeDotSegments ps       = elimDots ps []

--  Second arg accumulates segments processed so far in reverse order
elimDots :: String -> [String] -> String
-- elimDots ps rs | traceVal "\nps " ps $ traceVal "rs " rs $ False = error ""
elimDots [] [] = ""
elimDots [] rs = concat (reverse rs)
elimDots (    '.':'/':ps)     rs = elimDots ps rs
elimDots (    '.':[]    )     rs = elimDots [] rs
elimDots (    '.':'.':'/':ps) rs = elimDots ps (drop 1 rs)
elimDots (    '.':'.':[]    ) rs = elimDots [] (drop 1 rs)
elimDots ps rs = elimDots ps1 (r:rs)
    where
        (r,ps1) = nextSegment ps

--  Returns the next segment and the rest of the path from a path string.
--  Each segment ends with the next '/' or the end of string.
--
nextSegment :: String -> (String,String)
nextSegment ps =
    case break (=='/') ps of
        (r,'/':ps1) -> (r++"/",ps1)
        (r,_)       -> (r,[])

--  Split last (name) segment from path, returning (path,name)
splitLast :: String -> (String,String)
splitLast p = (reverse revpath,reverse revname)
    where
        (revname,revpath) = break (=='/') $ reverse p

------------------------------------------------------------
-- Finding a URI relative to a base URI
------------------------------------------------------------

-- |Returns a new 'URI' which represents the relative location of
--  the first 'URI' with respect to the second 'URI'.  Thus, the
--  values supplied are expected to be absolute URIs, and the result
--  returned may be a relative URI.
--
--  Example:
--
--  > "http://example.com/Root/sub1/name2#frag"
--  >   `relativeFrom` "http://example.com/Root/sub2/name2#frag"
--  >   == "../sub1/name2#frag"
--
--  There is no single correct implementation of this function,
--  but any acceptable implementation must satisfy the following:
--
--  > (uabs `relativeFrom` ubase) `relativeTo` ubase == uabs
--
--  For any valid absolute URI.
--  (cf. <http://lists.w3.org/Archives/Public/uri/2003Jan/0008.html>
--       <http://lists.w3.org/Archives/Public/uri/2003Jan/0005.html>)
--
relativeFrom :: URI -> URI -> URI
relativeFrom uabs base
    | diff uriScheme    uabs base = uabs
    | diff uriAuthority uabs base = uabs { uriScheme = "" }
    | diff uriPath      uabs base = uabs
        { uriScheme    = ""
        , uriAuthority = Nothing
        , uriPath      = relPathFrom (removeBodyDotSegments $ uriPath uabs)
                                     (removeBodyDotSegments $ uriPath base)
        }
    | diff uriQuery     uabs base = uabs
        { uriScheme    = ""
        , uriAuthority = Nothing
        , uriPath      = ""
        }
    | otherwise = uabs          -- Always carry fragment from uabs
        { uriScheme    = ""
        , uriAuthority = Nothing
        , uriPath      = ""
        , uriQuery     = ""
        }
    where
        diff :: Eq b => (a -> b) -> a -> a -> Bool
        diff sel u1 u2 = sel u1 /= sel u2
        -- Remove dot segments except the final segment
        removeBodyDotSegments p = removeDotSegments p1 ++ p2
            where
                (p1,p2) = splitLast p

relPathFrom :: String -> String -> String
relPathFrom []   _    = "/"
relPathFrom pabs []   = pabs
relPathFrom pabs base =                 -- Construct a relative path segments
    if sa1 == sb1                       -- if the paths share a leading segment
        then if (sa1 == "/")            -- other than a leading '/'
            then if (sa2 == sb2)
                then relPathFrom1 ra2 rb2
                else pabs
            else relPathFrom1 ra1 rb1
        else pabs
    where
        (sa1,ra1) = nextSegment pabs
        (sb1,rb1) = nextSegment base
        (sa2,ra2) = nextSegment ra1
        (sb2,rb2) = nextSegment rb1

--  relPathFrom1 strips off trailing names from the supplied paths,
--  and calls difPathFrom to find the relative path from base to
--  target
relPathFrom1 :: String -> String -> String
relPathFrom1 pabs base = relName
    where
        (sa,na) = splitLast pabs
        (sb,nb) = splitLast base
        rp      = relSegsFrom sa sb
        relName = if null rp then
                      if (na == nb) then ""
                      else if protect na then "./"++na
                      else na
                  else
                      rp++na
        -- Precede name with some path if it is null or contains a ':'
        protect s = null s || ':' `elem` s

--  relSegsFrom discards any common leading segments from both paths,
--  then invokes difSegsFrom to calculate a relative path from the end
--  of the base path to the end of the target path.
--  The final name is handled separately, so this deals only with
--  "directory" segtments.
--
relSegsFrom :: String -> String -> String
{-
relSegsFrom sabs base
    | traceVal "\nrelSegsFrom\nsabs " sabs $ traceVal "base " base $
      False = error ""
-}
relSegsFrom []   []   = ""      -- paths are identical
relSegsFrom sabs base =
    if sa1 == sb1
        then relSegsFrom ra1 rb1
        else difSegsFrom sabs base
    where
        (sa1,ra1) = nextSegment sabs
        (sb1,rb1) = nextSegment base

--  difSegsFrom calculates a path difference from base to target,
--  not including the final name at the end of the path
--  (i.e. results always ends with '/')
--
--  This function operates under the invariant that the supplied
--  value of sabs is the desired path relative to the beginning of
--  base.  Thus, when base is empty, the desired path has been found.
--
difSegsFrom :: String -> String -> String
{-
difSegsFrom sabs base
    | traceVal "\ndifSegsFrom\nsabs " sabs $ traceVal "base " base $
      False = error ""
-}
difSegsFrom sabs ""   = sabs
difSegsFrom sabs base = difSegsFrom ("../"++sabs) (snd $ nextSegment base)

------------------------------------------------------------
--  Other normalization functions
------------------------------------------------------------

-- |Case normalization; cf. RFC3986 section 6.2.2.1
--  NOTE:  authority case normalization is not performed
--
normalizeCase :: String -> String
normalizeCase uristr = ncScheme uristr
    where
        ncScheme (':':cs)                = ':':ncEscape cs
        ncScheme (c:cs) | isSchemeChar c = toLower c:ncScheme cs
        ncScheme _                       = ncEscape uristr -- no scheme present
        ncEscape ('%':h1:h2:cs) = '%':toUpper h1:toUpper h2:ncEscape cs
        ncEscape (c:cs)         = c:ncEscape cs
        ncEscape []             = []

-- |Encoding normalization; cf. RFC3986 section 6.2.2.2
--
normalizeEscape :: String -> String
normalizeEscape ('%':h1:h2:cs)
    | isHexDigit h1 && isHexDigit h2 && isUnreserved escval =
        escval:normalizeEscape cs
    where
        escval = chr (digitToInt h1*16+digitToInt h2)
normalizeEscape (c:cs)         = c:normalizeEscape cs
normalizeEscape []             = []

-- |Path segment normalization; cf. RFC3986 section 6.2.2.4
--
normalizePathSegments :: String -> String
normalizePathSegments uristr = normstr juri
    where
        juri = parseURI uristr
        normstr Nothing  = uristr
        normstr (Just u) = show (normuri u)
        normuri u = u { uriPath = removeDotSegments (uriPath u) }

------------------------------------------------------------
--  Local trace helper functions
------------------------------------------------------------

traceShow :: Show a => String -> a -> a
traceShow msg x = trace (msg ++ show x) x

traceVal :: Show a => String -> a -> b -> b
traceVal msg x y = trace (msg ++ show x) y

------------------------------------------------------------
--  Deprecated functions
------------------------------------------------------------

{-# DEPRECATED parseabsoluteURI "use parseAbsoluteURI" #-}
parseabsoluteURI :: String -> Maybe URI
parseabsoluteURI = parseAbsoluteURI

{-# DEPRECATED escapeString "use escapeURIString, and note the flipped arguments" #-}
escapeString :: String -> (Char->Bool) -> String
escapeString = flip escapeURIString

{-# DEPRECATED reserved "use isReserved" #-}
reserved :: Char -> Bool
reserved = isReserved

{-# DEPRECATED unreserved "use isUnreserved" #-}
unreserved :: Char -> Bool
unreserved = isUnreserved

--  Additional component access functions for backward compatibility

{-# DEPRECATED scheme "use uriScheme" #-}
scheme :: URI -> String
scheme = orNull init . uriScheme

{-# DEPRECATED authority "use uriAuthority, and note changed functionality" #-}
authority :: URI -> String
authority = dropss . ($"") . uriAuthToString id . uriAuthority
    where
        -- Old-style authority component does not include leading '//'
        dropss ('/':'/':s) = s
        dropss s           = s

{-# DEPRECATED path "use uriPath" #-}
path :: URI -> String
path = uriPath

{-# DEPRECATED query "use uriQuery, and note changed functionality" #-}
query :: URI -> String
query = orNull tail . uriQuery

{-# DEPRECATED fragment "use uriFragment, and note changed functionality" #-}
fragment :: URI -> String
fragment = orNull tail . uriFragment

orNull :: ([a]->[a]) -> [a] -> [a]
orNull _ [] = []
orNull f as = f as

--------------------------------------------------------------------------------
--
--  Copyright (c) 2004, G. KLYNE.  All rights reserved.
--  Distributed as free software under the following license.
--
--  Redistribution and use in source and binary forms, with or without
--  modification, are permitted provided that the following conditions
--  are met:
--
--  - Redistributions of source code must retain the above copyright notice,
--  this list of conditions and the following disclaimer.
--
--  - Redistributions in binary form must reproduce the above copyright
--  notice, this list of conditions and the following disclaimer in the
--  documentation and/or other materials provided with the distribution.
--
--  - Neither name of the copyright holders nor the names of its
--  contributors may be used to endorse or promote products derived from
--  this software without specific prior written permission.
--
--  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
--  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
--  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
--  HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
--  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
--  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
--  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
--  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
--  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
--  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
--------------------------------------------------------------------------------