{-# LANGUAGE CPP                   #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}

module Text.XmlHtml.TextParser
( guessEncoding
, parse
, isValidChar
, parseText
, takeWhile0
, takeWhile1
, text
, scanText
, ScanState(..)

, module Text.Parsec.Text
) where

#if !MIN_VERSION_base(4,8,0)
import           Control.Applicative
#endif
import           Data.Char
import           Data.Maybe
import           Text.XmlHtml.Common

import           Data.Text (Text)
import qualified Data.Text as T

import qualified Text.Parsec as P
import           Text.Parsec.Text

import           Data.ByteString (ByteString)
import qualified Data.ByteString as B

------------------------------------------------------------------------------
-- | Get an initial guess at document encoding from the byte order mark.  If
-- the mark doesn't exist, guess UTF-8.  Otherwise, guess according to the
-- mark.
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding ByteString
b
    | Int -> ByteString -> ByteString
B.take Int
3 ByteString
b forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ Word8
0xEF, Word8
0xBB, Word8
0xBF ] = (Encoding
UTF8,    Int -> ByteString -> ByteString
B.drop Int
3 ByteString
b)
    | Int -> ByteString -> ByteString
B.take Int
2 ByteString
b forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ Word8
0xFE, Word8
0xFF ]       = (Encoding
UTF16BE, Int -> ByteString -> ByteString
B.drop Int
2 ByteString
b)
    | Int -> ByteString -> ByteString
B.take Int
2 ByteString
b forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ Word8
0xFF, Word8
0xFE ]       = (Encoding
UTF16LE, Int -> ByteString -> ByteString
B.drop Int
2 ByteString
b)
    | Bool
otherwise                                 = (Encoding
UTF8,    ByteString
b)

------------------------------------------------------------------------------
parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a
parse :: forall a.
(Encoding -> Parser a) -> String -> ByteString -> Either String a
parse Encoding -> Parser a
p String
src ByteString
b = let (Encoding
e, ByteString
b') = ByteString -> (Encoding, ByteString)
guessEncoding ByteString
b
                    t :: Text
t       = Encoding -> ByteString -> Text
decoder Encoding
e ByteString
b'
                    bad :: Maybe Char
bad     = (Char -> Bool) -> Text -> Maybe Char
T.find (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isValidChar) Text
t
                in  if forall a. Maybe a -> Bool
isNothing Maybe Char
bad
                        then forall a. Parser a -> String -> Text -> Either String a
parseText (Encoding -> Parser a
p Encoding
e forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall s (m :: * -> *) t u.
(Stream s m t, Show t) =>
ParsecT s u m ()
P.eof) String
src Text
t
                        else forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ String
"Document contains invalid character:"
                                 forall a. [a] -> [a] -> [a]
++ String
" \\" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> String
show (Char -> Int
ord (forall a. HasCallStack => Maybe a -> a
fromJust Maybe Char
bad))


------------------------------------------------------------------------------
-- | Checks if a document contains invalid characters.
--
isValidChar :: Char -> Bool
isValidChar :: Char -> Bool
isValidChar Char
c | Char
c forall a. Ord a => a -> a -> Bool
< Char
'\x9'                     = Bool
False
              | Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xA'    Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\xD'     = Bool
False
              | Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xD'    Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\x20'    = Bool
False
              | Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xD7FF' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\xE000'  = Bool
False
              | Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xFFFD' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\x10000' = Bool
False
              | Bool
otherwise                     = Bool
True


------------------------------------------------------------------------------
-- | Parses a 'Text' value and gives back the result.  The parser is expected
-- to match the entire string.
parseText :: Parser a         -- ^ The parser to match
          -> String           -- ^ Name of the source file (can be @\"\"@)
          -> Text             -- ^ Text to parse
          -> Either String a  -- Either an error message or the result
parseText :: forall a. Parser a -> String -> Text -> Either String a
parseText Parser a
p String
src Text
t = forall a b c. (a -> b) -> Either a c -> Either b c
inLeft forall a. Show a => a -> String
show (forall s t a.
Stream s Identity t =>
Parsec s () a -> String -> s -> Either ParseError a
P.parse Parser a
p String
src Text
t)
  where inLeft :: (a -> b) -> Either a c -> Either b c
        inLeft :: forall a b c. (a -> b) -> Either a c -> Either b c
inLeft a -> b
f (Left a
x)  = forall a b. a -> Either a b
Left (a -> b
f a
x)
        inLeft a -> b
_ (Right c
x) = forall a b. b -> Either a b
Right c
x


------------------------------------------------------------------------------
-- | Consume input as long as the predicate returns 'True', and return the
-- consumed input.  This parser does not fail.  If it matches no input, it
-- will return an empty string.
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 Char -> Bool
p = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack forall a b. (a -> b) -> a -> b
$ forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m [a]
P.many forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy Char -> Bool
p


------------------------------------------------------------------------------
-- | Consume input as long as the predicate returns 'True', and return the
-- consumed input.  This parser requires the predicate to succeed on at least
-- one character of input.  It will fail if the first character fails the
-- predicate.
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 Char -> Bool
p = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) t u a.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m [a]
P.many1 forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy Char -> Bool
p


------------------------------------------------------------------------------
-- | The equivalent of Parsec's string combinator, but for text.  If there is
-- not a complete match, then no input is consumed.  This matches the behavior
-- of @string@ from the attoparsec-text package.
text :: Text -> Parser Text
text :: Text -> Parser Text
text Text
t = forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
String -> ParsecT s u m String
P.string (Text -> String
T.unpack Text
t) forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (m :: * -> *) a. Monad m => a -> m a
return Text
t


------------------------------------------------------------------------------
-- | Represents the state of a text scanner, for use with the 'scanText'
-- parser combinator.
data ScanState = ScanNext (Char -> ScanState)
               | ScanFinish
               | ScanFail String


------------------------------------------------------------------------------
-- | Scans text and progresses through a DFA, collecting the complete matching
-- text as it goes.
scanText :: (Char -> ScanState) -> Parser String
scanText :: (Char -> ScanState) -> ParsecT Text () Identity String
scanText Char -> ScanState
f = do
    forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ do
        Char
c <- forall s (m :: * -> *) u. Stream s m Char => ParsecT s u m Char
P.anyChar
        case Char -> ScanState
f Char
c of
            ScanNext Char -> ScanState
f'  -> (Char
cforall a. a -> [a] -> [a]
:) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` (Char -> ScanState) -> ParsecT Text () Identity String
scanText Char -> ScanState
f'
            ScanState
ScanFinish   -> forall (m :: * -> *) a. Monad m => a -> m a
return [Char
c]
            ScanFail String
err -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
err