{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Text.XmlHtml.TextParser
( guessEncoding
, parse
, isValidChar
, parseText
, takeWhile0
, takeWhile1
, text
, scanText
, ScanState(..)
, module Text.Parsec.Text
) where
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif
import Data.Char
import Data.Maybe
import Text.XmlHtml.Common
import Data.Text (Text)
import qualified Data.Text as T
import qualified Text.Parsec as P
import Text.Parsec.Text
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding ByteString
b
| Int -> ByteString -> ByteString
B.take Int
3 ByteString
b forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ Word8
0xEF, Word8
0xBB, Word8
0xBF ] = (Encoding
UTF8, Int -> ByteString -> ByteString
B.drop Int
3 ByteString
b)
| Int -> ByteString -> ByteString
B.take Int
2 ByteString
b forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ Word8
0xFE, Word8
0xFF ] = (Encoding
UTF16BE, Int -> ByteString -> ByteString
B.drop Int
2 ByteString
b)
| Int -> ByteString -> ByteString
B.take Int
2 ByteString
b forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ Word8
0xFF, Word8
0xFE ] = (Encoding
UTF16LE, Int -> ByteString -> ByteString
B.drop Int
2 ByteString
b)
| Bool
otherwise = (Encoding
UTF8, ByteString
b)
parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a
parse :: forall a.
(Encoding -> Parser a) -> String -> ByteString -> Either String a
parse Encoding -> Parser a
p String
src ByteString
b = let (Encoding
e, ByteString
b') = ByteString -> (Encoding, ByteString)
guessEncoding ByteString
b
t :: Text
t = Encoding -> ByteString -> Text
decoder Encoding
e ByteString
b'
bad :: Maybe Char
bad = (Char -> Bool) -> Text -> Maybe Char
T.find (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isValidChar) Text
t
in if forall a. Maybe a -> Bool
isNothing Maybe Char
bad
then forall a. Parser a -> String -> Text -> Either String a
parseText (Encoding -> Parser a
p Encoding
e forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall s (m :: * -> *) t u.
(Stream s m t, Show t) =>
ParsecT s u m ()
P.eof) String
src Text
t
else forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ String
"Document contains invalid character:"
forall a. [a] -> [a] -> [a]
++ String
" \\" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> String
show (Char -> Int
ord (forall a. HasCallStack => Maybe a -> a
fromJust Maybe Char
bad))
isValidChar :: Char -> Bool
isValidChar :: Char -> Bool
isValidChar Char
c | Char
c forall a. Ord a => a -> a -> Bool
< Char
'\x9' = Bool
False
| Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xA' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\xD' = Bool
False
| Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xD' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\x20' = Bool
False
| Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xD7FF' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\xE000' = Bool
False
| Char
c forall a. Ord a => a -> a -> Bool
> Char
'\xFFFD' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
< Char
'\x10000' = Bool
False
| Bool
otherwise = Bool
True
parseText :: Parser a
-> String
-> Text
-> Either String a
parseText :: forall a. Parser a -> String -> Text -> Either String a
parseText Parser a
p String
src Text
t = forall a b c. (a -> b) -> Either a c -> Either b c
inLeft forall a. Show a => a -> String
show (forall s t a.
Stream s Identity t =>
Parsec s () a -> String -> s -> Either ParseError a
P.parse Parser a
p String
src Text
t)
where inLeft :: (a -> b) -> Either a c -> Either b c
inLeft :: forall a b c. (a -> b) -> Either a c -> Either b c
inLeft a -> b
f (Left a
x) = forall a b. a -> Either a b
Left (a -> b
f a
x)
inLeft a -> b
_ (Right c
x) = forall a b. b -> Either a b
Right c
x
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 Char -> Bool
p = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack forall a b. (a -> b) -> a -> b
$ forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m [a]
P.many forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy Char -> Bool
p
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 Char -> Bool
p = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) t u a.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m [a]
P.many1 forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy Char -> Bool
p
text :: Text -> Parser Text
text :: Text -> Parser Text
text Text
t = forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
String -> ParsecT s u m String
P.string (Text -> String
T.unpack Text
t) forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (m :: * -> *) a. Monad m => a -> m a
return Text
t
data ScanState = ScanNext (Char -> ScanState)
| ScanFinish
| ScanFail String
scanText :: (Char -> ScanState) -> Parser String
scanText :: (Char -> ScanState) -> ParsecT Text () Identity String
scanText Char -> ScanState
f = do
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ do
Char
c <- forall s (m :: * -> *) u. Stream s m Char => ParsecT s u m Char
P.anyChar
case Char -> ScanState
f Char
c of
ScanNext Char -> ScanState
f' -> (Char
cforall a. a -> [a] -> [a]
:) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` (Char -> ScanState) -> ParsecT Text () Identity String
scanText Char -> ScanState
f'
ScanState
ScanFinish -> forall (m :: * -> *) a. Monad m => a -> m a
return [Char
c]
ScanFail String
err -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
err