{-# LANGUAGE CPP #-}

{-|
Description:    Clean a 'Char' stream by normalizing newlines and warning about control characters.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

To simplify the tokenization parsers, the many representations of line breaks
are unified into a single, Unix-style @\\n@.  While we're iterating over the
input, and before some of the special characters are replaced, it's also a good
time to trigger the warnings for unexpected characters
('ControlCharacterInInputStream', 'SurrogateInInputStream', and
'NoncharacterInInputStream').
-}
module Web.Mangrove.Parse.Encoding.Preprocess
    ( preprocess
    , preprocessStep
      -- * Initialization
    , Encoding ( .. )
    , DecoderState
    , initialDecoderState
    ) where


import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Tuple.HT as U.HT

#if MIN_VERSION_base(4,11,0)
#else
import Data.Semigroup ( (<>) )
#endif

import Web.Mangrove.Parse.Common.Error
import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser.Util


-- | __Encoding:__
--      @[preprocessing the input stream]
--      (https://encoding.spec.whatwg.org/#preprocessing-the-input-stream)@
-- 
-- Given a character encoding scheme, transform a dependant 'BS.ByteString'
-- into portable 'Char's.  If any byte sequences are meaningless or illegal,
-- they are replaced with the Unicode replacement character @\\xFFFD@.  All
-- newlines are normallized to a single @\\n@ 'Char', and Unicode control
-- characters, surrogate characters, and non-characters are marked with the
-- proper errors.
-- 
-- See 'preprocessStep' to operate over only a minimal section.
preprocess :: DecoderState -> BS.ByteString -> ([([ParseError], Char)], DecoderState)
preprocess :: DecoderState
-> ByteString -> ([([ParseError], Char)], DecoderState)
preprocess DecoderState
state = ([Either ShortByteString String] -> [([ParseError], Char)])
-> ([Either ShortByteString String], DecoderState)
-> ([([ParseError], Char)], DecoderState)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first ([([ParseError], Char)] -> [([ParseError], Char)]
forall a. Semigroup a => [(a, Char)] -> [(a, Char)]
normalize ([([ParseError], Char)] -> [([ParseError], Char)])
-> ([Either ShortByteString String] -> [([ParseError], Char)])
-> [Either ShortByteString String]
-> [([ParseError], Char)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([ParseError], Char) -> ([ParseError], Char))
-> [([ParseError], Char)] -> [([ParseError], Char)]
forall a b. (a -> b) -> [a] -> [b]
map ([ParseError], Char) -> ([ParseError], Char)
charError ([([ParseError], Char)] -> [([ParseError], Char)])
-> ([Either ShortByteString String] -> [([ParseError], Char)])
-> [Either ShortByteString String]
-> [([ParseError], Char)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either ShortByteString String -> [([ParseError], Char)])
-> [Either ShortByteString String] -> [([ParseError], Char)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Either ShortByteString String -> [([ParseError], Char)]
flatten) (([Either ShortByteString String], DecoderState)
 -> ([([ParseError], Char)], DecoderState))
-> (ByteString -> ([Either ShortByteString String], DecoderState))
-> ByteString
-> ([([ParseError], Char)], DecoderState)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState
-> ByteString -> ([Either ShortByteString String], DecoderState)
decode DecoderState
state
  where normalize :: [(a, Char)] -> [(a, Char)]
normalize ((a
err1, Char
'\r'):(a
err2, Char
'\n'):[(a, Char)]
cs) = (a
err1 a -> a -> a
forall a. Semigroup a => a -> a -> a
<> a
err2, Char
'\n') (a, Char) -> [(a, Char)] -> [(a, Char)]
forall a. a -> [a] -> [a]
: [(a, Char)] -> [(a, Char)]
normalize [(a, Char)]
cs
        normalize ((a
err, Char
'\r'):[(a, Char)]
cs) = (a
err, Char
'\n') (a, Char) -> [(a, Char)] -> [(a, Char)]
forall a. a -> [a] -> [a]
: [(a, Char)] -> [(a, Char)]
normalize [(a, Char)]
cs
        normalize ((a, Char)
c:[(a, Char)]
cs) = (a, Char)
c (a, Char) -> [(a, Char)] -> [(a, Char)]
forall a. a -> [a] -> [a]
: [(a, Char)] -> [(a, Char)]
normalize [(a, Char)]
cs
        normalize [] = []

-- | __Encoding:__
--      @[preprocessing the input stream]
--      (https://encoding.spec.whatwg.org/#preprocessing-the-input-stream)@
-- 
-- Read the smallest number of bytes from the head of the 'BS.ByteString'
-- which would leave the decoder in a re-enterable state.  Any byte
-- sequences which are meaningless or illegal are replaced with the Unicode
-- replacement character @\\xFFFD@.  All newlines are normallized to a single
-- @\\n@ 'Char', and Unicode control characters, surrogate characters, and
-- non-characters are marked with the proper errors.
-- 
-- See 'preprocess' to operate over the entire string at once.
preprocessStep
    :: DecoderState
    -> BS.ByteString
    -> ([([ParseError], Char)], DecoderState, BS.ByteString)
preprocessStep :: DecoderState
-> ByteString -> ([([ParseError], Char)], DecoderState, ByteString)
preprocessStep DecoderState
state ByteString
stream = ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
normalize' (([([ParseError], Char)], DecoderState, ByteString)
 -> ([([ParseError], Char)], DecoderState, ByteString))
-> ((Maybe (Either ShortByteString String), DecoderState,
     ByteString)
    -> ([([ParseError], Char)], DecoderState, ByteString))
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Maybe (Either ShortByteString String), DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall b c.
(Maybe (Either ShortByteString String), b, c)
-> ([([ParseError], Char)], b, c)
flatten' ((Maybe (Either ShortByteString String), DecoderState, ByteString)
 -> ([([ParseError], Char)], DecoderState, ByteString))
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a b. (a -> b) -> a -> b
$ DecoderState
-> ByteString
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
decodeStep DecoderState
state ByteString
stream
  where flatten' :: (Maybe (Either ShortByteString String), b, c)
-> ([([ParseError], Char)], b, c)
flatten' = (Maybe (Either ShortByteString String) -> [([ParseError], Char)])
-> (Maybe (Either ShortByteString String), b, c)
-> ([([ParseError], Char)], b, c)
forall a d b c. (a -> d) -> (a, b, c) -> (d, b, c)
U.HT.mapFst3 ([([ParseError], Char)]
-> (Either ShortByteString String -> [([ParseError], Char)])
-> Maybe (Either ShortByteString String)
-> [([ParseError], Char)]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [] ((Either ShortByteString String -> [([ParseError], Char)])
 -> Maybe (Either ShortByteString String) -> [([ParseError], Char)])
-> (Either ShortByteString String -> [([ParseError], Char)])
-> Maybe (Either ShortByteString String)
-> [([ParseError], Char)]
forall a b. (a -> b) -> a -> b
$ (([ParseError], Char) -> ([ParseError], Char))
-> [([ParseError], Char)] -> [([ParseError], Char)]
forall a b. (a -> b) -> [a] -> [b]
map ([ParseError], Char) -> ([ParseError], Char)
charError ([([ParseError], Char)] -> [([ParseError], Char)])
-> (Either ShortByteString String -> [([ParseError], Char)])
-> Either ShortByteString String
-> [([ParseError], Char)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either ShortByteString String -> [([ParseError], Char)]
flatten)
        normalize' :: ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
normalize' c' :: ([([ParseError], Char)], DecoderState, ByteString)
c'@([], DecoderState
_, ByteString
_) = ([([ParseError], Char)], DecoderState, ByteString)
c'
        normalize' ([([ParseError]
errs, Char
'\r')], DecoderState
state', ByteString
stream') = case (Maybe (Either ShortByteString String), DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall b c.
(Maybe (Either ShortByteString String), b, c)
-> ([([ParseError], Char)], b, c)
flatten' ((Maybe (Either ShortByteString String), DecoderState, ByteString)
 -> ([([ParseError], Char)], DecoderState, ByteString))
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a b. (a -> b) -> a -> b
$ DecoderState
-> ByteString
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
decodeStep DecoderState
state' ByteString
stream' of
            (([ParseError]
errs', Char
'\n'):[([ParseError], Char)]
cs, DecoderState
state'', ByteString
stream'') ->
                ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
normalize' (([ParseError]
errs [ParseError] -> [ParseError] -> [ParseError]
forall a. [a] -> [a] -> [a]
++ [ParseError]
errs', Char
'\n') ([ParseError], Char)
-> [([ParseError], Char)] -> [([ParseError], Char)]
forall a. a -> [a] -> [a]
: [([ParseError], Char)]
cs, DecoderState
state'', ByteString
stream'')
            ([([ParseError], Char)], DecoderState, ByteString)
_ -> ([([ParseError]
errs, Char
'\n')], DecoderState
state', ByteString
stream')
        normalize' (([ParseError]
errs, Char
'\r'):([ParseError]
errs', Char
'\n'):[([ParseError], Char)]
cs, DecoderState
state', ByteString
stream') =
            ([([ParseError], Char)] -> [([ParseError], Char)])
-> ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a d b c. (a -> d) -> (a, b, c) -> (d, b, c)
U.HT.mapFst3 (([ParseError]
errs [ParseError] -> [ParseError] -> [ParseError]
forall a. [a] -> [a] -> [a]
++ [ParseError]
errs', Char
'\n') ([ParseError], Char)
-> [([ParseError], Char)] -> [([ParseError], Char)]
forall a. a -> [a] -> [a]
:) (([([ParseError], Char)], DecoderState, ByteString)
 -> ([([ParseError], Char)], DecoderState, ByteString))
-> ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a b. (a -> b) -> a -> b
$ ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
normalize' ([([ParseError], Char)]
cs, DecoderState
state', ByteString
stream')
        normalize' (([ParseError]
errs, Char
'\r'):[([ParseError], Char)]
cs, DecoderState
state', ByteString
stream') =
            ([([ParseError], Char)] -> [([ParseError], Char)])
-> ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a d b c. (a -> d) -> (a, b, c) -> (d, b, c)
U.HT.mapFst3 (([ParseError]
errs, Char
'\n') ([ParseError], Char)
-> [([ParseError], Char)] -> [([ParseError], Char)]
forall a. a -> [a] -> [a]
:) (([([ParseError], Char)], DecoderState, ByteString)
 -> ([([ParseError], Char)], DecoderState, ByteString))
-> ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a b. (a -> b) -> a -> b
$ ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
normalize' ([([ParseError], Char)]
cs, DecoderState
state', ByteString
stream')
        normalize' (([ParseError], Char)
c:[([ParseError], Char)]
cs, DecoderState
state', ByteString
stream') =
            ([([ParseError], Char)] -> [([ParseError], Char)])
-> ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a d b c. (a -> d) -> (a, b, c) -> (d, b, c)
U.HT.mapFst3 (([ParseError], Char)
c ([ParseError], Char)
-> [([ParseError], Char)] -> [([ParseError], Char)]
forall a. a -> [a] -> [a]
:) (([([ParseError], Char)], DecoderState, ByteString)
 -> ([([ParseError], Char)], DecoderState, ByteString))
-> ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
forall a b. (a -> b) -> a -> b
$ ([([ParseError], Char)], DecoderState, ByteString)
-> ([([ParseError], Char)], DecoderState, ByteString)
normalize' ([([ParseError], Char)]
cs, DecoderState
state', ByteString
stream')


-- | Add a 'ControlCharacterInInputStream', 'SurrogateInInputStream', or
-- 'NoncharacterInInputStream' error to the relevant characters.
charError :: ([ParseError], Char) -> ([ParseError], Char)
charError :: ([ParseError], Char) -> ([ParseError], Char)
charError c' :: ([ParseError], Char)
c'@([ParseError]
_, Char
c)
    | Char -> Char -> Char -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Char
'\xD800' Char
'\xDFFF' Char
c = ParseError -> ([ParseError], Char)
addErr ParseError
SurrogateInInputStream
    | Char -> Char -> Char -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Char
'\xFDD0' Char
'\xFDEF' Char
c = ParseError -> ([ParseError], Char)
addErr ParseError
NoncharacterInInputStream
    | Bool
noncharacter = ParseError -> ([ParseError], Char)
addErr ParseError
NoncharacterInInputStream
    | Char -> Char -> Char -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Char
'\SOH' Char
'\US' Char
c Bool -> Bool -> Bool
&& Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
notElem Char
c String
"\t\n\f\r" = ParseError -> ([ParseError], Char)
addErr ParseError
ControlCharacterInInputStream
    | Char -> Char -> Char -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Char
'\DEL' Char
'\x9F' Char
c = ParseError -> ([ParseError], Char)
addErr ParseError
ControlCharacterInInputStream
    | Bool
otherwise = ([ParseError], Char)
c'
  where noncharacter :: Bool
noncharacter = case Int -> Int -> Int
forall a. Integral a => a -> a -> a
mod (Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
c) Int
0x10000 of
            Int
0xFFFE -> Bool
True
            Int
0xFFFF -> Bool
True
            Int
_ -> Bool
False
        -- If this function could ever be applied twice, any errors would be
        -- duplicated.  As it's not exported from the module, that shouldn't be
        -- an issue.
        addErr :: ParseError -> ([ParseError], Char)
addErr ParseError
err = ([ParseError] -> [ParseError])
-> ([ParseError], Char) -> ([ParseError], Char)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first (ParseError
err ParseError -> [ParseError] -> [ParseError]
forall a. a -> [a] -> [a]
:) ([ParseError], Char)
c'

-- | Rewrap the split type containing a list into a list at the top level.  Any
-- 'Left' errors are replaced by @[('InvalidByteSequence' /bytes/, '\\xFFFD')]@.
flatten :: Either BS.SH.ShortByteString String -> [([ParseError], Char)]
flatten :: Either ShortByteString String -> [([ParseError], Char)]
flatten = (ShortByteString -> [([ParseError], Char)])
-> (String -> [([ParseError], Char)])
-> Either ShortByteString String
-> [([ParseError], Char)]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (\ShortByteString
err -> [([ShortByteString -> ParseError
InvalidByteSequence ShortByteString
err], Char
replacementChar)]) ((Char -> ([ParseError], Char)) -> String -> [([ParseError], Char)]
forall a b. (a -> b) -> [a] -> [b]
map ((Char -> ([ParseError], Char))
 -> String -> [([ParseError], Char)])
-> (Char -> ([ParseError], Char))
-> String
-> [([ParseError], Char)]
forall a b. (a -> b) -> a -> b
$ \Char
c -> ([], Char
c))