{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Zenacy.HTML.Internal.Token
( Token(..)
, TokenBuffer(..)
, TAttr(..)
, tokenAttr
, tokenHasAttr
, tokenGetAttr
, tokenGetAttrVal
, tokenBuffer
, tokenCapacity
, tokenReset
, tokenTail
, tokenFirst
, tokenNext
, tokenCount
, tokenOffset
, tokenList
, tokenDrop
, tokenHasEOF
, tokenSlice
, tokenTagStartName
, tokenTagEndName
, tokenDoctypeType
, tokenTagStartType
, tokenTagEndType
, tokenCommentType
, tokenCharType
, tokenEOFType
, tokenDoctypeInit
, tokenDoctypeNameAppend
, tokenDoctypeSetForceQuirks
, tokenDoctypePublicIdInit
, tokenDoctypePublicIdAppend
, tokenDoctypeSystemIdInit
, tokenDoctypeSystemIdAppend
, tokenTagStartInit
, tokenTagStartSetSelfClosing
, tokenTagEndInit
, tokenTagNameAppend
, tokenAttrInit
, tokenAttrNameAppend
, tokenAttrValAppend
, tokenAttrNamePrune
, tokenCommentInit
, tokenCommentAppend
, tokenCharInit
, tokenEOFInit
, tokenType
, tokenSize
, tokenPack
) where
import Zenacy.HTML.Internal.BS
import Zenacy.HTML.Internal.Buffer
import Zenacy.HTML.Internal.Core
import Zenacy.HTML.Internal.Types
import Control.Monad
( when
, forM
)
import Control.Monad.Extra
( anyM
)
import Control.Monad.ST
( ST
)
import Data.STRef
( STRef
, newSTRef
, readSTRef
, writeSTRef
)
import Data.DList
( DList
)
import qualified Data.DList as D
( empty
, snoc
, toList
)
import Data.List
( find
, or
)
import Data.Maybe
( catMaybes
, isJust
)
import Data.Vector.Unboxed.Mutable
( MVector
)
import qualified Data.Vector.Unboxed.Mutable as U
( new
, length
, read
, write
, grow
)
import Data.Word
( Word8
)
data Token
= TDoctype
{ tDoctypeName :: !BS
, tDoctypeQuirks :: !Bool
, tDoctypePublic :: !(Maybe BS)
, tDoctypeSystem :: !(Maybe BS)
}
| TStart
{ tStartName :: !BS
, tStartClosed :: !Bool
, tStartAttr :: ![TAttr]
}
| TEnd
{ tEndName :: !BS
}
| TComment
{ tCommentData :: !BS
}
| TChar
{ tCharData :: !Word8
}
| TEOF
deriving (Eq, Ord, Show)
data TAttr = TAttr
{ tAttrName :: BS
, tAttrVal :: BS
, tAttrNamespace :: HTMLAttrNamespace
} deriving (Eq, Ord, Show)
data TokenBuffer s = TokenBuffer
{ tbCntl :: MVector s Int
, tbData :: MVector s Word8
}
tokenAttr :: BS -> BS -> TAttr
tokenAttr n v = TAttr n v HTMLAttrNamespaceNone
tokenHasAttr :: BS -> Token -> Bool
tokenHasAttr x t = isJust $ tokenGetAttr x t
tokenGetAttr :: BS -> Token -> Maybe TAttr
tokenGetAttr x = \case
TStart{..} -> find (\TAttr{..} -> tAttrName == x) tStartAttr
_otherwise -> Nothing
tokenGetAttrVal :: BS -> Token -> Maybe BS
tokenGetAttrVal x t = tAttrVal <$> tokenGetAttr x t
tokenBuffer :: ST s (STRef s (TokenBuffer s))
tokenBuffer = do
c <- U.new 100
d <- U.new 100
r <- newSTRef (TokenBuffer c d)
tokenReset r
pure r
tokenCapacity :: STRef s (TokenBuffer s) -> ST s (Int, Int)
tokenCapacity r = do
TokenBuffer{..} <- readSTRef r
pure (U.length tbCntl, U.length tbData)
headerSize :: Int
headerSize = 4
tokenReset :: STRef s (TokenBuffer s) -> ST s ()
tokenReset r = do
TokenBuffer{..} <- readSTRef r
U.write tbCntl 0 1
U.write tbCntl 1 0
U.write tbCntl 2 0
U.write tbCntl 3 0
U.write tbData 0 0
tokenTail :: STRef s (TokenBuffer s) -> ST s Int
tokenTail r = do
TokenBuffer{..} <- readSTRef r
U.read tbCntl 2
tokenFirst :: STRef s (TokenBuffer s) -> ST s Int
tokenFirst r = do
TokenBuffer{..} <- readSTRef r
h <- U.read tbCntl 1
U.write tbCntl 3 h
pure h
tokenNext :: STRef s (TokenBuffer s) -> ST s Int
tokenNext r = do
TokenBuffer{..} <- readSTRef r
t <- U.read tbCntl 2
i <- U.read tbCntl 3
n <- U.read tbCntl (i + 1)
if i == t
then pure 0
else do
let j = i + n
U.write tbCntl 3 j
pure j
tokenCount :: STRef s (TokenBuffer s) -> ST s Int
tokenCount r = tokenFold (\i n -> n + 1) 0 r
tokenOffset :: STRef s (TokenBuffer s) -> ST s [Int]
tokenOffset r = D.toList <$> tokenFold (flip D.snoc) D.empty r
tokenList :: STRef s (TokenBuffer s) -> ST s [Token]
tokenList r = tokenOffset r >>= mapM (flip tokenPack r)
tokenFold :: (Int -> a -> a) -> a -> STRef s (TokenBuffer s) -> ST s a
tokenFold f a r = do
TokenBuffer{..} <- readSTRef r
i <- U.read tbCntl 3
x <- go a tokenFirst
U.write tbCntl 3 i
pure x
where
go b g = do
i <- g r
if i == 0
then pure b
else go (f i b) tokenNext
tokenDrop :: STRef s (TokenBuffer s) -> ST s ()
tokenDrop r = do
TokenBuffer{..} <- readSTRef r
headPtr <- U.read tbCntl 1
tailPtr <- U.read tbCntl 2
when (tailPtr > 0) $ do
if tailPtr == headPtr
then tokenReset r
else go tbCntl tailPtr headPtr
where
go d t i = do
n <- U.read d (i + offsetSize)
if t == i + n
then U.write d 2 i
else go d t (i + n)
tokenHasEOF :: STRef s (TokenBuffer s) -> ST s Bool
tokenHasEOF r =
tokenOffset r >>= anyM isEOF
where
isEOF i = (== tokenEOFType) <$> tokenType i r
tokenSlice :: Int -> Int -> STRef s (TokenBuffer s) -> ST s [Word8]
tokenSlice offset len r = do
TokenBuffer{..} <- readSTRef r
go tbData offset len D.empty
where
go d i 0 a = pure $ D.toList a
go d i n a = do
w <- U.read d i
go d (i + 1) (n - 1) $ D.snoc a w
tokenTagStartName :: Int -> STRef s (TokenBuffer s) -> ST s (Maybe [Word8])
tokenTagStartName = tagNameIfType tokenTagStartType
tokenTagEndName :: Int -> STRef s (TokenBuffer s) -> ST s (Maybe [Word8])
tokenTagEndName = tagNameIfType tokenTagEndType
tagNameIfType :: Int -> Int -> STRef s (TokenBuffer s) -> ST s (Maybe [Word8])
tagNameIfType t x r = do
TokenBuffer{..} <- readSTRef r
a <- tokenType x r
if a == t
then do
o <- U.read tbCntl (x + 2)
n <- U.read tbCntl (x + 3)
Just <$> tokenSlice o n r
else
pure Nothing
tokenDoctypeType :: Int
tokenDoctypeType = 101
tokenTagStartType :: Int
tokenTagStartType = 102
tokenTagEndType :: Int
tokenTagEndType = 103
tokenCommentType :: Int
tokenCommentType = 104
tokenCharType :: Int
tokenCharType = 105
tokenEOFType :: Int
tokenEOFType = 106
tokenInit :: Int -> STRef s (TokenBuffer s) -> ST s (MVector s Int, Int)
tokenInit maxIndex r = do
t@TokenBuffer{..} <- readSTRef r
tailPtr <- U.read tbCntl 2
i <- if tailPtr == 0
then do
U.write tbCntl 1 headerSize
U.write tbCntl 2 headerSize
pure headerSize
else do
n <- U.read tbCntl (tailPtr + offsetSize)
let j = tailPtr + n
U.write tbCntl 2 j
pure j
c <- ensureCntl (i + maxIndex) t r
pure (c, i)
tokenDoctypeSize :: Int
tokenDoctypeSize = 11
tokenDoctypeInit :: STRef s (TokenBuffer s) -> ST s ()
tokenDoctypeInit r = do
(c, i) <- tokenInit 10 r
U.write c (i + 0) tokenDoctypeType
U.write c (i + 1) tokenDoctypeSize
U.write c (i + 2) 0
U.write c (i + 3) 0
U.write c (i + 4) 0
U.write c (i + 5) 0
U.write c (i + 6) 0
U.write c (i + 7) 0
U.write c (i + 8) 0
U.write c (i + 9) 0
U.write c (i + 10) 0
tokenDoctypeNameAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenDoctypeNameAppend = stringAppendTail 2
tokenDoctypeSetForceQuirks :: STRef s (TokenBuffer s) -> ST s ()
tokenDoctypeSetForceQuirks r = do
TokenBuffer{..} <- readSTRef r
i <- U.read tbCntl 2
U.write tbCntl (i + 4) 1
tokenDoctypePublicIdInit :: STRef s (TokenBuffer s) -> ST s ()
tokenDoctypePublicIdInit r = do
TokenBuffer{..} <- readSTRef r
i <- U.read tbCntl 2
U.write tbCntl (i + 5) 1
tokenDoctypePublicIdAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenDoctypePublicIdAppend = stringAppendTail 6
tokenDoctypeSystemIdInit :: STRef s (TokenBuffer s) -> ST s ()
tokenDoctypeSystemIdInit r = do
TokenBuffer{..} <- readSTRef r
i <- U.read tbCntl 2
U.write tbCntl (i + 8) 1
tokenDoctypeSystemIdAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenDoctypeSystemIdAppend = stringAppendTail 9
attrSize :: Int
attrSize = 4
attrStart :: Int
attrStart = 6
tokenTagStartSize :: Int
tokenTagStartSize = 6
tokenTagStartInit :: STRef s (TokenBuffer s) -> ST s ()
tokenTagStartInit r = do
(c, i) <- tokenInit 5 r
U.write c (i + 0) tokenTagStartType
U.write c (i + 1) tokenTagStartSize
U.write c (i + 2) 0
U.write c (i + 3) 0
U.write c (i + 4) 0
U.write c (i + 5) 0
tokenTagEndSize :: Int
tokenTagEndSize = 4
tokenTagEndInit :: STRef s (TokenBuffer s) -> ST s ()
tokenTagEndInit r = do
(c, i) <- tokenInit 3 r
U.write c (i + 0) tokenTagEndType
U.write c (i + 1) tokenTagEndSize
U.write c (i + 2) 0
U.write c (i + 3) 0
tokenTagStartSetSelfClosing :: STRef s (TokenBuffer s) -> ST s ()
tokenTagStartSetSelfClosing r = do
TokenBuffer{..} <- readSTRef r
i <- U.read tbCntl 2
U.write tbCntl (i + 4) 1
tokenTagNameAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenTagNameAppend = stringAppendTail 2
tokenAttrInit :: STRef s (TokenBuffer s) -> ST s ()
tokenAttrInit r = do
t@TokenBuffer{..} <- readSTRef r
tailPtr <- U.read tbCntl 2
m <- U.read tbCntl (tailPtr + 1)
n <- U.read tbCntl (tailPtr + 5)
let i = attrSize * n + attrStart + tailPtr
c <- ensureCntl (i + 3) t r
U.write c (i + 0) 0
U.write c (i + 1) 0
U.write c (i + 2) 0
U.write c (i + 3) 0
U.write c (tailPtr + 1) (m + 4)
U.write c (tailPtr + 5) (n + 1)
tokenAttrNameAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenAttrNameAppend x r = do
t@TokenBuffer{..} <- readSTRef r
tailPtr <- U.read tbCntl 2
n <- U.read tbCntl (tailPtr + 5)
let i = attrSize * (n - 1) + attrStart + tailPtr
stringAppend (i + 0) x t r
tokenAttrValAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenAttrValAppend x r = do
t@TokenBuffer{..} <- readSTRef r
tailPtr <- U.read tbCntl 2
n <- U.read tbCntl (tailPtr + 5)
let i = attrSize * (n - 1) + attrStart + tailPtr
stringAppend (i + 2) x t r
tokenAttrNamePrune :: Int -> STRef s (TokenBuffer s) -> ST s Bool
tokenAttrNamePrune x r = do
t@TokenBuffer{..} <- readSTRef r
a <- U.read tbCntl (x + offsetType)
if a /= tokenTagStartType
then pure False
else do
attrCount <- U.read tbCntl (x + 5)
u <- forM [1 .. attrCount] $ \j -> do
let i = (j - 1) * attrSize + attrStart + x
no0 <- U.read tbCntl (i + 0)
nc0 <- U.read tbCntl (i + 1)
match <- forM [1 .. j - 1] $ \k -> do
let m = (k - 1) * attrSize + attrStart + x
no1 <- U.read tbCntl (m + 0)
nc1 <- U.read tbCntl (m + 1)
s0 <- tokenSlice no0 nc0 r
s1 <- tokenSlice no1 nc1 r
pure $ nc0 == nc1 && s0 == s1
if or match
then do
U.write tbCntl (i + 0) 0
U.write tbCntl (i + 1) 0
pure True
else
pure False
pure $ or u
tokenCommentSize :: Int
tokenCommentSize = 4
tokenCommentInit :: STRef s (TokenBuffer s) -> ST s ()
tokenCommentInit r = do
(c, i) <- tokenInit 3 r
U.write c (i + 0) tokenCommentType
U.write c (i + 1) tokenCommentSize
U.write c (i + 2) 0
U.write c (i + 3) 0
tokenCommentAppend :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenCommentAppend = stringAppendTail 2
tokenCharSize :: Int
tokenCharSize = 3
tokenCharInit :: Word8 -> STRef s (TokenBuffer s) -> ST s ()
tokenCharInit x r = do
t <- readSTRef r
(c, i) <- tokenInit 2 r
U.write c (i + 0) tokenCharType
U.write c (i + 1) tokenCharSize
U.write c (i + 2) (fromIntegral x)
tokenEOFSize :: Int
tokenEOFSize = 2
tokenEOFInit :: STRef s (TokenBuffer s) -> ST s ()
tokenEOFInit r = do
(c, i) <- tokenInit 2 r
U.write c (i + 0) tokenEOFType
U.write c (i + 1) tokenEOFSize
offsetType :: Int
offsetType = 0
offsetSize :: Int
offsetSize = 1
tokenType :: Int -> STRef s (TokenBuffer s) -> ST s Int
tokenType x r = do
TokenBuffer{..} <- readSTRef r
U.read tbCntl (x + offsetType)
tokenSize :: Int -> STRef s (TokenBuffer s) -> ST s Int
tokenSize x r = do
TokenBuffer{..} <- readSTRef r
U.read tbCntl (x + offsetSize)
tokenPack :: Int -> STRef s (TokenBuffer s) -> ST s Token
tokenPack x r = do
TokenBuffer{..} <- readSTRef r
t <- U.read tbCntl x
n <- U.read tbCntl 0
s <- bufferString tbData n
if | t == tokenDoctypeType -> packDoctype x tbCntl s
| t == tokenTagStartType -> packTagStart x tbCntl s
| t == tokenTagEndType -> packTagEnd x tbCntl s
| t == tokenCommentType -> packComment x tbCntl s
| t == tokenCharType -> packChar x tbCntl
| t == tokenEOFType -> pure TEOF
| otherwise -> pure TEOF
packDoctype :: Int -> MVector s Int -> BS -> ST s Token
packDoctype x tbCntl bs = do
nameOffset <- U.read tbCntl (x + 2)
nameLen <- U.read tbCntl (x + 3)
quirks <- U.read tbCntl (x + 4)
pubExists <- U.read tbCntl (x + 5)
pubOffset <- U.read tbCntl (x + 6)
pubLen <- U.read tbCntl (x + 7)
sysExists <- U.read tbCntl (x + 8)
sysOffset <- U.read tbCntl (x + 9)
sysLen <- U.read tbCntl (x + 10)
pure $ TDoctype
(bsPart nameOffset nameLen bs)
(quirks == 1)
(if pubExists == 1 then Just (bsPart pubOffset pubLen bs) else Nothing)
(if sysExists == 1 then Just (bsPart sysOffset sysLen bs) else Nothing)
packTagStart :: Int -> MVector s Int -> BS -> ST s Token
packTagStart x tbCntl bs = do
nameOffset <- U.read tbCntl (x + 2)
nameLen <- U.read tbCntl (x + 3)
selfClose <- U.read tbCntl (x + 4)
attrCount <- U.read tbCntl (x + 5)
attr <- forM [1 .. attrCount] $ \j -> do
let i = (j - 1) * attrSize + attrStart + x
no <- U.read tbCntl (i + 0)
nc <- U.read tbCntl (i + 1)
ao <- U.read tbCntl (i + 2)
ac <- U.read tbCntl (i + 3)
pure $
if no > 0
then Just $ TAttr
(bsPart no nc bs)
(bsPart ao ac bs)
HTMLAttrNamespaceNone
else Nothing
pure $ TStart
(bsPart nameOffset nameLen bs)
(selfClose == 1)
(catMaybes attr)
packTagEnd :: Int -> MVector s Int -> BS -> ST s Token
packTagEnd x tbCntl bs = do
nameOffset <- U.read tbCntl (x + 2)
nameLen <- U.read tbCntl (x + 3)
pure $ TEnd $ bsPart nameOffset nameLen bs
packComment :: Int -> MVector s Int -> BS -> ST s Token
packComment x tbCntl bs = do
o <- U.read tbCntl (x + 2)
n <- U.read tbCntl (x + 3)
pure $ TComment $ bsPart o n bs
packChar :: Int -> MVector s Int -> ST s Token
packChar x tbCntl = do
c <- U.read tbCntl (x + 2)
pure $ TChar $ fromIntegral c
stringAppendTail :: Int -> Word8 -> STRef s (TokenBuffer s) -> ST s ()
stringAppendTail offset word r = do
t@TokenBuffer{..} <- readSTRef r
tailPtr <- U.read tbCntl 2
stringAppend (tailPtr + offset) word t r
{-# INLINE stringAppendTail #-}
stringAppend :: Int -> Word8 -> TokenBuffer s -> STRef s (TokenBuffer s) -> ST s ()
stringAppend x word t@TokenBuffer{..} r = do
let y = x + 1
i <- U.read tbCntl 0
o <- U.read tbCntl x
n <- U.read tbCntl y
d <- ensureData i t r
U.write d i word
U.write tbCntl 0 (i + 1)
U.write tbCntl y (n + 1)
when (o == 0) $
U.write tbCntl x i
{-# INLINE stringAppend #-}
ensureCntl :: Int -> TokenBuffer s -> STRef s (TokenBuffer s) -> ST s (MVector s Int)
ensureCntl x TokenBuffer{..} r
| x < U.length tbCntl =
pure tbCntl
| otherwise = do
v <- U.grow tbCntl $ U.length tbCntl
writeSTRef r $ TokenBuffer v tbData
pure v
{-# INLINE ensureCntl #-}
ensureData :: Int -> TokenBuffer s -> STRef s (TokenBuffer s) -> ST s (MVector s Word8)
ensureData x TokenBuffer{..} r
| x < U.length tbData =
pure tbData
| otherwise = do
v <- U.grow tbData $ U.length tbData
writeSTRef r $ TokenBuffer tbCntl v
pure v
{-# INLINE ensureData #-}