{-# OPTIONS_GHC -Wall #-}
module ALON.Diff.HTML
( ChunkTree(..)
, DiffToken(..)
, DiffWithPatchRefs(..)
, buildChunkTreeFromHTML
, buildChunkTreeFromHTMLLazy
, buildChunkTree
, mapChunkTree
, diffHTML
, diffHTMLLazy
, diffChunkTree
, patchHTML
, patchHTMLLazy
, patchTreeToTokens
, getTreeTokens
, serializeDiffTokens
, normalizeLineEndings
, normalizeHTMLTokens
, getPatchRefs
, getPatchRefsAsDiffTokens
, diffHTMLWithPatchRefs
, diffHTMLWithPatchRefsLazy
) where
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Crypto.Hash.MD5 as MD5
import Data.Text.Encoding (encodeUtf8, decodeUtf8)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as B (length)
import qualified Data.List as L
import qualified Data.Map.Strict as Map
import Text.HTML.Parser
import Data.Maybe (maybeToList)
import Data.Aeson
import Data.ByteString.Base64 (encodeBase64')
import qualified Data.Bytes.Put as P
import qualified Data.Bytes.Serial as S
import qualified Data.Bytes.VarInt as V
import Data.Bits
import Data.Word8
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
-- | A node in the content-defined chunk tree built from normalized HTML tokens.
data ChunkTree =
  ChunkTree
  { fullNodeHash :: !(MD5.Ctx)   -- ^ Running MD5 context covering this node's token and every branch added so far.
  , nodeToken :: (Maybe Token)   -- ^ The HTML token for this node; Nothing for the root and for "ghost" aggregate nodes.
  , fullNodeLength :: !Int       -- ^ Estimated serialized length of this node plus its descendants.
  , branches :: [ChunkTree]      -- ^ Child nodes, in document order.
  }
-- | One instruction in a diff patch stream.
data DiffToken =
  DiffChunkReference !ByteString   -- ^ Digest of a chunk already present in the source tree; reused wholesale.
  | DiffContent !T.Text            -- ^ Literal text-node content.
  | DiffOpenTag !T.Text            -- ^ Open a tag; its attributes follow as DiffAttribute tokens.
  | DiffAttribute !T.Text !T.Text  -- ^ Attribute name/value for the preceding open or solo tag.
  | DiffCloseTag                   -- ^ Close the most recently opened tag.
  | DiffSoloTag !T.Text            -- ^ Self-closing tag; its attributes follow as DiffAttribute tokens.
  | DiffDoctype !T.Text            -- ^ Doctype declaration.
  deriving (Show)
-- | A diff together with, for each chunk reference it contains, the expanded
-- diff tokens of the referenced subtree (Nothing when the digest is unknown).
data DiffWithPatchRefs = DiffWithPatchRefs [DiffToken] [(ByteString, Maybe [DiffToken])]
-- Compact JSON serialization of a diff token:
--   chunk reference -> {"r": base64 digest}
--   content         -> bare string (HTML entities decoded)
--   open tag        -> ["name"]
--   attribute       -> ["name", "value"]
--   close tag       -> []
--   solo tag        -> ["name", []]
--   doctype         -> {"d": text}
instance ToJSON DiffToken where
  toJSON (DiffChunkReference refValue) = object [ T.pack "r" .= (decodeUtf8 $ encodeBase64' refValue) ]
  toJSON (DiffContent text) = String $ descapeHTMLText text
  toJSON (DiffOpenTag text) = toJSON [ String text ]
  toJSON (DiffAttribute name value) = toJSON [ String name, String value ]
  toJSON (DiffCloseTag) = toJSON ([]::[Value])
  toJSON (DiffSoloTag text) = toJSON [ String text, toJSON ([]::[Value])]
  toJSON (DiffDoctype text) = object [ T.pack "d" .= text ]
-- Debug rendering of a chunk tree: base64 digest, token, length, branches.
instance Show ChunkTree where
  show tree =
    concat
      [ "(ChunkTree "
      , show (decodeUtf8 $ encodeBase64' (treeDigest tree))
      , " ("
      , show (nodeToken tree)
      , ") "
      , show (fullNodeLength tree)
      , " "
      , show (branches tree)
      , ")"
      ]
-- Note, this should be calculated based off the hash in future.
-- Minimum serialized size a chunk must exceed before referencing it by
-- digest pays off; the constant depends on the delta encoding in use.
refMaxSize :: Int
--refMaxSize = 17 -- comment this in for binary based deltas
refMaxSize = 34 -- comment this in for JSON deltas
-- Per-attribute serialization overhead (in characters) used when estimating
-- node sizes; the constant depends on the delta encoding in use.
attributeOverhead :: Int
-- attributeOverhead = 2 -- comment this in for binary based deltas
attributeOverhead = 8 -- comment this in for JSON deltas
-- Transform HTML escapes/entities in text node text into their decoded
-- (plain text) form.
descapeHTMLText :: T.Text -> T.Text
descapeHTMLText = TL.toStrict . toLazyText . htmlEncodedText
-- Start a (potentially sub tree) state from a token.
-- hashPrefix namespaces the digest by token kind ("tag-", "content-",
-- "doctype-"); nodeText is the canonical text hashed for the node; the
-- attributes are folded into both the hash and the length estimate.
initialTreeState :: String -> T.Text -> [Attr] -> Token -> ChunkTree
initialTreeState hashPrefix nodeText attributes inputToken =
  ChunkTree nodeHashContextWithAttributes ( Just inputToken ) ( tokenSizeOverhead + ( T.length nodeText ) + attributesLength ) []
  where
    -- Per-token serialization overhead used in the length estimate; these
    -- constants match the JSON delta encoding (the commented set matches
    -- the binary encoding).
    tokenSizeOverhead =
      case inputToken of
        {- (TagOpen _ _) -> 2 -- Comment this set in for more optimal binary based deltas.
        (TagSelfClose _ _) -> 1
        (TagClose _ ) -> 0
        (ContentText _ ) -> 1
        (ContentChar _ ) -> 1
        (Doctype _) -> 1
        (Comment _) -> 0 --}
        (TagOpen _ _) -> 8 -- Comment this set in for more optimal JSON based deltas
        (TagSelfClose _ _) -> 8
        (TagClose _ ) -> 0
        (ContentText _ ) -> 3
        (ContentChar _ ) -> 3
        (Doctype _) -> 8
        (Comment _) -> 0
    -- Attributes are hashed in name-sorted order so attribute ordering in
    -- the input does not change the digest.
    attributeOrder (Attr name1 _) (Attr name2 _ ) = compare name1 name2
    nodeHashContext = MD5.update (MD5.init) $ encodeUtf8 $ ( T.pack hashPrefix ) `mappend` nodeText
    sortedAttributes = L.sortBy attributeOrder attributes
    nodeHashContextWithAttributes = L.foldl' hashAttribute nodeHashContext sortedAttributes
    hashAttribute context (Attr name value) = MD5.updates context $ fmap encodeUtf8 [T.pack " attr-", name, T.pack "=", value]
    -- Length estimate adds the per-attribute serialization overhead.
    attributeLength previousLength (Attr name value) = previousLength + attributeOverhead + T.length name + T.length value
    attributesLength = L.foldl' attributeLength 0 attributes
-- Append a child chunk to a parent chunk-tree node, folding the child's
-- digest into the parent's running hash and accumulating its length.
addBranchChunk :: ChunkTree -> ChunkTree -> ChunkTree
addBranchChunk parent child =
  parent
    { fullNodeHash = MD5.update (fullNodeHash parent) (treeDigest child)
    , fullNodeLength = fullNodeLength parent + fullNodeLength child
    , branches = branches parent ++ [child]
    }
-- HTML void elements: they never take a closing tag, and normalization
-- converts their open tags to self-closing tags.
voidTokens :: [T.Text]
voidTokens =
  fmap T.pack
    [ "area", "base", "br", "col", "command", "embed", "hr", "img"
    , "input", "keygen", "link", "meta", "param", "source", "track", "wbr" ]
-- Elements whose open or close tag implicitly terminates an open <p>.
paragraphEndTokens :: [T.Text]
paragraphEndTokens =
  fmap T.pack
    [ "p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address"
    , "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript"
    , "table", "body" ]
-- Given a token, and the current chunk-tree state stack, output a modified
-- state stack. The stack holds one ChunkTree per currently-open element
-- (plus the synthetic root); Nothing signals malformed HTML.
buildChunkNode :: Token -> [ChunkTree] -> Maybe [ChunkTree]
-- An open tag pushes a fresh sub-tree state onto the stack.
buildChunkNode ( TagOpen tagName attributes ) stateStack =
  Just ( ( initialTreeState "tag-" (T.toLower tagName) attributes $ TagOpen tagName attributes ) : stateStack )
-- A self-closing tag becomes a leaf branch of the current node.
buildChunkNode ( TagSelfClose tagName attributes ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "tag-" (T.toLower tagName) attributes $ TagSelfClose tagName attributes
-- A close tag pops the stack, attaching the finished node to its parent.
buildChunkNode ( TagClose tagName ) (state:deepState) = do
  initialTag <- nodeToken state
  tagValue <- openedTag initialTag
  if ( tagValue == tagName ) then addChild deepState else Nothing -- Check the HTML is well formed; note in normalization void tokens are changed to self-closing.
  where
    -- Note, add child also does a content defined aggregation of the node.
    addChild (parentState:deeperState) = Just ( ( addBranchChunk parentState $ contentAggregateChunkTreeNode state ) : deeperState )
    addChild [] = Nothing
    openedTag (TagOpen openTagName _) = Just openTagName
    openedTag _ = Nothing
-- Text content becomes a leaf branch; entities are decoded before hashing.
buildChunkNode ( ContentText value ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "content-" (descapeHTMLText value) [] ( ContentText value )
buildChunkNode ( ContentChar value ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "content-" (T.singleton value) [] ( ContentChar value )
-- Doctype is hashed in stripped, lower-case form.
buildChunkNode ( Doctype value ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "doctype-" (T.strip $ T.toLower value) [] ( Doctype value )
buildChunkNode ( Comment _ ) stateStack = Just stateStack -- Comments aren't included in the output
-- Any other token against an empty stack means malformed input.
buildChunkNode _ [] = Nothing
-- Just the sole element of a singleton list; Nothing for any other length.
oneItemOnly :: [a] -> Maybe a
oneItemOnly [item] = Just item
oneItemOnly _ = Nothing
-- Normalize line endings of HTML: CRLF pairs and lone CRs both become LF.
normalizeLineEndings :: T.Text -> T.Text
normalizeLineEndings =
  T.replace (T.pack "\r") (T.pack "\n")
    . T.replace (T.pack "\r\n") (T.pack "\n")
-- Note, we only handle paragraph elements at the moment, but should probably add support for other
-- elements that don't need to be closed correctly.
-- Inserts the implicit </p> close tags a browser would infer, so the tree
-- we build matches the browser's DOM. Fold state is (output tokens in
-- reverse, whether a <p> is currently open, tag nesting depth inside it).
normalizeParagraphElements :: [Token] -> [Token]
normalizeParagraphElements tokens =
  let
    processToken :: ([Token], Bool, Int) -> Token -> ([Token], Bool, Int)
    -- No <p> currently open.
    processToken (previousTokens, False, _) inputToken =
      case inputToken of
        (TagOpen tagName _) | T.toLower tagName == T.pack "p" ->
          (inputToken:previousTokens, True, 1)
        -- A stray </p> with no open <p> is dropped.
        (TagClose tagName) | T.toLower tagName == T.pack "p" ->
          (previousTokens, False, 0)
        _ ->
          (inputToken:previousTokens, False, 0)
    -- Inside an open <p>, nested pDepth tags deep.
    processToken (previousTokens, True, pDepth) inputToken =
      case (inputToken, pDepth) of
        -- A new <p> implicitly closes the current one.
        (TagOpen tagName _, _) | T.toLower tagName == T.pack "p" ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, True, 1)
        -- Block-level elements implicitly close the open <p>.
        (TagOpen tagName _, _) | elem (T.toLower tagName) paragraphEndTokens ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        (TagSelfClose tagName _, _) | elem (T.toLower tagName) paragraphEndTokens ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        -- An explicit </p> closes the paragraph.
        (TagClose tagName, _) | T.toLower tagName == T.pack "p" ->
          (inputToken:previousTokens, False, 0)
        -- Closing a block-level element also closes the open <p>.
        (TagClose tagName, _) | elem (T.toLower tagName) paragraphEndTokens ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        -- Closing an element opened outside the <p>: close the <p> first.
        (TagClose _, 1) ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        (TagClose _, _) ->
          (inputToken:previousTokens, True, pDepth - 1)
        (TagOpen _ _, _) ->
          (inputToken:previousTokens, True, pDepth + 1)
        (_, _) ->
          (inputToken:previousTokens, True, pDepth)
    (reverseOutputTokens, isParagraphOpen, _) = L.foldl' processToken ([], False, 0) tokens
  in
    -- Close any <p> still open at the end of the input.
    L.reverse $ (if isParagraphOpen then (TagClose $ T.pack "p") : reverseOutputTokens else reverseOutputTokens)
-- Normalize tokens in such a way that the DOM we will build in the chunk-tree will match what's built in the browser.
normalizeHTMLTokens :: [Token] -> [Token]
normalizeHTMLTokens tokens =
  let
    -- Walk the token list right-to-left, merging adjacent text nodes,
    -- dropping comments, and converting void elements to self-closing tags.
    -- State: (tokens to the right, text being aggregated, whether text
    -- aggregation is suppressed at this position).
    (outputTokens, finalText, _) = foldr processBackwardsToken ([], Nothing, False) tokens
    processBackwardsToken inputToken (nextTokens, aggregateText, stopAggregating) =
      case (inputToken, aggregateText) of
        (TagOpen tagName _, _) | T.toLower tagName == T.pack "html" ->
          (inputToken:nextTokens, Nothing, True)
        (TagClose tagName, _) | T.toLower tagName == T.pack "html" || T.toLower tagName == T.pack "body" ->
          (inputToken:nextTokens, aggregateText, False) -- Push text nodes from after the close of html/body into the end child of body.
        (ContentText _, _) | stopAggregating ->
          (nextTokens, Nothing, True) -- Drop text nodes appearing before the <html> open tag or doctype.
        (ContentChar _, _) | stopAggregating ->
          (nextTokens, Nothing, True)
        (ContentText text, Nothing) ->
          (nextTokens, Just $ normalizeLineEndings text, False) -- Start aggregating text (normalize line endings)
        (ContentText text, Just previousText) ->
          (nextTokens, Just $ previousText `T.append` normalizeLineEndings text, False) -- Append text (with normalized line endings)
        (ContentChar single, Nothing) ->
          (nextTokens, Just $ normalizeLineEndings $ T.singleton single, False) -- Single char case
        (ContentChar single, Just previousText) ->
          (nextTokens, Just $ previousText `T.append` (normalizeLineEndings $ T.singleton single), False)
        (Comment _, _) ->
          (nextTokens, aggregateText, stopAggregating) -- Drop comments, keep the accumulated text
        (Doctype _, _) ->
          (inputToken:nextTokens, Nothing, True) -- Doctype, drop any text nodes that appear after it
        (TagOpen tagName attributes, Just previousText) | elem (T.toLower tagName) voidTokens ->
          ((TagSelfClose tagName attributes):(ContentText previousText):nextTokens, Nothing, stopAggregating) -- Convert void token to self-closing tag
        (TagOpen tagName attributes, Nothing) | elem (T.toLower tagName) voidTokens ->
          ((TagSelfClose tagName attributes):nextTokens, Nothing, stopAggregating) -- Convert void token to self-closing tag
        (_, Just previousText) ->
          (inputToken:(ContentText previousText):nextTokens, Nothing, stopAggregating) -- normal tag close, put out tag close followed by aggregate text
        (_, Nothing) ->
          (inputToken:nextTokens, Nothing, stopAggregating) -- no aggregate text
    -- Any text still pending at the far left becomes the leading token.
    preParagraphNormalization =
      case finalText of
        (Just text) -> ContentText text : outputTokens
        Nothing -> outputTokens
  in
    normalizeParagraphElements preParagraphNormalization
-- Build a chunk tree from a set of HTML tokens (which may represent a forest).
-- Returns Nothing if the token stream is not well formed: the fold must end
-- with exactly the synthetic root remaining on the stack.
buildChunkTree :: [Token] -> Maybe ChunkTree
buildChunkTree tokens = foldTokens >>= oneItemOnly
  where
    rootState = ChunkTree MD5.init Nothing 0 []
    foldTokens :: Maybe [ChunkTree]
    foldTokens = L.foldl' step (Just [rootState]) (normalizeHTMLTokens tokens)
    step maybeStack token = maybeStack >>= buildChunkNode token
-- Note, this will content aggregate a chunk tree node and it's children, top down, but stopping at branches with
-- two or less children.
-- Basically, creates "ghost" intermediary nodes, which don't effect the hashes of the nodes of the tree with tokens,
-- but adds aggregate "ghost" child nodes in intermediate levels that have their own hash, as a way of representing
-- a set of multiple children.
contentAggregateChunkTreeNode :: ChunkTree -> ChunkTree
contentAggregateChunkTreeNode unaggregated =
  ChunkTree (fullNodeHash unaggregated) (nodeToken unaggregated) (fullNodeLength unaggregated) $ maybeToList $ aggregateChildren (branches unaggregated)
  where
    -- Recursively pair children into a binary "ghost" tree, splitting at the
    -- child with the largest digest (a content-defined split point).
    aggregateChildren :: [ChunkTree] -> Maybe ChunkTree
    aggregateChildren [] = Nothing
    aggregateChildren (first:[]) = Just first
    aggregateChildren (first:second:[]) = Just $ createAggregateChild first second
    aggregateChildren children = do
      let (leftChildren, rightChildren) = splitAt (maximumDigestIndex children) children
      leftAggregate <- aggregateChildren leftChildren
      rightAggregate <- aggregateChildren rightChildren
      return $ createAggregateChild leftAggregate rightAggregate
    -- A ghost node: hash over the two child digests, no token of its own.
    createAggregateChild :: ChunkTree -> ChunkTree -> ChunkTree
    createAggregateChild left right =
      let
        aggregateHash = MD5.update (MD5.update MD5.init $ treeDigest left) (treeDigest right)
        aggregateLength = fullNodeLength left + fullNodeLength right
      in
        ChunkTree aggregateHash Nothing aggregateLength [left, right]
    greaterDigest (maxIndex, maxDigest) (index, branchDigest) =
      if branchDigest > maxDigest then (index, branchDigest) else (maxIndex, maxDigest)
    -- Index (>= 1) of the child with the lexicographically largest digest,
    -- starting from the second child so the left side is never empty.
    maximumDigestIndex (_:second:fromList) =
      let (foundIndex, _) = L.foldl' greaterDigest (1, treeDigest second) $ zip [2..] $ map treeDigest fromList in foundIndex
    maximumDigestIndex _ = 0 -- should never get here
-- Parse strict-text HTML and build a chunk tree from the resulting tokens.
buildChunkTreeFromHTML :: T.Text -> Maybe ChunkTree
buildChunkTreeFromHTML = buildChunkTree . parseTokens
-- Parse lazy-text HTML and build a chunk tree from the resulting tokens.
buildChunkTreeFromHTMLLazy :: TL.Text -> Maybe ChunkTree
buildChunkTreeFromHTMLLazy = buildChunkTree . parseTokensLazy
-- Get the hash digest of a ChunkTree node by finalizing its running MD5 context.
treeDigest :: ChunkTree -> ByteString
treeDigest = MD5.finalize . fullNodeHash
-- Produce a map from hash digest to chunk-tree node for every node in the
-- tree whose estimated size exceeds the break-even reference size.
mapChunkTree :: ChunkTree -> Map.Map ByteString ChunkTree
mapChunkTree root = collect root mempty
  where
    -- Record this node (if large enough), then descend into its branches.
    collect node acc = foldr collect (record acc node) (branches node)
    record acc node
      | fullNodeLength node > refMaxSize = Map.insert (treeDigest node) node acc
      | otherwise = acc
-- Produce the diff tokens that open a chunk-tree node, for use when the
-- node could not be matched against the source tree.
treeTokenToDiffTokens :: ChunkTree -> [DiffToken]
treeTokenToDiffTokens node =
  case nodeToken node of
    Nothing -> []
    Just (TagOpen tagName attributes) -> DiffOpenTag tagName : fmap attrToken attributes
    Just (TagClose _) -> [DiffCloseTag]
    Just (TagSelfClose tagName attributes) -> DiffSoloTag tagName : fmap attrToken attributes
    Just (ContentText value) -> [DiffContent value]
    Just (ContentChar value) -> [DiffContent $ T.singleton value]
    Just (Doctype value) -> [DiffDoctype value]
    Just (Comment _) -> []
  where
    attrToken (Attr name value) = DiffAttribute name value
-- After a node's children have been traversed, produce its closing token.
-- Only an open tag needs an explicit close.
treeTokenToCloseDiffTokens :: ChunkTree -> [DiffToken]
treeTokenToCloseDiffTokens node =
  case nodeToken node of
    Just (TagOpen _ _) -> [DiffCloseTag]
    _ -> []
-- Given a digest map of the source chunk-tree and a target chunk-tree,
-- create a diff patch: subtrees whose digest exists in the source become a
-- single chunk reference; everything else is emitted literally.
diffChunkTree :: Map.Map ByteString ChunkTree -> ChunkTree -> [DiffToken]
diffChunkTree sourceMap tree =
  case Map.lookup nodeDigest sourceMap of
    -- Whole subtree exists in the source: reference it by digest.
    Just _ -> [DiffChunkReference nodeDigest]
    -- No match: open the node, recurse into children, then close it.
    Nothing ->
      treeTokenToDiffTokens tree
        ++ concatMap (diffChunkTree sourceMap) (branches tree)
        ++ treeTokenToCloseDiffTokens tree
  where
    nodeDigest = treeDigest tree
-- Flatten a chunk-tree node into a set of diff tokens with no references.
chunkTreeToDiffTokens :: ChunkTree -> [DiffToken]
chunkTreeToDiffTokens tree =
  treeTokenToDiffTokens tree
    ++ concatMap chunkTreeToDiffTokens (branches tree)
    ++ treeTokenToCloseDiffTokens tree
-- Pair every chunk reference in a delta with the full diff-token expansion
-- of the referenced subtree (Nothing when the digest is not in the map).
getPatchRefsAsDiffTokens :: [DiffToken] -> Map.Map ByteString ChunkTree -> [(ByteString, Maybe [DiffToken])]
getPatchRefsAsDiffTokens delta sourceMap =
  [ (patchRef, expand patchRef) | patchRef <- getPatchRefs delta ]
  where
    expand patchRef = chunkTreeToDiffTokens <$> Map.lookup patchRef sourceMap
-- Diff two well formed HTML texts. Nothing when either fails to build a
-- chunk tree; an empty diff when the documents hash identically.
diffHTML :: T.Text -> T.Text -> Maybe [DiffToken]
diffHTML sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTML sourceHTML
  targetTree <- buildChunkTreeFromHTML targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return []
    else return (diffChunkTree (mapChunkTree sourceTree) targetTree)
-- Diff two well formed HTML texts, returning the diff together with the
-- full expansions of every patch it references.
diffHTMLWithPatchRefs :: T.Text -> T.Text -> Maybe DiffWithPatchRefs
diffHTMLWithPatchRefs sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTML sourceHTML
  targetTree <- buildChunkTreeFromHTML targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return (DiffWithPatchRefs [] [])
    else
      let sourceMap = mapChunkTree sourceTree
          diff = diffChunkTree sourceMap targetTree
      in return (DiffWithPatchRefs diff (getPatchRefsAsDiffTokens diff sourceMap))
-- Same as diffHTMLWithPatchRefs but with lazy text input.
diffHTMLWithPatchRefsLazy :: TL.Text -> TL.Text -> Maybe DiffWithPatchRefs
diffHTMLWithPatchRefsLazy sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTMLLazy sourceHTML
  targetTree <- buildChunkTreeFromHTMLLazy targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return (DiffWithPatchRefs [] [])
    else
      let sourceMap = mapChunkTree sourceTree
          diff = diffChunkTree sourceMap targetTree
      in return (DiffWithPatchRefs diff (getPatchRefsAsDiffTokens diff sourceMap))
-- Same as diffHTML but with lazy text input.
diffHTMLLazy :: TL.Text -> TL.Text -> Maybe [DiffToken]
diffHTMLLazy sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTMLLazy sourceHTML
  targetTree <- buildChunkTreeFromHTMLLazy targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return []
    else return (diffChunkTree (mapChunkTree sourceTree) targetTree)
-- Render the chunk-tree node back to a stream of HTML tokens: the node's
-- own token, all child tokens in order, then a matching close tag when the
-- node is an open tag.
-- Uses concatMap instead of a left fold with (++) so appends stay linear
-- on wide trees (the fold re-appended the growing accumulator per child).
getTreeTokens :: ChunkTree -> [Token]
getTreeTokens tree =
  maybeToList treeNodeToken
    ++ concatMap getTreeTokens (branches tree)
    ++ closeToken treeNodeToken
  where
    treeNodeToken = nodeToken tree
    closeToken (Just (TagOpen tagName _)) = [TagClose tagName]
    closeToken _ = []
-- Given a tree and a set of diff tokens, use them to patch into a stream of
-- HTML tokens. Returns Nothing for a malformed patch or an unknown chunk
-- reference. Fold state: (stack of currently-open TagOpen tokens, output
-- tokens accumulated in reverse).
patchTreeToTokens :: ChunkTree -> [DiffToken] -> Maybe [Token]
patchTreeToTokens tree patch =
  let
    sourceMap = mapChunkTree tree
    -- A chunk reference expands to the referenced subtree's tokens.
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffChunkReference refValue) = do
      foundTree <- Map.lookup refValue sourceMap
      let treeTokens = reverse $ getTreeTokens foundTree
      return (tokenStack, treeTokens ++ outputTokens)
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffContent value) =
      Just (tokenStack, ContentText value : outputTokens)
    -- An open tag starts with no attributes and is pushed on the stack.
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffOpenTag tagName) =
      let outputToken = TagOpen tagName [] in Just (outputToken:tokenStack, outputToken:outputTokens)
    -- Attributes attach to the most recently emitted open/self-closing tag.
    patchDiffToken (Just (tokenStack, (TagOpen tagName attributes):outputTokens)) (DiffAttribute name value) =
      Just (tokenStack, TagOpen tagName (attributes ++ [Attr name value]):outputTokens )
    patchDiffToken (Just (tokenStack, (TagSelfClose tagName attributes):outputTokens)) (DiffAttribute name value) =
      Just (tokenStack, TagSelfClose tagName (attributes ++ [Attr name value]):outputTokens )
    patchDiffToken _ (DiffAttribute _ _) = Nothing
    -- A close pops the matching open tag from the stack.
    patchDiffToken (Just ((TagOpen tagName _):tokenStack, outputTokens)) (DiffCloseTag) =
      Just (tokenStack, TagClose tagName : outputTokens)
    patchDiffToken _ (DiffCloseTag) = Nothing
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffSoloTag tagName) =
      Just (tokenStack, (TagSelfClose tagName []) : outputTokens)
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffDoctype value) =
      Just (tokenStack, (Doctype value):outputTokens)
    patchDiffToken _ _ = Nothing
  in do
    tokenState <- ( L.foldl' patchDiffToken (Just ([],[])) patch )
    let (_, tokenStream) = tokenState
    return (reverse tokenStream)
-- Given HTML and a set of diff tokens, patch to create the target HTML.
patchHTML :: T.Text -> [DiffToken] -> Maybe TL.Text
patchHTML htmlText patch = do
  sourceTree <- buildChunkTreeFromHTML htmlText
  renderTokens <$> patchTreeToTokens sourceTree patch
-- Given lazy-text HTML and a set of diff tokens, patch to create the target HTML.
patchHTMLLazy :: TL.Text -> [DiffToken] -> Maybe TL.Text
patchHTMLLazy htmlText patch = do
  sourceTree <- buildChunkTreeFromHTMLLazy htmlText
  renderTokens <$> patchTreeToTokens sourceTree patch
-- Get all the chunk-reference digests appearing in a diff, in order.
getPatchRefs :: [DiffToken] -> [ByteString]
getPatchRefs tokens =
  -- A pattern-match comprehension silently skips non-matching constructors,
  -- replacing the manual foldr filter.
  [ reference | DiffChunkReference reference <- tokens ]
-- Serialize to a compact format in binary. Each token begins with a header
-- byte whose low 3 bits are the token type (0-6); when the payload length
-- fits below 32 it is packed into the high 5 bits, otherwise the bare type
-- byte is followed by a VarInt of (length - 32). Type 7 ends the stream.
serializeDiffTokens :: [DiffToken] -> BL.ByteString
serializeDiffTokens tokens = P.runPutL $ ( sequence_ (map serializeToken tokens) >> S.serialize ( 7 :: Word8 ) )
  where
    serializeToken (DiffChunkReference refValue) = encodeTypeAndBytes 0 refValue
    serializeToken (DiffContent text) = encodeTypeAndText 1 text
    serializeToken (DiffOpenTag text) = encodeTypeAndText 2 text
    -- Attribute: name with the usual header, then the value encoded as a
    -- VarInt byte count followed by the raw UTF-8 bytes.
    serializeToken (DiffAttribute name value) =
      encodeTypeAndText 3 name >> S.serialize (V.VarInt $ B.length valueBytes ) >> P.putByteString valueBytes
      where
        valueBytes = encodeUtf8 value
    serializeToken (DiffCloseTag) = S.serialize (4 :: Word8)
    serializeToken (DiffSoloTag text) = encodeTypeAndText 5 text
    serializeToken (DiffDoctype text) = encodeTypeAndText 6 text
    -- Pack type (low 3 bits) and a short length (high 5 bits) into one byte.
    combineByte :: Int -> Int -> Word8
    combineByte typeInt contentLength = fromIntegral (typeInt .|. (contentLength `shiftL` 3))
    encodeTypeAndText typeId content = encodeTypeAndBytes typeId $ encodeUtf8 content
    encodeTypeAndBytes typeId content = ( serializeLengthAndType typeId $ B.length content ) >> P.putByteString content
    -- Short lengths (< 32) fit in the header byte; longer payloads emit the
    -- bare type byte followed by a VarInt of the remaining length.
    serializeLengthAndType typeId contentLength | contentLength < 32 = S.serialize (combineByte typeId contentLength)
                                                | otherwise = S.serialize ( (fromIntegral typeId ) :: Word8 ) >>
                                                              S.serialize (V.VarInt ((fromIntegral $ contentLength - 32 ) :: Word))