{-# OPTIONS_GHC -Wall #-}
module ALON.Diff.HTML
( ChunkTree(..)
, DiffToken(..)
, DiffWithPatchRefs(..)
, buildChunkTreeFromHTML
, buildChunkTreeFromHTMLLazy
, buildChunkTree
, mapChunkTree
, diffHTML
, diffHTMLLazy
, diffChunkTree
, patchHTML
, patchHTMLLazy
, patchTreeToTokens
, getTreeTokens
, serializeDiffTokens
, normalizeLineEndings
, normalizeHTMLTokens
, getPatchRefs
, getPatchRefsAsDiffTokens
, diffHTMLWithPatchRefs
, diffHTMLWithPatchRefsLazy
) where
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Crypto.Hash.MD5 as MD5
import Data.Text.Encoding (encodeUtf8, decodeUtf8)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as B (length)
import qualified Data.List as L
import qualified Data.Map.Strict as Map
import Text.HTML.Parser
import Data.Maybe (maybeToList)
import Data.Aeson
import Data.ByteString.Base64 (encodeBase64')
import qualified Data.Bytes.Put as P
import qualified Data.Bytes.Serial as S
import qualified Data.Bytes.VarInt as V
import Data.Bits
import Data.Word8
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
-- | A node in the content-defined chunk tree built from normalized HTML tokens.
data ChunkTree =
  ChunkTree
  { fullNodeHash :: !(MD5.Ctx)   -- ^ Running MD5 context covering this node's token and every branch added so far.
  , nodeToken :: (Maybe Token)   -- ^ The HTML token for this node; Nothing for the root and for "ghost" aggregate nodes.
  , fullNodeLength :: !Int       -- ^ Estimated serialized length of this node plus its descendants.
  , branches :: [ChunkTree]      -- ^ Child nodes, in document order.
  }
-- | One instruction in a diff patch stream.
data DiffToken =
  DiffChunkReference !ByteString   -- ^ Digest of a chunk already present in the source tree; reused wholesale.
  | DiffContent !T.Text            -- ^ Literal text-node content.
  | DiffOpenTag !T.Text            -- ^ Open a tag; its attributes follow as DiffAttribute tokens.
  | DiffAttribute !T.Text !T.Text  -- ^ Attribute name/value for the preceding open or solo tag.
  | DiffCloseTag                   -- ^ Close the most recently opened tag.
  | DiffSoloTag !T.Text            -- ^ Self-closing tag; its attributes follow as DiffAttribute tokens.
  | DiffDoctype !T.Text            -- ^ Doctype declaration.
  deriving (Show)
-- | A diff together with, for each chunk reference it contains, the expanded
-- diff tokens of the referenced subtree (Nothing when the digest is unknown).
data DiffWithPatchRefs = DiffWithPatchRefs [DiffToken] [(ByteString, Maybe [DiffToken])]
-- Compact JSON serialization of a diff token:
--   chunk reference -> {"r": base64 digest}
--   content         -> bare string (HTML entities decoded)
--   open tag        -> ["name"]
--   attribute       -> ["name", "value"]
--   close tag       -> []
--   solo tag        -> ["name", []]
--   doctype         -> {"d": text}
instance ToJSON DiffToken where
  toJSON (DiffChunkReference refValue) = object [ T.pack "r" .= (decodeUtf8 $ encodeBase64' refValue) ]
  toJSON (DiffContent text) = String $ descapeHTMLText text
  toJSON (DiffOpenTag text) = toJSON [ String text ]
  toJSON (DiffAttribute name value) = toJSON [ String name, String value ]
  toJSON (DiffCloseTag) = toJSON ([]::[Value])
  toJSON (DiffSoloTag text) = toJSON [ String text, toJSON ([]::[Value])]
  toJSON (DiffDoctype text) = object [ T.pack "d" .= text ]
-- Debug rendering of a chunk tree: base64 digest, token, length, branches.
instance Show ChunkTree where
  show tree =
    concat
      [ "(ChunkTree "
      , show (decodeUtf8 $ encodeBase64' (treeDigest tree))
      , " ("
      , show (nodeToken tree)
      , ") "
      , show (fullNodeLength tree)
      , " "
      , show (branches tree)
      , ")"
      ]
-- Note, this should be calculated based off the hash in future.
-- Minimum serialized size a chunk must exceed before referencing it by
-- digest pays off; the constant depends on the delta encoding in use.
refMaxSize :: Int
--refMaxSize = 17 -- comment this in for binary based deltas
refMaxSize = 34 -- comment this in for JSON deltas
-- Per-attribute serialization overhead (in characters) used when estimating
-- node sizes; the constant depends on the delta encoding in use.
attributeOverhead :: Int
-- attributeOverhead = 2 -- comment this in for binary based deltas
attributeOverhead = 8 -- comment this in for JSON deltas
-- Transform HTML escapes/entities in text node text into their decoded
-- (plain text) form.
descapeHTMLText :: T.Text -> T.Text
descapeHTMLText = TL.toStrict . toLazyText . htmlEncodedText
-- Start a (potentially sub tree) state from a token.
-- hashPrefix namespaces the digest by token kind ("tag-", "content-",
-- "doctype-"); nodeText is the canonical text hashed for the node; the
-- attributes are folded into both the hash and the length estimate.
initialTreeState :: String -> T.Text -> [Attr] -> Token -> ChunkTree
initialTreeState hashPrefix nodeText attributes inputToken =
  ChunkTree nodeHashContextWithAttributes ( Just inputToken ) ( tokenSizeOverhead + ( T.length nodeText ) + attributesLength ) []
  where
    -- Per-token serialization overhead used in the length estimate; these
    -- constants match the JSON delta encoding (the commented set matches
    -- the binary encoding).
    tokenSizeOverhead =
      case inputToken of
        {- (TagOpen _ _) -> 2 -- Comment this set in for more optimal binary based deltas.
        (TagSelfClose _ _) -> 1
        (TagClose _ ) -> 0
        (ContentText _ ) -> 1
        (ContentChar _ ) -> 1
        (Doctype _) -> 1
        (Comment _) -> 0 --}
        (TagOpen _ _) -> 8 -- Comment this set in for more optimal JSON based deltas
        (TagSelfClose _ _) -> 8
        (TagClose _ ) -> 0
        (ContentText _ ) -> 3
        (ContentChar _ ) -> 3
        (Doctype _) -> 8
        (Comment _) -> 0
    -- Attributes are hashed in name-sorted order so attribute ordering in
    -- the input does not change the digest.
    attributeOrder (Attr name1 _) (Attr name2 _ ) = compare name1 name2
    nodeHashContext = MD5.update (MD5.init) $ encodeUtf8 $ ( T.pack hashPrefix ) `mappend` nodeText
    sortedAttributes = L.sortBy attributeOrder attributes
    nodeHashContextWithAttributes = L.foldl' hashAttribute nodeHashContext sortedAttributes
    hashAttribute context (Attr name value) = MD5.updates context $ fmap encodeUtf8 [T.pack " attr-", name, T.pack "=", value]
    -- Length estimate adds the per-attribute serialization overhead.
    attributeLength previousLength (Attr name value) = previousLength + attributeOverhead + T.length name + T.length value
    attributesLength = L.foldl' attributeLength 0 attributes
-- Append a child chunk to a parent chunk-tree node, folding the child's
-- digest into the parent's running hash and accumulating its length.
addBranchChunk :: ChunkTree -> ChunkTree -> ChunkTree
addBranchChunk parent child =
  parent
    { fullNodeHash = MD5.update (fullNodeHash parent) (treeDigest child)
    , fullNodeLength = fullNodeLength parent + fullNodeLength child
    , branches = branches parent ++ [child]
    }
-- HTML void elements: they never take a closing tag, and normalization
-- converts their open tags to self-closing tags.
voidTokens :: [T.Text]
voidTokens =
  fmap T.pack
    [ "area", "base", "br", "col", "command", "embed", "hr", "img"
    , "input", "keygen", "link", "meta", "param", "source", "track", "wbr" ]
-- Elements whose open or close tag implicitly terminates an open <p>.
paragraphEndTokens :: [T.Text]
paragraphEndTokens =
  fmap T.pack
    [ "p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address"
    , "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript"
    , "table", "body" ]
-- Given a token, and the current chunk-tree state stack, output a modified
-- state stack. The stack holds one ChunkTree per currently-open element
-- (plus the synthetic root); Nothing signals malformed HTML.
buildChunkNode :: Token -> [ChunkTree] -> Maybe [ChunkTree]
-- An open tag pushes a fresh sub-tree state onto the stack.
buildChunkNode ( TagOpen tagName attributes ) stateStack =
  Just ( ( initialTreeState "tag-" (T.toLower tagName) attributes $ TagOpen tagName attributes ) : stateStack )
-- A self-closing tag becomes a leaf branch of the current node.
buildChunkNode ( TagSelfClose tagName attributes ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "tag-" (T.toLower tagName) attributes $ TagSelfClose tagName attributes
-- A close tag pops the stack, attaching the finished node to its parent.
buildChunkNode ( TagClose tagName ) (state:deepState) = do
  initialTag <- nodeToken state
  tagValue <- openedTag initialTag
  if ( tagValue == tagName ) then addChild deepState else Nothing -- Check the HTML is well formed; note in normalization void tokens are changed to self-closing.
  where
    -- Note, add child also does a content defined aggregation of the node.
    addChild (parentState:deeperState) = Just ( ( addBranchChunk parentState $ contentAggregateChunkTreeNode state ) : deeperState )
    addChild [] = Nothing
    openedTag (TagOpen openTagName _) = Just openTagName
    openedTag _ = Nothing
-- Text content becomes a leaf branch; entities are decoded before hashing.
buildChunkNode ( ContentText value ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "content-" (descapeHTMLText value) [] ( ContentText value )
buildChunkNode ( ContentChar value ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "content-" (T.singleton value) [] ( ContentChar value )
-- Doctype is hashed in stripped, lower-case form.
buildChunkNode ( Doctype value ) (state:deepState) =
  Just ( addBranchChunk state localState : deepState )
  where
    localState = initialTreeState "doctype-" (T.strip $ T.toLower value) [] ( Doctype value )
buildChunkNode ( Comment _ ) stateStack = Just stateStack -- Comments aren't included in the output
-- Any other token against an empty stack means malformed input.
buildChunkNode _ [] = Nothing
-- Just the sole element of a singleton list; Nothing for any other length.
oneItemOnly :: [a] -> Maybe a
oneItemOnly [item] = Just item
oneItemOnly _ = Nothing
-- Normalize line endings of HTML: CRLF pairs and lone CRs both become LF.
normalizeLineEndings :: T.Text -> T.Text
normalizeLineEndings =
  T.replace (T.pack "\r") (T.pack "\n")
    . T.replace (T.pack "\r\n") (T.pack "\n")
-- Note, we only handle paragraph elements at the moment, but should probably add support for other
-- elements that don't need to be closed correctly.
-- Inserts the implicit </p> close tags a browser would infer, so the tree
-- we build matches the browser's DOM. Fold state is (output tokens in
-- reverse, whether a <p> is currently open, tag nesting depth inside it).
normalizeParagraphElements :: [Token] -> [Token]
normalizeParagraphElements tokens =
  let
    processToken :: ([Token], Bool, Int) -> Token -> ([Token], Bool, Int)
    -- No <p> currently open.
    processToken (previousTokens, False, _) inputToken =
      case inputToken of
        (TagOpen tagName _) | T.toLower tagName == T.pack "p" ->
          (inputToken:previousTokens, True, 1)
        -- A stray </p> with no open <p> is dropped.
        (TagClose tagName) | T.toLower tagName == T.pack "p" ->
          (previousTokens, False, 0)
        _ ->
          (inputToken:previousTokens, False, 0)
    -- Inside an open <p>, nested pDepth tags deep.
    processToken (previousTokens, True, pDepth) inputToken =
      case (inputToken, pDepth) of
        -- A new <p> implicitly closes the current one.
        (TagOpen tagName _, _) | T.toLower tagName == T.pack "p" ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, True, 1)
        -- Block-level elements implicitly close the open <p>.
        (TagOpen tagName _, _) | elem (T.toLower tagName) paragraphEndTokens ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        (TagSelfClose tagName _, _) | elem (T.toLower tagName) paragraphEndTokens ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        -- An explicit </p> closes the paragraph.
        (TagClose tagName, _) | T.toLower tagName == T.pack "p" ->
          (inputToken:previousTokens, False, 0)
        -- Closing a block-level element also closes the open <p>.
        (TagClose tagName, _) | elem (T.toLower tagName) paragraphEndTokens ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        -- Closing an element opened outside the <p>: close the <p> first.
        (TagClose _, 1) ->
          (inputToken:(TagClose $ T.pack "p"):previousTokens, False, 0)
        (TagClose _, _) ->
          (inputToken:previousTokens, True, pDepth - 1)
        (TagOpen _ _, _) ->
          (inputToken:previousTokens, True, pDepth + 1)
        (_, _) ->
          (inputToken:previousTokens, True, pDepth)
    (reverseOutputTokens, isParagraphOpen, _) = L.foldl' processToken ([], False, 0) tokens
  in
    -- Close any <p> still open at the end of the input.
    L.reverse $ (if isParagraphOpen then (TagClose $ T.pack "p") : reverseOutputTokens else reverseOutputTokens)
-- Normalize tokens in such a way that the DOM we will build in the chunk-tree will match what's built in the browser.
normalizeHTMLTokens :: [Token] -> [Token]
normalizeHTMLTokens tokens =
  let
    -- Walk the token list right-to-left, merging adjacent text nodes,
    -- dropping comments, and converting void elements to self-closing tags.
    -- State: (tokens to the right, text being aggregated, whether text
    -- aggregation is suppressed at this position).
    (outputTokens, finalText, _) = foldr processBackwardsToken ([], Nothing, False) tokens
    processBackwardsToken inputToken (nextTokens, aggregateText, stopAggregating) =
      case (inputToken, aggregateText) of
        (TagOpen tagName _, _) | T.toLower tagName == T.pack "html" ->
          (inputToken:nextTokens, Nothing, True)
        (TagClose tagName, _) | T.toLower tagName == T.pack "html" || T.toLower tagName == T.pack "body" ->
          (inputToken:nextTokens, aggregateText, False) -- Push text nodes from after the close of html/body into the end child of body.
        (ContentText _, _) | stopAggregating ->
          (nextTokens, Nothing, True) -- Drop text nodes appearing before the <html> open tag or doctype.
        (ContentChar _, _) | stopAggregating ->
          (nextTokens, Nothing, True)
        (ContentText text, Nothing) ->
          (nextTokens, Just $ normalizeLineEndings text, False) -- Start aggregating text (normalize line endings)
        (ContentText text, Just previousText) ->
          (nextTokens, Just $ previousText `T.append` normalizeLineEndings text, False) -- Append text (with normalized line endings)
        (ContentChar single, Nothing) ->
          (nextTokens, Just $ normalizeLineEndings $ T.singleton single, False) -- Single char case
        (ContentChar single, Just previousText) ->
          (nextTokens, Just $ previousText `T.append` (normalizeLineEndings $ T.singleton single), False)
        (Comment _, _) ->
          (nextTokens, aggregateText, stopAggregating) -- Drop comments, keep the accumulated text
        (Doctype _, _) ->
          (inputToken:nextTokens, Nothing, True) -- Doctype, drop any text nodes that appear after it
        (TagOpen tagName attributes, Just previousText) | elem (T.toLower tagName) voidTokens ->
          ((TagSelfClose tagName attributes):(ContentText previousText):nextTokens, Nothing, stopAggregating) -- Convert void token to self-closing tag
        (TagOpen tagName attributes, Nothing) | elem (T.toLower tagName) voidTokens ->
          ((TagSelfClose tagName attributes):nextTokens, Nothing, stopAggregating) -- Convert void token to self-closing tag
        (_, Just previousText) ->
          (inputToken:(ContentText previousText):nextTokens, Nothing, stopAggregating) -- normal tag close, put out tag close followed by aggregate text
        (_, Nothing) ->
          (inputToken:nextTokens, Nothing, stopAggregating) -- no aggregate text
    -- Any text still pending at the far left becomes the leading token.
    preParagraphNormalization =
      case finalText of
        (Just text) -> ContentText text : outputTokens
        Nothing -> outputTokens
  in
    normalizeParagraphElements preParagraphNormalization
-- Build a chunk tree from a set of HTML tokens (which may represent a forest).
-- Returns Nothing if the token stream is not well formed: the fold must end
-- with exactly the synthetic root remaining on the stack.
buildChunkTree :: [Token] -> Maybe ChunkTree
buildChunkTree tokens = foldTokens >>= oneItemOnly
  where
    rootState = ChunkTree MD5.init Nothing 0 []
    foldTokens :: Maybe [ChunkTree]
    foldTokens = L.foldl' step (Just [rootState]) (normalizeHTMLTokens tokens)
    step maybeStack token = maybeStack >>= buildChunkNode token
-- Note, this will content aggregate a chunk tree node and it's children, top down, but stopping at branches with
-- two or less children.
-- Basically, creates "ghost" intermediary nodes, which don't effect the hashes of the nodes of the tree with tokens,
-- but adds aggregate "ghost" child nodes in intermediate levels that have their own hash, as a way of representing
-- a set of multiple children.
contentAggregateChunkTreeNode :: ChunkTree -> ChunkTree
contentAggregateChunkTreeNode unaggregated =
  ChunkTree (fullNodeHash unaggregated) (nodeToken unaggregated) (fullNodeLength unaggregated) $ maybeToList $ aggregateChildren (branches unaggregated)
  where
    -- Recursively pair children into a binary "ghost" tree, splitting at the
    -- child with the largest digest (a content-defined split point).
    aggregateChildren :: [ChunkTree] -> Maybe ChunkTree
    aggregateChildren [] = Nothing
    aggregateChildren (first:[]) = Just first
    aggregateChildren (first:second:[]) = Just $ createAggregateChild first second
    aggregateChildren children = do
      let (leftChildren, rightChildren) = splitAt (maximumDigestIndex children) children
      leftAggregate <- aggregateChildren leftChildren
      rightAggregate <- aggregateChildren rightChildren
      return $ createAggregateChild leftAggregate rightAggregate
    -- A ghost node: hash over the two child digests, no token of its own.
    createAggregateChild :: ChunkTree -> ChunkTree -> ChunkTree
    createAggregateChild left right =
      let
        aggregateHash = MD5.update (MD5.update MD5.init $ treeDigest left) (treeDigest right)
        aggregateLength = fullNodeLength left + fullNodeLength right
      in
        ChunkTree aggregateHash Nothing aggregateLength [left, right]
    greaterDigest (maxIndex, maxDigest) (index, branchDigest) =
      if branchDigest > maxDigest then (index, branchDigest) else (maxIndex, maxDigest)
    -- Index (>= 1) of the child with the lexicographically largest digest,
    -- starting from the second child so the left side is never empty.
    maximumDigestIndex (_:second:fromList) =
      let (foundIndex, _) = L.foldl' greaterDigest (1, treeDigest second) $ zip [2..] $ map treeDigest fromList in foundIndex
    maximumDigestIndex _ = 0 -- should never get here
-- Parse strict-text HTML and build a chunk tree from the resulting tokens.
buildChunkTreeFromHTML :: T.Text -> Maybe ChunkTree
buildChunkTreeFromHTML = buildChunkTree . parseTokens
-- Parse lazy-text HTML and build a chunk tree from the resulting tokens.
buildChunkTreeFromHTMLLazy :: TL.Text -> Maybe ChunkTree
buildChunkTreeFromHTMLLazy = buildChunkTree . parseTokensLazy
-- Get the hash digest of a ChunkTree node by finalizing its running MD5 context.
treeDigest :: ChunkTree -> ByteString
treeDigest = MD5.finalize . fullNodeHash
-- Produce a map from hash digest to chunk-tree node for every node in the
-- tree whose estimated size exceeds the break-even reference size.
mapChunkTree :: ChunkTree -> Map.Map ByteString ChunkTree
mapChunkTree root = collect root mempty
  where
    -- Record this node (if large enough), then descend into its branches.
    collect node acc = foldr collect (record acc node) (branches node)
    record acc node
      | fullNodeLength node > refMaxSize = Map.insert (treeDigest node) node acc
      | otherwise = acc
-- Produce the diff tokens that open a chunk-tree node, for use when the
-- node could not be matched against the source tree.
treeTokenToDiffTokens :: ChunkTree -> [DiffToken]
treeTokenToDiffTokens node =
  case nodeToken node of
    Nothing -> []
    Just (TagOpen tagName attributes) -> DiffOpenTag tagName : fmap attrToken attributes
    Just (TagClose _) -> [DiffCloseTag]
    Just (TagSelfClose tagName attributes) -> DiffSoloTag tagName : fmap attrToken attributes
    Just (ContentText value) -> [DiffContent value]
    Just (ContentChar value) -> [DiffContent $ T.singleton value]
    Just (Doctype value) -> [DiffDoctype value]
    Just (Comment _) -> []
  where
    attrToken (Attr name value) = DiffAttribute name value
-- After a node's children have been traversed, produce its closing token.
-- Only an open tag needs an explicit close.
treeTokenToCloseDiffTokens :: ChunkTree -> [DiffToken]
treeTokenToCloseDiffTokens node =
  case nodeToken node of
    Just (TagOpen _ _) -> [DiffCloseTag]
    _ -> []
-- Given a digest map of the source chunk-tree and a target chunk-tree,
-- create a diff patch: subtrees whose digest exists in the source become a
-- single chunk reference; everything else is emitted literally.
diffChunkTree :: Map.Map ByteString ChunkTree -> ChunkTree -> [DiffToken]
diffChunkTree sourceMap tree =
  case Map.lookup nodeDigest sourceMap of
    -- Whole subtree exists in the source: reference it by digest.
    Just _ -> [DiffChunkReference nodeDigest]
    -- No match: open the node, recurse into children, then close it.
    Nothing ->
      treeTokenToDiffTokens tree
        ++ concatMap (diffChunkTree sourceMap) (branches tree)
        ++ treeTokenToCloseDiffTokens tree
  where
    nodeDigest = treeDigest tree
-- Flatten a chunk-tree node into a set of diff tokens with no references.
chunkTreeToDiffTokens :: ChunkTree -> [DiffToken]
chunkTreeToDiffTokens tree =
  treeTokenToDiffTokens tree
    ++ concatMap chunkTreeToDiffTokens (branches tree)
    ++ treeTokenToCloseDiffTokens tree
-- Pair every chunk reference in a delta with the full diff-token expansion
-- of the referenced subtree (Nothing when the digest is not in the map).
getPatchRefsAsDiffTokens :: [DiffToken] -> Map.Map ByteString ChunkTree -> [(ByteString, Maybe [DiffToken])]
getPatchRefsAsDiffTokens delta sourceMap =
  [ (patchRef, expand patchRef) | patchRef <- getPatchRefs delta ]
  where
    expand patchRef = chunkTreeToDiffTokens <$> Map.lookup patchRef sourceMap
-- Diff two well formed HTML texts. Nothing when either fails to build a
-- chunk tree; an empty diff when the documents hash identically.
diffHTML :: T.Text -> T.Text -> Maybe [DiffToken]
diffHTML sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTML sourceHTML
  targetTree <- buildChunkTreeFromHTML targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return []
    else return (diffChunkTree (mapChunkTree sourceTree) targetTree)
-- Diff two well formed HTML texts, returning the diff together with the
-- full expansions of every patch it references.
diffHTMLWithPatchRefs :: T.Text -> T.Text -> Maybe DiffWithPatchRefs
diffHTMLWithPatchRefs sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTML sourceHTML
  targetTree <- buildChunkTreeFromHTML targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return (DiffWithPatchRefs [] [])
    else
      let sourceMap = mapChunkTree sourceTree
          diff = diffChunkTree sourceMap targetTree
      in return (DiffWithPatchRefs diff (getPatchRefsAsDiffTokens diff sourceMap))
-- Same as diffHTMLWithPatchRefs but with lazy text input.
diffHTMLWithPatchRefsLazy :: TL.Text -> TL.Text -> Maybe DiffWithPatchRefs
diffHTMLWithPatchRefsLazy sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTMLLazy sourceHTML
  targetTree <- buildChunkTreeFromHTMLLazy targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return (DiffWithPatchRefs [] [])
    else
      let sourceMap = mapChunkTree sourceTree
          diff = diffChunkTree sourceMap targetTree
      in return (DiffWithPatchRefs diff (getPatchRefsAsDiffTokens diff sourceMap))
-- Same as diffHTML but with lazy text input.
diffHTMLLazy :: TL.Text -> TL.Text -> Maybe [DiffToken]
diffHTMLLazy sourceHTML targetHTML = do
  sourceTree <- buildChunkTreeFromHTMLLazy sourceHTML
  targetTree <- buildChunkTreeFromHTMLLazy targetHTML
  if treeDigest sourceTree == treeDigest targetTree
    then return []
    else return (diffChunkTree (mapChunkTree sourceTree) targetTree)
-- Render the chunk-tree node back to a stream of HTML tokens: the node's
-- own token, all child tokens in order, then a matching close tag when the
-- node is an open tag.
-- Uses concatMap instead of a left fold with (++) so appends stay linear
-- on wide trees (the fold re-appended the growing accumulator per child).
getTreeTokens :: ChunkTree -> [Token]
getTreeTokens tree =
  maybeToList treeNodeToken
    ++ concatMap getTreeTokens (branches tree)
    ++ closeToken treeNodeToken
  where
    treeNodeToken = nodeToken tree
    closeToken (Just (TagOpen tagName _)) = [TagClose tagName]
    closeToken _ = []
-- Given a tree and a set of diff tokens, use them to patch into a stream of
-- HTML tokens. Returns Nothing for a malformed patch or an unknown chunk
-- reference. Fold state: (stack of currently-open TagOpen tokens, output
-- tokens accumulated in reverse).
patchTreeToTokens :: ChunkTree -> [DiffToken] -> Maybe [Token]
patchTreeToTokens tree patch =
  let
    sourceMap = mapChunkTree tree
    -- A chunk reference expands to the referenced subtree's tokens.
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffChunkReference refValue) = do
      foundTree <- Map.lookup refValue sourceMap
      let treeTokens = reverse $ getTreeTokens foundTree
      return (tokenStack, treeTokens ++ outputTokens)
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffContent value) =
      Just (tokenStack, ContentText value : outputTokens)
    -- An open tag starts with no attributes and is pushed on the stack.
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffOpenTag tagName) =
      let outputToken = TagOpen tagName [] in Just (outputToken:tokenStack, outputToken:outputTokens)
    -- Attributes attach to the most recently emitted open/self-closing tag.
    patchDiffToken (Just (tokenStack, (TagOpen tagName attributes):outputTokens)) (DiffAttribute name value) =
      Just (tokenStack, TagOpen tagName (attributes ++ [Attr name value]):outputTokens )
    patchDiffToken (Just (tokenStack, (TagSelfClose tagName attributes):outputTokens)) (DiffAttribute name value) =
      Just (tokenStack, TagSelfClose tagName (attributes ++ [Attr name value]):outputTokens )
    patchDiffToken _ (DiffAttribute _ _) = Nothing
    -- A close pops the matching open tag from the stack.
    patchDiffToken (Just ((TagOpen tagName _):tokenStack, outputTokens)) (DiffCloseTag) =
      Just (tokenStack, TagClose tagName : outputTokens)
    patchDiffToken _ (DiffCloseTag) = Nothing
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffSoloTag tagName) =
      Just (tokenStack, (TagSelfClose tagName []) : outputTokens)
    patchDiffToken (Just (tokenStack, outputTokens)) (DiffDoctype value) =
      Just (tokenStack, (Doctype value):outputTokens)
    patchDiffToken _ _ = Nothing
  in do
    tokenState <- ( L.foldl' patchDiffToken (Just ([],[])) patch )
    let (_, tokenStream) = tokenState
    return (reverse tokenStream)
-- Given HTML and a set of diff tokens, patch to create the target HTML.
patchHTML :: T.Text -> [DiffToken] -> Maybe TL.Text
patchHTML htmlText patch = do
  sourceTree <- buildChunkTreeFromHTML htmlText
  renderTokens <$> patchTreeToTokens sourceTree patch
-- Given lazy-text HTML and a set of diff tokens, patch to create the target HTML.
patchHTMLLazy :: TL.Text -> [DiffToken] -> Maybe TL.Text
patchHTMLLazy htmlText patch = do
  sourceTree <- buildChunkTreeFromHTMLLazy htmlText
  renderTokens <$> patchTreeToTokens sourceTree patch
-- Get all the chunk-reference digests appearing in a diff, in order.
getPatchRefs :: [DiffToken] -> [ByteString]
getPatchRefs tokens =
  -- A pattern-match comprehension silently skips non-matching constructors,
  -- replacing the manual foldr filter.
  [ reference | DiffChunkReference reference <- tokens ]
-- Serialize to a compact format in binary. Each token begins with a header
-- byte whose low 3 bits are the token type (0-6); when the payload length
-- fits below 32 it is packed into the high 5 bits, otherwise the bare type
-- byte is followed by a VarInt of (length - 32). Type 7 ends the stream.
serializeDiffTokens :: [DiffToken] -> BL.ByteString
serializeDiffTokens tokens = P.runPutL $ ( sequence_ (map serializeToken tokens) >> S.serialize ( 7 :: Word8 ) )
  where
    serializeToken (DiffChunkReference refValue) = encodeTypeAndBytes 0 refValue
    serializeToken (DiffContent text) = encodeTypeAndText 1 text
    serializeToken (DiffOpenTag text) = encodeTypeAndText 2 text
    -- Attribute: name with the usual header, then the value encoded as a
    -- VarInt byte count followed by the raw UTF-8 bytes.
    serializeToken (DiffAttribute name value) =
      encodeTypeAndText 3 name >> S.serialize (V.VarInt $ B.length valueBytes ) >> P.putByteString valueBytes
      where
        valueBytes = encodeUtf8 value
    serializeToken (DiffCloseTag) = S.serialize (4 :: Word8)
    serializeToken (DiffSoloTag text) = encodeTypeAndText 5 text
    serializeToken (DiffDoctype text) = encodeTypeAndText 6 text
    -- Pack type (low 3 bits) and a short length (high 5 bits) into one byte.
    combineByte :: Int -> Int -> Word8
    combineByte typeInt contentLength = fromIntegral (typeInt .|. (contentLength `shiftL` 3))
    encodeTypeAndText typeId content = encodeTypeAndBytes typeId $ encodeUtf8 content
    encodeTypeAndBytes typeId content = ( serializeLengthAndType typeId $ B.length content ) >> P.putByteString content
    -- Short lengths (< 32) fit in the header byte; longer payloads emit the
    -- bare type byte followed by a VarInt of the remaining length.
    serializeLengthAndType typeId contentLength | contentLength < 32 = S.serialize (combineByte typeId contentLength)
                                                | otherwise = S.serialize ( (fromIntegral typeId ) :: Word8 ) >>
                                                              S.serialize (V.VarInt ((fromIntegral $ contentLength - 32 ) :: Word))