{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedLabels #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TupleSections #-} -- | Much of the parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/#NT-content -- -- As an Xml parser, this is very incomplete and rudimentary, hence not calling it an xml parser. -- -- My other reference was https://www.w3schools.com/xml/xml_syntax.asp (don't laugh). module Chart.Markup.Parser ( markupP, contentP, XmlDocument (..), xmlDocument, xmlProlog, xmlXMLDecl, xmlDoctypedecl, XmlMiscType, XmlMisc (..), xmlMisc, xmlComment, lt, gt, gtc, oct, sq, dq, wrappedQ, wrappedQNoGuard, eq, xmlName, xmlAtt, openTag, closeTag, emptyElemTag, -- * testing exampleDocument, ) where import Chart.FlatParse import Chart.Markup ( Content (..), Markup (Markup), attribute, ) import Data.ByteString (ByteString) import Data.String.Interpolate import FlatParse.Basic hiding (cut) import qualified FlatParse.Basic.Text as T import GHC.Generics import Prelude -- $setup -- -- >>> :set -XOverloadedLabels -- >>> :set -XOverloadedStrings -- >>> import Chart -- >>> import Optics.Core -- >>> import FlatParse.Basic -- >>> import Chart.FlatParse -- * special XML chars -- | opening tag -- -- >>> runParserMaybe lt "<" -- Just () lt :: Parser e () lt = $(char '<') -- `cut'` Lit "<" -- | closing tag char -- -- >>> runParserMaybe gt ">" -- Just () gt :: Parser e () gt = $(char '>') -- | self-closing tag -- -- >>> runParserMaybe gtc "/>" -- Just () gtc :: Parser e () gtc = $(string "/>") -- | open closer tag -- -- >>> runParserMaybe oct ">> runParserMaybe sq "''" -- Just () sq :: ParserT st e () sq = $(char '\'') -- | double quote -- -- >>> runParserMaybe dq "\"" -- Just () dq :: ParserT st e () dq = $(char '"') wrappedDq :: Parser e ByteString wrappedDq = wrapped dq (byteStringOf $ many (T.satisfy (/= '"'))) -- | guard check for closing quote wrappedSq :: Parser e ByteString wrappedSq = wrapped sq (byteStringOf $ many (T.satisfy (/= '\''))) -- | quote or double quote wrapped -- -- >>> runParserMaybe wrappedQ "\"quoted\"" -- Just "quoted" -- -- >>> runParserMaybe wrappedQ "'quoted'" -- Just "quoted" wrappedQ :: Parser e ByteString wrappedQ = wrappedDq <|> wrappedSq -- | quote or double quote wrapped -- -- >>> runParserMaybe (wrappedQNoGuard xmlName) "\"name\"" -- Just "name" -- -- but will consume quotes if the underlying parser does. -- -- >>> runParserMaybe (wrappedQNoGuard (many anyChar)) "\"name\"" -- Nothing wrappedQNoGuard :: Parser e a -> Parser e a wrappedQNoGuard p = wrapped dq p <|> wrapped sq p -- | xml production [25] -- -- >>> runParserMaybe eq " = " -- Just () -- -- >>> runParserMaybe eq "=" -- Just () eq :: Parser e () eq = optional wss *> $(char '=') <* optional wss -- [4] nameStartChar :: Parser e Char nameStartChar = fusedSatisfy isLatinLetter isNameStartChar isNameStartChar isNameStartChar isNameStartChar :: Char -> Bool isNameStartChar x = (x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || (x == ':') || (x == '_') || (x >= '\xC0' && x <= '\xD6') || (x >= '\xD8' && x <= '\xF6') || (x >= '\xF8' && x <= '\x2FF') || (x >= '\x370' && x <= '\x37D') || (x >= '\x37F' && x <= '\x1FFF') || (x >= '\x200C' && x <= '\x200D') || (x >= '\x2070' && x <= '\x218F') || (x >= '\x2C00' && x <= '\x2FEF') || (x >= '\x3001' && x <= '\xD7FF') || (x >= '\xF900' && x <= '\xFDCF') || (x >= '\xFDF0' && x <= '\xFFFD') || (x >= '\x10000' && x <= '\xEFFFF') -- [4a] nameChar :: Parser e Char nameChar = fusedSatisfy isNameCharAscii isNameCharExt isNameCharExt isNameCharExt isNameCharAscii :: Char -> Bool isNameCharAscii x = (x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || (x >= '0' && x <= '9') || (x == ':') || (x == '_') || (x == '-') || (x == '.') isNameCharExt :: Char -> Bool isNameCharExt x = (x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || (x >= '0' && x <= '9') || (x == ':') || (x == '_') || (x == '-') || (x == '.') || (x == '\xB7') || (x >= '\xC0' && x <= '\xD6') || (x >= '\xD8' && x <= '\xF6') || (x >= '\xF8' && x <= '\x2FF') || (x >= '\x300' && x <= '\x36F') || (x >= '\x370' && x <= '\x37D') || (x >= '\x37F' && x <= '\x1FFF') || (x >= '\x200C' && x <= '\x200D') || (x >= '\x203F' && x <= '\x2040') || (x >= '\x2070' && x <= '\x218F') || (x >= '\x2C00' && x <= '\x2FEF') || (x >= '\x3001' && x <= '\xD7FF') || (x >= '\xF900' && x <= '\xFDCF') || (x >= '\xFDF0' && x <= '\xFFFD') || (x >= '\x10000' && x <= '\xEFFFF') -- | name string according to xml production rule [5] -- -- >>> runParserMaybe xmlName "name" -- Just "name" xmlName :: Parser e ByteString xmlName = byteStringOf (nameStartChar >> many nameChar) -- | attribute pair -- -- >>> runParserMaybe xmlAtt "style = 'fancy'" -- Just ("style","fancy") xmlAtt :: Parser e (ByteString, ByteString) xmlAtt = (,) <$> (xmlName <* eq) <*> wrappedQ -- | open xml tag as per xml production rule [40] -- -- >>> runParserMaybe openTag "" -- Just ("g",[("style","fancy")]) openTag :: Parser Error (ByteString, [(ByteString, ByteString)]) openTag = lt *> ((,) <$> xmlName <*> many (wss *> xmlAtt) <* optional wss) <* gt `cut'` Msg "open tag expected" -- | closing tag as per [42] -- -- >>> runParserMaybe closeTag "" -- Just "g" closeTag :: Parser Error ByteString closeTag = oct *> xmlName <* optional wss <* gt `cut'` Msg "close tag expected" -- | empty element tag as per [44] -- -- >>> runParserMaybe emptyElemTag "
" -- Just ("br",[]) emptyElemTag :: Parser Error (ByteString, [(ByteString, ByteString)]) emptyElemTag = lt *> ((,) <$> xmlName <*> many (wss *> xmlAtt) <* optional wss) <* gtc -- * comments xmlCommentOpen :: Parser e () xmlCommentOpen = $(string "") xmlCharNotMinus :: Parser e ByteString xmlCharNotMinus = byteStringOf $ satisfy (/= '-') xmlMinusPlusChar :: Parser e ByteString xmlMinusPlusChar = byteStringOf $ $(char '-') *> xmlCharNotMinus -- | xml comment -- -- -- >>> runParserMaybe xmlComment "" -- Just " comment " xmlComment :: Parser e ByteString xmlComment = xmlCommentOpen *> byteStringOf (many (xmlCharNotMinus <|> xmlMinusPlusChar)) <* xmlCommentClose -- * prolog -- | xml production rule [22] -- -- The library doesn't do any analysis of the prolog string nor produces it, hence it is just parsed as a ByteString -- -- >>> runParser (ws_ *> xmlProlog) exampleDocument -- OK "\n\n\n\n \n\n\n]>\n" "Hello World.\n" xmlProlog :: Parser e ByteString xmlProlog = byteStringOf $ xmlXMLDecl >> many xmlMisc >> optional (xmlDoctypedecl >> optional xmlMisc) -- | XML declaration as per production rule [23] -- -- >>> runParserMaybe xmlXMLDecl "" -- Just "" xmlXMLDecl :: Parser e ByteString xmlXMLDecl = byteStringOf $ $(string "> xmlVersionInfo >> optional xmlEncodingDecl >> optional wssDDecl >> optional wss >> $(string "?>") -- xml production [24] xmlVersionInfo :: Parser e ByteString xmlVersionInfo = byteStringOf $ wss >> $(string "version") >> eq >> wrappedQNoGuard xmlVersionNum -- | xml production [26] xmlVersionNum :: Parser e ByteString xmlVersionNum = byteStringOf ($(string "1.") >> some (satisfy isDigit)) -- | Whether an 'XmlMisc' is comment or whitespace data XmlMiscType = XMiscComment | XMiscS deriving (Generic, Show, Eq) -- | A comment or whitespace outside of the main document [27] -- -- not as per [27] (missing PI) data XmlMisc = XmlMisc {xmiscType :: XmlMiscType, xmiscContent :: ByteString} deriving (Generic, Show, Eq) -- | Parser for miscellaneous guff xmlMisc :: Parser e XmlMisc xmlMisc = (XmlMisc XMiscComment <$> xmlComment) <|> (XmlMisc XMiscS <$> wss) -- | Typical xml header text exampleDocument :: ByteString exampleDocument = [i| ]> Hello World. |] -- | Doctype declaration as per production rule [28] -- -- >>> runParserMaybe xmlDoctypedecl "" -- Just "" xmlDoctypedecl :: Parser e ByteString xmlDoctypedecl = byteStringOf $ $(string "> wss >> xmlName >> -- optional (wss >> xmlExternalID) >> optional wss >> optional bracketedSB >> optional wss >> $(char '>') bracketedSB :: Parser e [Char] bracketedSB = bracketed $(char '[') $(char ']') (many (satisfy (/= ']'))) -- [32] wssDDecl :: Parser e ByteString wssDDecl = byteStringOf $ wss *> $(string "standalone") *> eq *> xmlYesNo xmlYesNo :: Parser e ByteString xmlYesNo = wrappedQNoGuard (byteStringOf $ $(string "yes") <|> $(string "no")) -- | xml production [80] xmlEncodingDecl :: Parser e ByteString xmlEncodingDecl = wss *> $(string "encoding") *> eq *> wrappedQNoGuard xmlEncName -- [81] xmlEncName :: Parser e ByteString xmlEncName = byteStringOf (satisfyAscii isLatinLetter >> many (satisfyAscii (\x -> isLatinLetter x || isDigit x || elem x ("._-" :: [Char])))) -- main Parser -- | An XML document as pre production rule [1] data XmlDocument = XmlDocument ByteString Markup [XmlMisc] deriving (Show, Eq) -- | Note that the library builds a Markup as per the SVG standards and not a Document. -- -- >>> runParser (ws_ *> xmlDocument) exampleDocument -- OK (XmlDocument "\n\n\n\n \n\n\n]>\n" (Markup {tag = "foo", atts = Attributes {attMap = fromList []}, contents = [Content "Hello World."]}) [XmlMisc {xmiscType = XMiscS, xmiscContent = "\n"}]) "" xmlDocument :: Parser Error XmlDocument xmlDocument = XmlDocument <$> (ws_ *> xmlProlog) <*> markupP <*> many xmlMisc -- | Main parser for a single Markup (xml-like) element -- -- >>> runParser markupP "Hello World." -- OK (Markup {tag = "foo", atts = Attributes {attMap = fromList []}, contents = [Content "Hello World."]}) "" markupP :: Parser Error Markup markupP = ((\(n, as) -> Markup n (mconcat $ attribute <$> as) mempty) <$> emptyElemTag) <|> -- no close tag = open tag test ((\(n, as) c _ -> Markup n (mconcat $ attribute <$> as) c) <$> openTag <*> many contentP <*> closeTag `cut` ["open tag", "content", "close tag"]) -- | Inner contents of an element. -- -- >>> runParser (some contentP) "Hello World.content" -- OK [MarkupLeaf (Markup {tag = "foo", atts = Attributes {attMap = fromList []}, contents = [Content "Hello World."]}),Content "content",Comment " comment "] "" contentP :: Parser Error Content contentP = (MarkupLeaf <$> markupP) <|> (Comment <$> xmlComment) <|> (Content <$> byteStringOf (some (satisfy (/= '<'))))