-- | A parser for gtk-doc formatted documentation, see -- https://developer.gnome.org/gtk-doc-manual/ for the spec. module Data.GI.CodeGen.GtkDoc ( parseGtkDoc , GtkDoc(..) , Token(..) , Language(..) , Link(..) , ListItem(..) , CRef(..) ) where import Prelude hiding (takeWhile) #if !MIN_VERSION_base(4,8,0) import Control.Applicative ((<$>), (<*)) #endif #if !MIN_VERSION_base(4,13,0) import Data.Monoid ((<>)) #endif import Control.Applicative ((<|>)) import Data.GI.GIR.BasicTypes (Name(Name)) import Data.Attoparsec.Text import Data.Char (isAlphaNum, isAlpha, isAscii) import qualified Data.Text as T import Data.Text (Text) -- | A parsed gtk-doc token. data Token = Literal Text | Comment Text | Verbatim Text | CodeBlock (Maybe Language) Text | ExternalLink Link | Image Link | List [ListItem] | SectionHeader Int GtkDoc -- ^ A section header of the given depth. | SymbolRef CRef deriving (Show, Eq) -- | A link to a resource, either offline or a section of the documentation. data Link = Link { linkName :: Text , linkAddress :: Text } deriving (Show, Eq) -- | An item in a list, given by a list of lines (not including ending -- newlines). The list is always non-empty, so we represent it by the -- first line and then a possibly empty list with the rest of the lines. data ListItem = ListItem GtkDoc [GtkDoc] deriving (Show, Eq) -- | The language for an embedded code block. newtype Language = Language Text deriving (Show, Eq) -- | A reference to some symbol in the API. data CRef = FunctionRef Name | OldFunctionRef Text | MethodRef Name Text | ParamRef Text | ConstantRef Text | SignalRef Name Text | OldSignalRef Text Text | LocalSignalRef Text | PropertyRef Name Text | OldPropertyRef Text Text | VMethodRef Text Text | VFuncRef Name Text | StructFieldRef Text Text | CTypeRef Text | TypeRef Name deriving (Show, Eq, Ord) -- | A parsed representation of gtk-doc formatted documentation. newtype GtkDoc = GtkDoc [Token] deriving (Show, Eq) -- | Parse the given gtk-doc formatted documentation. -- -- === __Examples__ -- >>> parseGtkDoc "" -- GtkDoc [] -- -- >>> parseGtkDoc "func()" -- GtkDoc [SymbolRef (OldFunctionRef "func")] -- -- >>> parseGtkDoc "literal" -- GtkDoc [Literal "literal"] -- -- >>> parseGtkDoc "This is a long literal" -- GtkDoc [Literal "This is a long literal"] -- -- >>> parseGtkDoc "Call foo() for free cookies" -- GtkDoc [Literal "Call ",SymbolRef (OldFunctionRef "foo"),Literal " for free cookies"] -- -- >>> parseGtkDoc "The signal ::activate is related to gtk_button_activate()." -- GtkDoc [Literal "The signal ",SymbolRef (LocalSignalRef "activate"),Literal " is related to ",SymbolRef (OldFunctionRef "gtk_button_activate"),Literal "."] -- -- >>> parseGtkDoc "The signal ##%#GtkButton::activate is related to gtk_button_activate()." -- GtkDoc [Literal "The signal ##%",SymbolRef (OldSignalRef "GtkButton" "activate"),Literal " is related to ",SymbolRef (OldFunctionRef "gtk_button_activate"),Literal "."] -- -- >>> parseGtkDoc "# A section\n\n## and a subsection ##\n" -- GtkDoc [SectionHeader 1 (GtkDoc [Literal "A section"]),Literal "\n",SectionHeader 2 (GtkDoc [Literal "and a subsection "])] -- -- >>> parseGtkDoc "Compact list:\n- First item\n- Second item" -- GtkDoc [Literal "Compact list:\n",List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]] -- -- >>> parseGtkDoc "Spaced list:\n\n- First item\n\n- Second item" -- GtkDoc [Literal "Spaced list:\n",List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]] -- -- >>> parseGtkDoc "List with urls:\n- [test](http://test)\n- ![](image.png)" -- GtkDoc [Literal "List with urls:\n",List [ListItem (GtkDoc [ExternalLink (Link {linkName = "test", linkAddress = "http://test"})]) [],ListItem (GtkDoc [Image (Link {linkName = "", linkAddress = "image.png"})]) []]] parseGtkDoc :: Text -> GtkDoc parseGtkDoc raw = case parseOnly (parseTokens <* endOfInput) raw of Left e -> error $ "gtk-doc parsing failed with error \"" <> e <> "\" on the input \"" <> T.unpack raw <> "\"" Right tks -> GtkDoc . coalesceLiterals . restoreSHPreNewlines . restoreListPreNewline $ tks -- | `parseSectionHeader` eats the newline before the section header, -- but `parseInitialSectionHeader` does not, since it only matches at -- the beginning of the text. This restores the newlines eaten by -- `parseSectionHeader`, so a `SectionHeader` returned by the parser -- can always be assumed /not/ to have an implicit starting newline. restoreSHPreNewlines :: [Token] -> [Token] restoreSHPreNewlines [] = [] restoreSHPreNewlines (i : rest) = i : restoreNewlines rest where restoreNewlines :: [Token] -> [Token] restoreNewlines [] = [] restoreNewlines (s@(SectionHeader _ _) : rest) = Literal "\n" : s : restoreNewlines rest restoreNewlines (x : rest) = x : restoreNewlines rest -- | `parseList` eats the newline before the list, restore it. restoreListPreNewline :: [Token] -> [Token] restoreListPreNewline [] = [] restoreListPreNewline (l@(List _) : rest) = Literal "\n" : l : restoreListPreNewline rest restoreListPreNewline (x : rest) = x : restoreListPreNewline rest -- | Accumulate consecutive literals into a single literal. coalesceLiterals :: [Token] -> [Token] coalesceLiterals tks = go Nothing tks where go :: Maybe Text -> [Token] -> [Token] go Nothing [] = [] go (Just l) [] = [Literal l] go Nothing (Literal l : rest) = go (Just l) rest go (Just l) (Literal l' : rest) = go (Just (l <> l')) rest go Nothing (tk : rest) = tk : go Nothing rest go (Just l) (tk : rest) = Literal l : tk : go Nothing rest -- | Parser for tokens. parseTokens :: Parser [Token] parseTokens = headerAndTokens <|> justTokens where -- In case the input starts by a section header. headerAndTokens :: Parser [Token] headerAndTokens = do header <- parseInitialSectionHeader tokens <- justTokens return (header : tokens) justTokens :: Parser [Token] justTokens = many' parseToken -- | Parse a single token. -- -- === __Examples__ -- >>> parseOnly (parseToken <* endOfInput) "func()" -- Right (SymbolRef (OldFunctionRef "func")) parseToken :: Parser Token parseToken = -- Note that the parsers overlap, so this is not as -- efficient as it could be (if we had combined parsers -- and then branched, so that there is no -- backtracking). But speed is not an issue here, so for -- clarity we keep the parsers distinct. The exception -- is parseFunctionRef, since it does not complicate the -- parser much, and it is the main source of -- backtracking. parseFunctionRef <|> parseMethod <|> parseConstructor <|> parseSignal <|> parseId <|> parseLocalSignal <|> parseProperty <|> parseVMethod <|> parseStructField <|> parseClass <|> parseCType <|> parseConstant <|> parseParam <|> parseEscaped <|> parseCodeBlock <|> parseVerbatim <|> parseUrl <|> parseImage <|> parseSectionHeader <|> parseList <|> parseComment <|> parseBoringLiteral -- | Whether the given character is valid in a C identifier. isCIdent :: Char -> Bool isCIdent '_' = True isCIdent c = isAscii c && isAlphaNum c -- | Something that could be a valid C identifier (loosely speaking, -- we do not need to be too strict here). parseCIdent :: Parser Text parseCIdent = takeWhile1 isCIdent -- | Parse a function ref parseFunctionRef :: Parser Token parseFunctionRef = parseOldFunctionRef <|> parseNewFunctionRef -- | Parse an unresolved reference to a C symbol in new gtk-doc notation. parseId :: Parser Token parseId = do _ <- string "[id@" ident <- parseCIdent _ <- char ']' return (SymbolRef (OldFunctionRef ident)) -- | Parse a function ref, given by a valid C identifier followed by -- '()', for instance 'gtk_widget_show()'. If the identifier is not -- followed by "()", return it as a literal instead. -- -- === __Examples__ -- >>> parseOnly (parseFunctionRef <* endOfInput) "test_func()" -- Right (SymbolRef (OldFunctionRef "test_func")) -- -- >>> parseOnly (parseFunctionRef <* endOfInput) "not_a_func" -- Right (Literal "not_a_func") parseOldFunctionRef :: Parser Token parseOldFunctionRef = do ident <- parseCIdent option (Literal ident) (string "()" >> return (SymbolRef (OldFunctionRef ident))) -- | Parse a function name in new style, of the form -- > [func@Namespace.c_func_name] -- -- === __Examples__ -- >>> parseOnly (parseFunctionRef <* endOfInput) "[func@Gtk.init]" -- Right (SymbolRef (FunctionRef (Name {namespace = "Gtk", name = "init"}))) parseNewFunctionRef :: Parser Token parseNewFunctionRef = do _ <- string "[func@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- takeWhile1 isCIdent _ <- char ']' return $ SymbolRef $ FunctionRef (Name ns n) -- | Parse a method name, of the form -- > [method@Namespace.Object.c_func_name] -- -- === __Examples__ -- >>> parseOnly (parseMethod <* endOfInput) "[method@Gtk.Button.set_child]" -- Right (SymbolRef (MethodRef (Name {namespace = "Gtk", name = "Button"}) "set_child")) parseMethod :: Parser Token parseMethod = do _ <- string "[method@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- takeWhile1 isCIdent _ <- char '.' method <- takeWhile1 isCIdent _ <- char ']' return $ SymbolRef $ MethodRef (Name ns n) method -- | Parse a reference to a constructor, of the form -- > [ctor@Namespace.Object.c_func_name] -- -- === __Examples__ -- >>> parseOnly (parseConstructor <* endOfInput) "[ctor@Gtk.Builder.new_from_file]" -- Right (SymbolRef (MethodRef (Name {namespace = "Gtk", name = "Builder"}) "new_from_file")) parseConstructor :: Parser Token parseConstructor = do _ <- string "[ctor@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- takeWhile1 isCIdent _ <- char '.' method <- takeWhile1 isCIdent _ <- char ']' return $ SymbolRef $ MethodRef (Name ns n) method -- | Parse a reference to a type, of the form -- > [class@Namespace.Name] -- an interface of the form -- > [iface@Namespace.Name] -- or an enum type: -- > [enum@Namespace.Name] -- -- === __Examples__ -- >>> parseOnly (parseClass <* endOfInput) "[class@Gtk.Dialog]" -- Right (SymbolRef (TypeRef (Name {namespace = "Gtk", name = "Dialog"}))) -- -- >>> parseOnly (parseClass <* endOfInput) "[iface@Gtk.Editable]" -- Right (SymbolRef (TypeRef (Name {namespace = "Gtk", name = "Editable"}))) -- -- >>> parseOnly (parseClass <* endOfInput) "[enum@Gtk.SizeRequestMode]" -- Right (SymbolRef (TypeRef (Name {namespace = "Gtk", name = "SizeRequestMode"}))) parseClass :: Parser Token parseClass = do _ <- string "[class@" <|> string "[iface@" <|> string "[enum@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- takeWhile1 isCIdent _ <- char ']' return $ SymbolRef $ TypeRef (Name ns n) parseSignal :: Parser Token parseSignal = parseOldSignal <|> parseNewSignal -- | Parse an old style signal name, of the form -- > #Object::signal -- -- === __Examples__ -- >>> parseOnly (parseOldSignal <* endOfInput) "#GtkButton::activate" -- Right (SymbolRef (OldSignalRef "GtkButton" "activate")) parseOldSignal :: Parser Token parseOldSignal = do _ <- char '#' obj <- parseCIdent _ <- string "::" signal <- signalOrPropName return (SymbolRef (OldSignalRef obj signal)) -- | Parse a new style signal ref, of the form -- > [signal@Namespace.Object::signal-name] -- -- === __Examples__ -- >>> parseOnly (parseNewSignal <* endOfInput) "[signal@Gtk.AboutDialog::activate-link]" -- Right (SymbolRef (SignalRef (Name {namespace = "Gtk", name = "AboutDialog"}) "activate-link")) parseNewSignal :: Parser Token parseNewSignal = do _ <- string "[signal@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- parseCIdent _ <- string "::" signal <- takeWhile1 (\c -> (isAscii c && isAlpha c) || c == '-') _ <- char ']' return (SymbolRef (SignalRef (Name ns n) signal)) -- | Parse a reference to a signal defined in the current module, of the form -- > ::signal -- -- === __Examples__ -- >>> parseOnly (parseLocalSignal <* endOfInput) "::activate" -- Right (SymbolRef (LocalSignalRef "activate")) parseLocalSignal :: Parser Token parseLocalSignal = do _ <- string "::" signal <- signalOrPropName return (SymbolRef (LocalSignalRef signal)) -- | Parse a property name in the old style, of the form -- > #Object:property -- -- === __Examples__ -- >>> parseOnly (parseOldProperty <* endOfInput) "#GtkButton:always-show-image" -- Right (SymbolRef (OldPropertyRef "GtkButton" "always-show-image")) parseOldProperty :: Parser Token parseOldProperty = do _ <- char '#' obj <- parseCIdent _ <- char ':' property <- signalOrPropName return (SymbolRef (OldPropertyRef obj property)) -- | Parse a property name in the new style: -- > [property@Namespace.Object:property-name] -- -- === __Examples__ -- >>> parseOnly (parseNewProperty <* endOfInput) "[property@Gtk.ProgressBar:show-text]" -- Right (SymbolRef (PropertyRef (Name {namespace = "Gtk", name = "ProgressBar"}) "show-text")) parseNewProperty :: Parser Token parseNewProperty = do _ <- string "[property@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- parseCIdent _ <- char ':' property <- takeWhile1 (\c -> (isAscii c && isAlpha c) || c == '-') _ <- char ']' return (SymbolRef (PropertyRef (Name ns n) property)) -- | Parse a property parseProperty :: Parser Token parseProperty = parseOldProperty <|> parseNewProperty -- | Parse an xml comment, of the form -- > -- Note that this function keeps spaces. -- -- === __Examples__ -- >>> parseOnly (parseComment <* endOfInput) "" -- Right (Comment " comment ") parseComment :: Parser Token parseComment = do comment <- string "") return (Comment $ T.pack comment) -- | Parse an old style reference to a virtual method, of the form -- > #Struct.method() -- -- === __Examples__ -- >>> parseOnly (parseOldVMethod <* endOfInput) "#Foo.bar()" -- Right (SymbolRef (VMethodRef "Foo" "bar")) parseOldVMethod :: Parser Token parseOldVMethod = do _ <- char '#' obj <- parseCIdent _ <- char '.' method <- parseCIdent _ <- string "()" return (SymbolRef (VMethodRef obj method)) -- | Parse a new style reference to a virtual function, of the form -- > [vfunc@Namespace.Object.vfunc_name] -- -- >>> parseOnly (parseVFunc <* endOfInput) "[vfunc@Gtk.Widget.get_request_mode]" -- Right (SymbolRef (VFuncRef (Name {namespace = "Gtk", name = "Widget"}) "get_request_mode")) parseVFunc :: Parser Token parseVFunc = do _ <- string "[vfunc@" ns <- takeWhile1 (\c -> isAscii c && isAlpha c) _ <- char '.' n <- parseCIdent _ <- char '.' vfunc <- parseCIdent _ <- char ']' return (SymbolRef (VFuncRef (Name ns n) vfunc)) -- | Parse a reference to a virtual method parseVMethod :: Parser Token parseVMethod = parseOldVMethod <|> parseVFunc -- | Parse a reference to a struct field, of the form -- > #Struct.field -- -- === __Examples__ -- >>> parseOnly (parseStructField <* endOfInput) "#Foo.bar" -- Right (SymbolRef (StructFieldRef "Foo" "bar")) parseStructField :: Parser Token parseStructField = do _ <- char '#' obj <- parseCIdent _ <- char '.' field <- parseCIdent return (SymbolRef (StructFieldRef obj field)) -- | Parse a reference to a C type, of the form -- > #Type -- -- === __Examples__ -- >>> parseOnly (parseCType <* endOfInput) "#Foo" -- Right (SymbolRef (CTypeRef "Foo")) parseCType :: Parser Token parseCType = do _ <- char '#' obj <- parseCIdent return (SymbolRef (CTypeRef obj)) -- | Parse a constant, of the form -- > %CONSTANT_NAME -- -- === __Examples__ -- >>> parseOnly (parseConstant <* endOfInput) "%TEST_CONSTANT" -- Right (SymbolRef (ConstantRef "TEST_CONSTANT")) parseConstant :: Parser Token parseConstant = do _ <- char '%' c <- parseCIdent return (SymbolRef (ConstantRef c)) -- | Parse a reference to a parameter, of the form -- > @param_name -- -- === __Examples__ -- >>> parseOnly (parseParam <* endOfInput) "@test_param" -- Right (SymbolRef (ParamRef "test_param")) parseParam :: Parser Token parseParam = do _ <- char '@' param <- parseCIdent return (SymbolRef (ParamRef param)) -- | Name of a signal or property name. Similar to a C identifier, but -- hyphens are allowed too. signalOrPropName :: Parser Text signalOrPropName = takeWhile1 isSignalOrPropIdent where isSignalOrPropIdent :: Char -> Bool isSignalOrPropIdent '-' = True isSignalOrPropIdent c = isCIdent c -- | Parse a escaped special character, i.e. one preceded by '\'. parseEscaped :: Parser Token parseEscaped = do _ <- char '\\' c <- satisfy (`elem` ("#@%\\`" :: [Char])) return $ Literal (T.singleton c) -- | Parse a literal, i.e. anything without a known special -- meaning. Note that this parser always consumes the first character, -- regardless of what it is. parseBoringLiteral :: Parser Token parseBoringLiteral = do c <- anyChar boring <- takeWhile (not . special) return $ Literal (T.cons c boring) -- | List of special characters from the point of view of the parser -- (in the sense that they may be the beginning of something with a -- special interpretation). special :: Char -> Bool special '#' = True special '@' = True special '%' = True special '\\' = True special '`' = True special '|' = True special '[' = True special '!' = True special '\n' = True special ':' = True special c = isCIdent c -- | Parse a verbatim string, of the form -- > `verbatim text` -- -- === __Examples__ -- >>> parseOnly (parseVerbatim <* endOfInput) "`Example quote!`" -- Right (Verbatim "Example quote!") parseVerbatim :: Parser Token parseVerbatim = do _ <- char '`' v <- takeWhile1 (/= '`') _ <- char '`' return $ Verbatim v -- | Parse a URL in Markdown syntax, of the form -- > [name](url) -- -- === __Examples__ -- >>> parseOnly (parseUrl <* endOfInput) "[haskell](http://haskell.org)" -- Right (ExternalLink (Link {linkName = "haskell", linkAddress = "http://haskell.org"})) parseUrl :: Parser Token parseUrl = do _ <- char '[' name <- takeWhile1 (/= ']') _ <- string "](" address <- takeWhile1 (/= ')') _ <- char ')' return $ ExternalLink $ Link {linkName = name, linkAddress = address} -- | Parse an image reference, of the form -- > ![label](url) -- -- === __Examples__ -- >>> parseOnly (parseImage <* endOfInput) "![](diagram.png)" -- Right (Image (Link {linkName = "", linkAddress = "diagram.png"})) parseImage :: Parser Token parseImage = do _ <- string "![" name <- takeWhile (/= ']') _ <- string "](" address <- takeWhile1 (/= ')') _ <- char ')' return $ Image $ Link {linkName = name, linkAddress = address} -- | Parse a code block embedded in the documentation. parseCodeBlock :: Parser Token parseCodeBlock = parseOldStyleCodeBlock <|> parseNewStyleCodeBlock -- | Parse a new style code block, of the form -- > ```c -- > some c code -- > ``` -- -- === __Examples__ -- >>> parseOnly (parseNewStyleCodeBlock <* endOfInput) "```c\nThis is C code\n```" -- Right (CodeBlock (Just (Language "c")) "This is C code") parseNewStyleCodeBlock :: Parser Token parseNewStyleCodeBlock = do _ <- string "```" lang <- T.strip <$> takeWhile (/= '\n') _ <- char '\n' let maybeLang = if T.null lang then Nothing else Just lang code <- T.pack <$> manyTill anyChar (string "\n```") return $ CodeBlock (Language <$> maybeLang) code -- | Parse an old style code block, of the form -- > |[ code ]| -- -- === __Examples__ -- >>> parseOnly (parseOldStyleCodeBlock <* endOfInput) "|[this is code]|" -- Right (CodeBlock Nothing "this is code") -- -- >>> parseOnly (parseOldStyleCodeBlock <* endOfInput) "|[this is C code]|" -- Right (CodeBlock (Just (Language "C")) "this is C code") parseOldStyleCodeBlock :: Parser Token parseOldStyleCodeBlock = do _ <- string "|[" lang <- (Just <$> parseLanguage) <|> return Nothing code <- T.pack <$> manyTill anyChar (string "]|") return $ CodeBlock lang code -- | Parse the language of a code block, specified as a comment. parseLanguage :: Parser Language parseLanguage = do _ <- string "" return $ Language lang -- | Parse a section header, given by a number of hash symbols, and -- then ordinary text. Note that this parser "eats" the newline before -- and after the section header. parseSectionHeader :: Parser Token parseSectionHeader = char '\n' >> parseInitialSectionHeader -- | Parse a section header at the beginning of the text. I.e. this is -- the same as `parseSectionHeader`, but we do not expect a newline as -- a first character. -- -- === __Examples__ -- >>> parseOnly (parseInitialSectionHeader <* endOfInput) "### Hello! ###\n" -- Right (SectionHeader 3 (GtkDoc [Literal "Hello! "])) -- -- >>> parseOnly (parseInitialSectionHeader <* endOfInput) "# Hello!\n" -- Right (SectionHeader 1 (GtkDoc [Literal "Hello!"])) parseInitialSectionHeader :: Parser Token parseInitialSectionHeader = do hashes <- takeWhile1 (== '#') _ <- many1 space heading <- takeWhile1 (notInClass "#\n") _ <- (string hashes >> char '\n') <|> (char '\n') return $ SectionHeader (T.length hashes) (parseGtkDoc heading) -- | Parse a list header. Note that the newline before the start of -- the list is "eaten" by this parser, but is restored later by -- `parseGtkDoc`. -- -- === __Examples__ -- >>> parseOnly (parseList <* endOfInput) "\n- First item\n- Second item" -- Right (List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]) -- -- >>> parseOnly (parseList <* endOfInput) "\n\n- Two line\n item\n\n- Second item,\n also two lines" -- Right (List [ListItem (GtkDoc [Literal "Two line"]) [GtkDoc [Literal "item"]],ListItem (GtkDoc [Literal "Second item,"]) [GtkDoc [Literal "also two lines"]]]) parseList :: Parser Token parseList = do items <- many1 parseListItem return $ List items where parseListItem :: Parser ListItem parseListItem = do _ <- char '\n' _ <- string "\n- " <|> string "- " first <- takeWhile1 (/= '\n') rest <- many' parseLine return $ ListItem (parseGtkDoc first) (map parseGtkDoc rest) parseLine :: Parser Text parseLine = string "\n " >> takeWhile1 (/= '\n')