-- | This module provides conversions between various different markup -- formats. In principle, it provides four different conversions: -- -- 1. Converting the Superdoc markup language to HTML. -- -- 2. Converting ASCII-armored Unicode to HTML. -- -- 3. Converting Unicode streams to ASCII-armor. -- -- 4. Converting Unicode streams to HTML. -- -- Conversions 1 and 2 are combined into a single parser for the -- Superdoc markup language, which is exposed by the function -- 'markup'. This is used by the post-Haddock hook. -- -- Conversion 3 is provided by the 'to_armor' function. Within the -- Superdoc workflow, this is used by the @superdoc-armor@ -- preprocessor, which is in turns run by the Haddock hook. It makes -- sense to keep conversions 2 and 3 in a single module, because -- they jointly define the format for the ASCII armor. -- -- Conversion 4 is provided by the 'to_html' function. It is used by -- the post-HsColour hook. module Distribution.Superdoc.Markup where import Distribution.Superdoc.UTF8 import Data.Char import qualified Data.Set as Set import Data.Set (Set) import System.IO import Text.ParserCombinators.ReadP -- ---------------------------------------------------------------------- -- * Format definitions -- $ The Superdoc markup language provides tags for superscripts, -- subscripts, and more. The ASCII armor format has been designed to -- hide Unicode characters from tools that do not understand them. The -- following markup is recognized: -- -- \[bold Markup:] -- -- * [bold [literal [super /text/]]]: superscript. -- -- * [bold [literal [sup /text/]]]: superscript. A synonym for -- [literal [super /text/]]. -- -- * [bold [literal [sub /text/]]]: subscript. -- -- * [bold [literal [exp /text/]]]: exponential function. -- -- * [bold [literal [bold /text/]]]: bold. -- -- * [bold [literal [center /text/]]]: centered. -- -- * [bold [literal [nobr /text/]]]: inhibit line breaks. -- -- * [bold [literal [image /filename/]]]: insert image. -- -- * [bold [literal [uni /nnnn/]]]: Unicode character. -- -- * [bold [literal [literal text]]]: literal text. Brackets \'[\' and -- \']\' may only occur in nested pairs. -- -- \[bold ASCII armor:] #ARMOR# -- -- * [bold uni_x_/nnnn/_x_]: armored Unicode lower-case character. -- -- * [bold UNI_x_/nnnn/_x_]: armored Unicode upper-case character. -- -- * [bold ==|/ssss/|==]: armored Unicode symbol and punctuation. -- -- Here, /nnnn/ is a decimal number representing a Unicode code -- point. Also /ssss/ is an encoding of a decimal number representing -- a Unicode code point, using the following symbols for digits: -- -- > ! = 1 ^ = 6 -- > ? = 2 + = 7 -- > ~ = 3 * = 8 -- > $ = 4 - = 9 -- > % = 5 . = 0 -- ---------------------------------------------------------------------- -- * Filters -- | A filter is basically a function from strings to strings. Ideally -- a filter is lazy, so that the input string is consumed -- incrementally; however, this is not strictly necessary. A filter -- may also return another result in addition to a string. type Filter a = String -> (String, a) -- | The identity filter. filter_id :: Filter () filter_id s = (s, ()) -- | Run a filter by reading from one handle and writing to another. -- The handles are set to binary mode. filter_handles :: Filter a -> Handle -> Handle -> IO a filter_handles filter fin fout = do hSetBinaryMode fin True hSetBinaryMode fout True input <- hGetContents fin let (output, a) = filter input hPutStr fout output return a -- | Run a filter by reading from a file and writing to another file. -- We do not assume that the two files are necessarily distinct, so -- special care is taken not to overwrite the output file until after -- the input file has been read. filter_file :: Filter a -> FilePath -> FilePath -> IO a filter_file filter infile outfile = do h <- openBinaryFile infile ReadMode input <- hGetContents h -- make sure we read it all case length input of 0 -> return () _ -> return () hClose h let (output, a) = filter input withBinaryFile outfile WriteMode $ \h -> do hPutStr h output return a -- | Run a filter on a number of files, overwriting each file in -- place. filter_files :: Filter a -> [FilePath] -> IO [a] filter_files filter files = do sequence [ filter_file filter f f | f <- files ] -- ---------------------------------------------------------------------- -- * Markup parser -- $ This section defines a simple grammar and parser for the Superdoc -- markup language, translating it to HTML. In addition, the parser -- also converts ASCII-armored Unicode to HTML. This is used to -- post-process Haddock's output. -- ---------------------------------------------------------------------- -- ** Top-level function -- | The top-level parser for Superdoc markup and ASCII armor, -- expressed as a filter. In addition to producing HTML output, this -- filter also returns the set of all image files that were linked to. markup :: Filter (Set FilePath) markup input = case readP_to_S markup_top input of ((output, images), "") : _ -> (output, images) _ -> error "markup: parse error" -- this should not happen -- ---------------------------------------------------------------------- -- ** Grammar definition -- | Top-level parser for Superdoc markup and ASCII armor. -- -- /top/ ::= (/other/ | /tag/ | /uni/ | /char/)*. markup_top :: ReadP (String, Set FilePath) markup_top = do lst <- many (lift markup_other <++ markup_tag <++ lift markup_uni <++ lift markup_char) eof return (concat (map fst lst), Set.unions (map snd lst)) -- | Lift a parser returning a string to a parser returning a string and an empty set. lift :: ReadP String -> ReadP (String, Set FilePath) lift p = do s <- p return (s, Set.empty) -- | Like 'markup', but only permit brackets \"[\" and \"]\" to occur -- in nested pairs. -- -- /nested/ ::= (/other/ | /tag/ | /uni/ | /bracketed/ | /underscore/)*. markup_nested :: ReadP (String, Set FilePath) markup_nested = do lst <- many (lift markup_other <++ markup_tag <++ lift markup_uni <++ markup_bracketed <++ lift markup_nonbracket) return (concat (map fst lst), Set.unions (map snd lst)) -- | Parse bracketed text. -- -- /bracketed/ ::= \"[\" /nested/ \"]\". markup_bracketed :: ReadP (String, Set FilePath) markup_bracketed = do char '[' (s, images) <- markup_nested char ']' return ("[" ++ s ++ "]", images) -- | Parse a single underscore \'_\'. -- -- /underscore/ ::= \"_\". markup_underscore :: ReadP String markup_underscore = do c <- satisfy (== '_') return [c] -- | Parse any single character except \'[\' and \']\'. -- -- /nonbracket/ ::= any character besides \'[\', \']\'. markup_nonbracket :: ReadP String markup_nonbracket = do c <- satisfy (\x -> x /= '[' && x /= ']') return [c] -- | Parse any sequence of non-special characters: anything but \'[\', -- \'u\', \'U\', \'=\', and \']\'. -- -- /other/ ::= (any character besides \'[\', \'u\', \'U\', \'=\', \']\')+. markup_other :: ReadP String markup_other = do s <- munch1 (\x -> not (x `elem` ['[', 'u', 'U', '=', ']'])) return s -- | Parse any one character. -- -- /char/ ::= any character. markup_char :: ReadP String markup_char = do c <- get return [c] -- | Parse a tag. -- -- /tag/ ::= \"[\" /keyword/ /body/ \"]\". markup_tag :: ReadP (String, Set FilePath) markup_tag = do char '[' name <- markup_keyword munch1 isSpace res <- markup_body name char ']' return res -- | Parse a keyword. -- -- /keyword/ ::= \"sup\" | \"super\" | \"sub\" | \"exp\" | \"bold\" | \"center\" | \"nobr\" | \"image\" | \"uni\" | \"literal\". markup_keyword :: ReadP String markup_keyword = choice [ string name | name <- keywords ] where keywords = ["sup", "super", "sub", "exp", "bold", "center", "nobr", "image", "uni", "literal"] -- | Parse an armored Unicode character. markup_uni :: ReadP String markup_uni = do s <- markup_uni_upper <++ markup_uni_lower <++ markup_uni_symbol return s -- | Parse an upper-case Unicode letter. -- -- /uni_upper/ ::= \"UNI_x_\" /digit/+ \"_x_\". markup_uni_upper :: ReadP String markup_uni_upper = do string "UNI_x_" n <- munch1 isDigit string "_x_" return ("&#" ++ n ++ ";") -- | Parse a lower-case Unicode letter. -- -- /uni_lower/ ::= \"uni_x_\" /digit/+ \"_x_\". markup_uni_lower :: ReadP String markup_uni_lower = do string "uni_x_" n <- munch1 isDigit string "_x_" return ("&#" ++ n ++ ";") -- | Parse a Unicode operator symbol. -- -- /uni_symbol/ ::= \"==|\" /symbol_digit/+ \"|==\". markup_uni_symbol :: ReadP String markup_uni_symbol = do string "==|" n <- many1 markup_symbol_digit string "|==" return ("&#" ++ n ++ ";") -- | Parse a symbol encoding a decimal digit. See 'to_armor' for the -- encoding used. markup_symbol_digit :: ReadP Char markup_symbol_digit = do c <- get case c of '!' -> return '1' '?' -> return '2' '~' -> return '3' '$' -> return '4' '%' -> return '5' '^' -> return '6' '+' -> return '7' '*' -> return '8' '-' -> return '9' '.' -> return '0' _ -> pfail -- | Parse any text with balanced brackets. -- -- /literal/ ::= (/nonbracket/ | /bracketed_literal/)*. markup_literal :: ReadP String markup_literal = do lst <- many (markup_nonbracket <++ markup_bracketed_literal) return (concat lst) -- | Parse any bracketed text with balanced brackets. -- -- /bracketed_literal/ ::= \"[\" /literal/ \"]\". markup_bracketed_literal :: ReadP String markup_bracketed_literal = do char '[' s <- markup_literal char ']' return ("[" ++ s ++ "]") -- | Parse a tag's body. What to do depends on the tag name. -- -- /body/ ::= /nested/ (for /keyword/ = sup, super, sub, exp, bold, center, nobr), -- -- /body/ ::= /filename/ (for /keyword/ = image), -- -- /body/ ::= /digit/+ (for /keyword/ = uni). -- -- /body/ ::= /literal/ (for /keyword/ = literal). markup_body :: String -> ReadP (String, Set FilePath) markup_body "sup" = do (s, images) <- markup_nested return ("" ++ s ++ "", images) markup_body "super" = do (s, images) <- markup_nested return ("" ++ s ++ "", images) markup_body "sub" = do (s, images) <- markup_nested return ("" ++ s ++ "", images) markup_body "exp" = do (s, images) <- markup_nested return ("e" ++ s ++ "", images) markup_body "bold" = do (s, images) <- markup_nested return ("" ++ s ++ "", images) markup_body "center" = do (s, images) <- markup_nested return ("
" ++ s ++ "
", images) markup_body "nobr" = do (s, images) <- markup_nested return ("" ++ s ++ "", images) markup_body "image" = do filename <- munch1 (\x -> x /= ']' && not (isSpace x)) return ("", Set.singleton filename) markup_body "uni" = do n <- munch1 isDigit skipSpaces return ("&#" ++ n ++ ";", Set.empty) markup_body "literal" = do n <- markup_literal return (n, Set.empty) -- ---------------------------------------------------------------------- -- * Unicode to HTML conversion -- | Convert a tokenized Unicode stream into HTML entities. Non-ASCII -- characters are represented as HTML entities of the form @&#@/nnnn/@;@. -- Any invalid characters are simply copied to the output. to_html :: [Token] -> String to_html [] = [] to_html (Unicode c:cs) | isAscii c = c : to_html cs | otherwise = ("&#" ++ (show (ord c)) ++ ";") ++ to_html cs to_html (Invalid c:cs) = c : to_html cs -- ---------------------------------------------------------------------- -- * Unicode to ASCII armor conversion -- | Convert a tokenized Unicode stream to ASCII armor. -- -- The armor is designed to preserve lexical validity: thus, the -- armored version of a valid Haskell lower-case identifier, -- upper-case identifier, or operator is again a valid identifier or -- operator of the same kind. This makes it possible to use armored -- Unicode in source code as well as documentation comments. -- -- The armoring is further designed to use only symbols that will not -- confuse GHC or Haddock. See <#ARMOR ASCII armor> for a description -- of the format. to_armor :: [Token] -> String to_armor [] = [] to_armor (Unicode c:cs) | isAscii c = c : to_armor cs | isUpper c = ("UNI_x_" ++ (show (ord c)) ++ "_x_") ++ to_armor cs | isSymbol c || isPunctuation c = ("==|" ++ (encode (show (ord c))) ++ "|==") ++ to_armor cs | otherwise = ("uni_x_" ++ (show (ord c)) ++ "_x_") ++ to_armor cs to_armor (Invalid c:cs) = c : to_armor cs -- | Encode a string of decimal digits as a string of symbols. See -- <#ARMOR ASCII armor> for the specific mapping used. encode :: String -> String encode = map encode_digit where encode_digit '1' = '!' encode_digit '2' = '?' encode_digit '3' = '~' encode_digit '4' = '$' encode_digit '5' = '%' encode_digit '6' = '^' encode_digit '7' = '+' encode_digit '8' = '*' encode_digit '9' = '-' encode_digit '0' = '.' encode_digit _ = error "encode_digit"