-- | This module provides conversions between various different markup
-- formats. In principle, it provides four different conversions:
--
-- 1. Converting the Superdoc markup language to HTML.
--
-- 2. Converting ASCII-armored Unicode to HTML.
--
-- 3. Converting Unicode streams to ASCII-armor.
--
-- 4. Converting Unicode streams to HTML.
--
-- Conversions 1 and 2 are combined into a single parser for the
-- Superdoc markup language, which is exposed by the function
-- 'markup'. This is used by the post-Haddock hook.
--
-- Conversion 3 is provided by the 'to_armor' function. Within the
-- Superdoc workflow, this is used by the @superdoc-armor@
-- preprocessor, which is in turns run by the Haddock hook. It makes
-- sense to keep conversions 2 and 3 in a single module, because
-- they jointly define the format for the ASCII armor.
--
-- Conversion 4 is provided by the 'to_html' function. It is used by
-- the post-HsColour hook.
module Distribution.Superdoc.Markup where
import Distribution.Superdoc.UTF8
import Data.Char
import qualified Data.Set as Set
import Data.Set (Set)
import System.IO
import Text.ParserCombinators.ReadP
-- ----------------------------------------------------------------------
-- * Format definitions
-- $ The Superdoc markup language provides tags for superscripts,
-- subscripts, and more. The ASCII armor format has been designed to
-- hide Unicode characters from tools that do not understand them. The
-- following markup is recognized:
--
-- \[bold Markup:]
--
-- * [bold [literal [super /text/]]]: superscript.
--
-- * [bold [literal [sup /text/]]]: superscript. A synonym for
-- [literal [super /text/]].
--
-- * [bold [literal [sub /text/]]]: subscript.
--
-- * [bold [literal [exp /text/]]]: exponential function.
--
-- * [bold [literal [bold /text/]]]: bold.
--
-- * [bold [literal [nobr /text/]]]: inhibit line breaks.
--
-- * [bold [literal [image /filename/]]]: insert image.
--
-- * [bold [literal [uni /nnnn/]]]: Unicode character.
--
-- * [bold [literal [literal text]]]: literal text. Brackets \'[\' and
-- \']\' may only occur in nested pairs.
--
-- \[bold ASCII armor:] #ARMOR#
--
-- * [bold uni__/nnnn/__]: armored Unicode lower-case character.
--
-- * [bold UNI__/nnnn/__]: armored Unicode upper-case character.
--
-- * [bold ==|/ssss/|==]: armored Unicode symbol and punctuation.
--
-- Here, /nnnn/ is a decimal number representing a Unicode code
-- point. Also /ssss/ is an encoding of a decimal number representing
-- a Unicode code point, using the following symbols for digits:
--
-- > ! = 1 ^ = 6
-- > ? = 2 + = 7
-- > ~ = 3 * = 8
-- > $ = 4 - = 9
-- > % = 5 . = 0
-- ----------------------------------------------------------------------
-- * Filters
-- | A filter is basically a function from strings to strings. Ideally
-- a filter is lazy, so that the input string is consumed
-- incrementally; however, this is not strictly necessary. A filter
-- may also return another result in addition to a string.
type Filter a = String -> (String, a)
-- | The identity filter.
filter_id :: Filter ()
filter_id s = (s, ())
-- | Run a filter by reading from one handle and writing to another.
-- The handles are set to binary mode.
filter_handles :: Filter a -> Handle -> Handle -> IO a
filter_handles filter fin fout = do
hSetBinaryMode fin True
hSetBinaryMode fout True
input <- hGetContents fin
let (output, a) = filter input
hPutStr fout output
return a
-- | Run a filter by reading from a file and writing to another file.
-- We do not assume that the two files are necessarily distinct, so
-- special care is taken not to overwrite the output file until after
-- the input file has been read.
filter_file :: Filter a -> FilePath -> FilePath -> IO a
filter_file filter infile outfile = do
h <- openBinaryFile infile ReadMode
input <- hGetContents h
-- make sure we read it all
case length input of
0 -> return ()
_ -> return ()
hClose h
let (output, a) = filter input
withBinaryFile outfile WriteMode $ \h -> do
hPutStr h output
return a
-- | Run a filter on a number of files, overwriting each file in
-- place.
filter_files :: Filter a -> [FilePath] -> IO [a]
filter_files filter files = do
sequence [ filter_file filter f f | f <- files ]
-- ----------------------------------------------------------------------
-- * Markup parser
-- $ This section defines a simple grammar and parser for the Superdoc
-- markup language, translating it to HTML. In addition, the parser
-- also converts ASCII-armored Unicode to HTML. This is used to
-- post-process Haddock's output.
-- ----------------------------------------------------------------------
-- ** Top-level function
-- | The top-level parser for Superdoc markup and ASCII armor,
-- expressed as a filter. In addition to producing HTML output, this
-- filter also returns the set of all image files that were linked to.
markup :: Filter (Set FilePath)
markup input = case readP_to_S markup_top input of
((output, images), "") : _ -> (output, images)
_ -> error "markup: parse error" -- this should not happen
-- ----------------------------------------------------------------------
-- ** Grammar definition
-- | Top-level parser for Superdoc markup and ASCII armor.
--
-- /top/ ::= (/other/ | /tag/ | /uni/ | /char/)*.
markup_top :: ReadP (String, Set FilePath)
markup_top = do
lst <- many (lift markup_other <++ markup_tag <++ lift markup_uni <++ lift markup_char)
eof
return (concat (map fst lst), Set.unions (map snd lst))
-- | Lift a parser returning a string to a parser returning a string and an empty set.
lift :: ReadP String -> ReadP (String, Set FilePath)
lift p = do
s <- p
return (s, Set.empty)
-- | Like 'markup', but only permit brackets \"[\" and \"]\" to occur
-- in nested pairs.
--
-- /nested/ ::= (/other/ | /tag/ | /uni/ | /bracketed/ | /underscore/)*.
markup_nested :: ReadP (String, Set FilePath)
markup_nested = do
lst <- many (lift markup_other <++ markup_tag <++ lift markup_uni <++ markup_bracketed <++ lift markup_nonbracket)
return (concat (map fst lst), Set.unions (map snd lst))
-- | Parse bracketed text.
--
-- /bracketed/ ::= \"[\" /nested/ \"]\".
markup_bracketed :: ReadP (String, Set FilePath)
markup_bracketed = do
char '['
(s, images) <- markup_nested
char ']'
return ("[" ++ s ++ "]", images)
-- | Parse a single underscore \'_\'.
--
-- /underscore/ ::= \"_\".
markup_underscore :: ReadP String
markup_underscore = do
c <- satisfy (== '_')
return [c]
-- | Parse any single character except \'[\' and \']\'.
--
-- /nonbracket/ ::= any character besides \'[\', \']\'.
markup_nonbracket :: ReadP String
markup_nonbracket = do
c <- satisfy (\x -> x /= '[' && x /= ']')
return [c]
-- | Parse any sequence of non-special characters: anything but \'[\',
-- \'u\', \'U\', \'=\', and \']\'.
--
-- /other/ ::= (any character besides \'[\', \'u\', \'U\', \'=\', \']\')+.
markup_other :: ReadP String
markup_other = do
s <- munch1 (\x -> not (x `elem` ['[', 'u', 'U', '=', ']']))
return s
-- | Parse any one character.
--
-- /char/ ::= any character.
markup_char :: ReadP String
markup_char = do
c <- get
return [c]
-- | Parse a tag.
--
-- /tag/ ::= \"[\" /keyword/ /body/ \"]\".
markup_tag :: ReadP (String, Set FilePath)
markup_tag = do
char '['
name <- markup_keyword
munch1 isSpace
res <- markup_body name
char ']'
return res
-- | Parse a keyword.
--
-- /keyword/ ::= \"sup\" | \"super\" | \"sub\" | \"exp\" | \"bold\" | \"nobr\" | \"image\" | \"uni\" | \"literal\".
markup_keyword :: ReadP String
markup_keyword =
choice [ string name | name <- keywords ]
where
keywords = ["sup", "super", "sub", "exp", "bold", "nobr", "image", "uni", "literal"]
-- | Parse an armored Unicode character.
markup_uni :: ReadP String
markup_uni = do
s <- markup_uni_upper <++ markup_uni_lower <++ markup_uni_symbol
return s
-- | Parse an upper-case Unicode letter.
--
-- /uni_upper/ ::= \"UNI__\" /digit/+ \"__\".
markup_uni_upper :: ReadP String
markup_uni_upper = do
string "UNI__"
n <- munch1 isDigit
string "__"
return ("" ++ n ++ ";")
-- | Parse a lower-case Unicode letter.
--
-- /uni_lower/ ::= \"uni__\" /digit/+ \"__\".
markup_uni_lower :: ReadP String
markup_uni_lower = do
string "uni__"
n <- munch1 isDigit
string "__"
return ("" ++ n ++ ";")
-- | Parse a Unicode operator symbol.
--
-- /uni_symbol/ ::= \"==|\" /symbol_digit/+ \"|==\".
markup_uni_symbol :: ReadP String
markup_uni_symbol = do
string "==|"
n <- many1 markup_symbol_digit
string "|=="
return ("" ++ n ++ ";")
-- | Parse a symbol encoding a decimal digit. See 'to_armor' for the
-- encoding used.
markup_symbol_digit :: ReadP Char
markup_symbol_digit = do
c <- get
case c of
'!' -> return '1'
'?' -> return '2'
'~' -> return '3'
'$' -> return '4'
'%' -> return '5'
'^' -> return '6'
'+' -> return '7'
'*' -> return '8'
'-' -> return '9'
'.' -> return '0'
_ -> pfail
-- | Parse any text with balanced brackets.
--
-- /literal/ ::= (/nonbracket/ | /bracketed_literal/)*.
markup_literal :: ReadP String
markup_literal = do
lst <- many (markup_nonbracket <++ markup_bracketed_literal)
return (concat lst)
-- | Parse any bracketed text with balanced brackets.
--
-- /bracketed_literal/ ::= \"[\" /literal/ \"]\".
markup_bracketed_literal :: ReadP String
markup_bracketed_literal = do
char '['
s <- markup_literal
char ']'
return ("[" ++ s ++ "]")
-- | Parse a tag's body. What to do depends on the tag name.
--
-- /body/ ::= /nested/ (for /keyword/ = sup, super, sub, exp, bold, nobr),
--
-- /body/ ::= /filename/ (for /keyword/ = image),
--
-- /body/ ::= /digit/+ (for /keyword/ = uni).
--
-- /body/ ::= /literal/ (for /keyword/ = literal).
markup_body :: String -> ReadP (String, Set FilePath)
markup_body "sup" = do
(s, images) <- markup_nested
return ("" ++ s ++ "", images)
markup_body "super" = do
(s, images) <- markup_nested
return ("" ++ s ++ "", images)
markup_body "sub" = do
(s, images) <- markup_nested
return ("" ++ s ++ "", images)
markup_body "exp" = do
(s, images) <- markup_nested
return ("e" ++ s ++ "", images)
markup_body "bold" = do
(s, images) <- markup_nested
return ("" ++ s ++ "", images)
markup_body "nobr" = do
(s, images) <- markup_nested
return ("" ++ s ++ "", images)
markup_body "image" = do
filename <- munch1 (\x -> x /= ']' && not (isSpace x))
return ("", Set.singleton filename)
markup_body "uni" = do
n <- munch1 isDigit
skipSpaces
return ("" ++ n ++ ";", Set.empty)
markup_body "literal" = do
n <- markup_literal
return (n, Set.empty)
-- ----------------------------------------------------------------------
-- * Unicode to HTML conversion
-- | Convert a tokenized Unicode stream into HTML entities. Non-ASCII
-- characters are represented as HTML entities of the form @@/nnnn/@;@.
-- Any invalid characters are simply copied to the output.
to_html :: [Token] -> String
to_html [] = []
to_html (Unicode c:cs)
| isAscii c = c : to_html cs
| otherwise = ("" ++ (show (ord c)) ++ ";") ++ to_html cs
to_html (Invalid c:cs) = c : to_html cs
-- ----------------------------------------------------------------------
-- * Unicode to ASCII armor conversion
-- | Convert a tokenized Unicode stream to ASCII armor.
--
-- The armor is designed to preserve lexical validity: thus, the
-- armored version of a valid Haskell lower-case identifier,
-- upper-case identifier, or operator is again a valid identifier or
-- operator of the same kind. This makes it possible to use armored
-- Unicode in source code as well as documentation comments.
--
-- The armoring is further designed to use only symbols that will not
-- confuse GHC or Haddock. See <#ARMOR ASCII armor> for a description
-- of the format.
to_armor :: [Token] -> String
to_armor [] = []
to_armor (Unicode c:cs)
| isAscii c = c : to_armor cs
| isUpper c =
("UNI__" ++ (show (ord c)) ++ "__") ++ to_armor cs
| isSymbol c || isPunctuation c =
("==|" ++ (encode (show (ord c))) ++ "|==") ++ to_armor cs
| otherwise =
("uni__" ++ (show (ord c)) ++ "__") ++ to_armor cs
to_armor (Invalid c:cs) = c : to_armor cs
-- | Encode a string of decimal digits as a string of symbols. See
-- <#ARMOR ASCII armor> for the specific mapping used.
encode :: String -> String
encode = map encode_digit where
encode_digit '1' = '!'
encode_digit '2' = '?'
encode_digit '3' = '~'
encode_digit '4' = '$'
encode_digit '5' = '%'
encode_digit '6' = '^'
encode_digit '7' = '+'
encode_digit '8' = '*'
encode_digit '9' = '-'
encode_digit '0' = '.'
encode_digit _ = error "encode_digit"