Distribution/Superdoc/Markup.hs

-- | This module provides conversions between various different markup
-- formats. In principle, it provides four different conversions:
-- 
-- 1. Converting the Superdoc markup language to HTML.
-- 
-- 2. Converting ASCII-armored Unicode to HTML.
-- 
-- 3. Converting Unicode streams to ASCII-armor.
-- 
-- 4. Converting Unicode streams to HTML.
-- 
-- Conversions 1 and 2 are combined into a single parser for the
-- Superdoc markup language, which is exposed by the function
-- 'markup'. This is used by the post-Haddock hook.
-- 
-- Conversion 3 is provided by the 'to_armor' function. Within the
-- Superdoc workflow, this is used by the @superdoc-armor@
-- preprocessor, which is in turns run by the Haddock hook. It makes
-- sense to keep conversions 2 and 3 in a single module, because
-- they jointly define the format for the ASCII armor.
-- 
-- Conversion 4 is provided by the 'to_html' function. It is used by
-- the post-HsColour hook.

module Distribution.Superdoc.Markup where

import Distribution.Superdoc.UTF8

import Data.Char
import qualified Data.Set as Set
import Data.Set (Set)
import System.IO
import Text.ParserCombinators.ReadP

-- ----------------------------------------------------------------------
-- * Format definitions

-- $ The Superdoc markup language provides tags for superscripts,
-- subscripts, and more. The ASCII armor format has been designed to
-- hide Unicode characters from tools that do not understand them. The
-- following markup is recognized:
-- 
-- \[bold Markup:]
-- 
-- * [bold [literal [super /text/]]]: superscript.
-- 
-- * [bold [literal [sup /text/]]]: superscript.  A synonym for 
--   [literal [super /text/]].
-- 
-- * [bold [literal [sub /text/]]]: subscript.
-- 
-- * [bold [literal [exp /text/]]]: exponential function.
-- 
-- * [bold [literal [bold /text/]]]: bold.
-- 
-- * [bold [literal [center /text/]]]: centered.
-- 
-- * [bold [literal [nobr /text/]]]: inhibit line breaks.
-- 
-- * [bold [literal [image /filename/]]]: insert image.
-- 
-- * [bold [literal [uni /nnnn/]]]: Unicode character.
-- 
-- * [bold [literal [literal text]]]: literal text. Brackets \'[\' and
-- \']\' may only occur in nested pairs.
-- 
-- \[bold ASCII armor:] #ARMOR#
-- 
-- * [bold uni_x_/nnnn/_x_]: armored Unicode lower-case character.
-- 
-- * [bold UNI_x_/nnnn/_x_]: armored Unicode upper-case character.
-- 
-- * [bold ==|/ssss/|==]: armored Unicode symbol and punctuation.
-- 
-- Here, /nnnn/ is a decimal number representing a Unicode code
-- point. Also /ssss/ is an encoding of a decimal number representing
-- a Unicode code point, using the following symbols for digits:
-- 
-- > ! = 1     ^ = 6
-- > ? = 2     + = 7
-- > ~ = 3     * = 8
-- > $ = 4     - = 9
-- > % = 5     . = 0

-- ----------------------------------------------------------------------
-- * Filters

-- | A filter is basically a function from strings to strings. Ideally
-- a filter is lazy, so that the input string is consumed
-- incrementally; however, this is not strictly necessary.  A filter
-- may also return another result in addition to a string.
type Filter a = String -> (String, a)

-- | The identity filter.
filter_id :: Filter ()
filter_id s = (s, ())

-- | Run a filter by reading from one handle and writing to another.
-- The handles are set to binary mode.
filter_handles :: Filter a -> Handle -> Handle -> IO a
filter_handles filter fin fout = do
  hSetBinaryMode fin True
  hSetBinaryMode fout True
  input <- hGetContents fin
  let (output, a) = filter input
  hPutStr fout output
  return a

-- | Run a filter by reading from a file and writing to another file.
-- We do not assume that the two files are necessarily distinct, so
-- special care is taken not to overwrite the output file until after
-- the input file has been read.
filter_file :: Filter a -> FilePath -> FilePath -> IO a
filter_file filter infile outfile = do
  h <- openBinaryFile infile ReadMode
  input <- hGetContents h
  -- make sure we read it all
  case length input of
    0 -> return ()
    _ -> return ()
  hClose h
  let (output, a) = filter input
  withBinaryFile outfile WriteMode $ \h -> do
    hPutStr h output
  return a

-- | Run a filter on a number of files, overwriting each file in
-- place.
filter_files :: Filter a -> [FilePath] -> IO [a]
filter_files filter files = do
  sequence [ filter_file filter f f | f <- files ]

-- ----------------------------------------------------------------------
-- * Markup parser

-- $ This section defines a simple grammar and parser for the Superdoc
-- markup language, translating it to HTML. In addition, the parser
-- also converts ASCII-armored Unicode to HTML. This is used to
-- post-process Haddock's output.

-- ----------------------------------------------------------------------
-- ** Top-level function

-- | The top-level parser for Superdoc markup and ASCII armor,
-- expressed as a filter. In addition to producing HTML output, this
-- filter also returns the set of all image files that were linked to.
markup :: Filter (Set FilePath)
markup input = case readP_to_S markup_top input of
  ((output, images), "") : _ -> (output, images)
  _ -> error "markup: parse error" -- this should not happen

-- ----------------------------------------------------------------------
-- ** Grammar definition

-- | Top-level parser for Superdoc markup and ASCII armor.
-- 
-- /top/ ::= (/other/ | /tag/ | /uni/ | /char/)*.
markup_top :: ReadP (String, Set FilePath)
markup_top = do
  lst <- many (lift markup_other <++ markup_tag <++ lift markup_uni <++ lift markup_char)
  eof
  return (concat (map fst lst), Set.unions (map snd lst))

-- | Lift a parser returning a string to a parser returning a string and an empty set.
lift :: ReadP String -> ReadP (String, Set FilePath)
lift p = do
  s <- p
  return (s, Set.empty)

-- | Like 'markup', but only permit brackets \"[\" and \"]\" to occur
-- in nested pairs.
-- 
-- /nested/ ::= (/other/ | /tag/ | /uni/ | /bracketed/ | /underscore/)*.
markup_nested :: ReadP (String, Set FilePath)
markup_nested = do
  lst <- many (lift markup_other <++ markup_tag <++ lift markup_uni <++ markup_bracketed <++ lift markup_nonbracket)
  return (concat (map fst lst), Set.unions (map snd lst))

-- | Parse bracketed text.
-- 
-- /bracketed/ ::= \"[\" /nested/ \"]\".
markup_bracketed :: ReadP (String, Set FilePath)
markup_bracketed = do
  char '['
  (s, images) <- markup_nested
  char ']'
  return ("[" ++ s ++ "]", images)

-- | Parse a single underscore \'_\'.
-- 
-- /underscore/ ::= \"_\".
markup_underscore :: ReadP String
markup_underscore = do
  c <- satisfy (== '_')
  return [c]

-- | Parse any single character except \'[\' and \']\'.
-- 
-- /nonbracket/ ::= any character besides \'[\', \']\'.
markup_nonbracket :: ReadP String
markup_nonbracket = do
  c <- satisfy (\x -> x /= '[' && x /= ']')
  return [c]

-- | Parse any sequence of non-special characters: anything but \'[\',
-- \'u\', \'U\', \'=\', and \']\'.
-- 
-- /other/ ::= (any character besides \'[\', \'u\', \'U\', \'=\', \']\')+.
markup_other :: ReadP String
markup_other = do
  s <- munch1 (\x -> not (x `elem` ['[', 'u', 'U', '=', ']']))
  return s

-- | Parse any one character.
-- 
-- /char/ ::= any character.
markup_char :: ReadP String
markup_char = do
  c <- get
  return [c]

-- | Parse a tag.
-- 
-- /tag/ ::= \"[\" /keyword/ /body/ \"]\".
markup_tag :: ReadP (String, Set FilePath)
markup_tag = do
  char '['
  name <- markup_keyword
  munch1 isSpace
  res <- markup_body name
  char ']'
  return res

-- | Parse a keyword.
-- 
-- /keyword/ ::= \"sup\" | \"super\" | \"sub\" | \"exp\" | \"bold\" | \"center\" | \"nobr\" | \"image\" | \"uni\" | \"literal\".
markup_keyword :: ReadP String
markup_keyword =
  choice [ string name | name <- keywords ]
  where
    keywords = ["sup", "super", "sub", "exp", "bold", "center", "nobr", "image", "uni", "literal"]

-- | Parse an armored Unicode character.
markup_uni :: ReadP String
markup_uni = do
  s <- markup_uni_upper <++ markup_uni_lower <++ markup_uni_symbol
  return s

-- | Parse an upper-case Unicode letter.
-- 
-- /uni_upper/ ::= \"UNI_x_\" /digit/+ \"_x_\".
markup_uni_upper :: ReadP String
markup_uni_upper = do
  string "UNI_x_"
  n <- munch1 isDigit
  string "_x_"
  return ("&#" ++ n ++ ";")

-- | Parse a lower-case Unicode letter.
-- 
-- /uni_lower/ ::= \"uni_x_\" /digit/+ \"_x_\".
markup_uni_lower :: ReadP String
markup_uni_lower = do
  string "uni_x_"
  n <- munch1 isDigit
  string "_x_"
  return ("&#" ++ n ++ ";")

-- | Parse a Unicode operator symbol.
-- 
-- /uni_symbol/ ::= \"==|\" /symbol_digit/+ \"|==\".
markup_uni_symbol :: ReadP String
markup_uni_symbol = do
  string "==|"
  n <- many1 markup_symbol_digit
  string "|=="
  return ("&#" ++ n ++ ";")

-- | Parse a symbol encoding a decimal digit. See 'to_armor' for the
-- encoding used.
markup_symbol_digit :: ReadP Char
markup_symbol_digit = do
  c <- get
  case c of
    '!' -> return '1'
    '?' -> return '2'
    '~' -> return '3'
    '$' -> return '4'
    '%' -> return '5'
    '^' -> return '6'
    '+' -> return '7'
    '*' -> return '8'
    '-' -> return '9'
    '.' -> return '0'
    _ -> pfail

-- | Parse any text with balanced brackets.
-- 
-- /literal/ ::= (/nonbracket/ | /bracketed_literal/)*.
markup_literal :: ReadP String
markup_literal = do
  lst <- many (markup_nonbracket <++ markup_bracketed_literal)
  return (concat lst)
  
-- | Parse any bracketed text with balanced brackets.
-- 
-- /bracketed_literal/ ::= \"[\" /literal/ \"]\".
markup_bracketed_literal :: ReadP String
markup_bracketed_literal = do
  char '['
  s <- markup_literal
  char ']'
  return ("[" ++ s ++ "]")

-- | Parse a tag's body. What to do depends on the tag name.
-- 
-- /body/ ::= /nested/ (for /keyword/ = sup, super, sub, exp, bold, center, nobr),
-- 
-- /body/ ::= /filename/ (for /keyword/ = image),
-- 
-- /body/ ::= /digit/+ (for /keyword/ = uni).
-- 
-- /body/ ::= /literal/ (for /keyword/ = literal).
markup_body :: String -> ReadP (String, Set FilePath)
markup_body "sup" = do
  (s, images) <- markup_nested
  return ("<sup>" ++ s ++ "</sup>", images)
markup_body "super" = do
  (s, images) <- markup_nested
  return ("<sup>" ++ s ++ "</sup>", images)
markup_body "sub" = do
  (s, images) <- markup_nested
  return ("<sub>" ++ s ++ "</sub>", images)
markup_body "exp" = do
  (s, images) <- markup_nested
  return ("<i>e</i><sup>" ++ s ++ "</sup>", images)
markup_body "bold" = do
  (s, images) <- markup_nested
  return ("<b>" ++ s ++ "</b>", images)
markup_body "center" = do
  (s, images) <- markup_nested
  return ("<center>" ++ s ++ "</center>", images)
markup_body "nobr" = do
  (s, images) <- markup_nested
  return ("<nobr>" ++ s ++ "</nobr>", images)
markup_body "image" = do
  filename <- munch1 (\x -> x /= ']' && not (isSpace x))
  return ("<img src=\"" ++ filename ++ "\">", Set.singleton filename)
markup_body "uni" = do
  n <- munch1 isDigit
  skipSpaces
  return ("&#" ++ n ++ ";", Set.empty)
markup_body "literal" = do
  n <- markup_literal
  return (n, Set.empty)

-- ----------------------------------------------------------------------
-- * Unicode to HTML conversion

-- | Convert a tokenized Unicode stream into HTML entities. Non-ASCII
-- characters are represented as HTML entities of the form @&#@/nnnn/@;@.
-- Any invalid characters are simply copied to the output.
to_html :: [Token] -> String
to_html [] = []
to_html (Unicode c:cs)
  | isAscii c = c : to_html cs
  | otherwise = ("&#" ++ (show (ord c)) ++ ";") ++ to_html cs
to_html (Invalid c:cs) = c : to_html cs

-- ----------------------------------------------------------------------
-- * Unicode to ASCII armor conversion

-- | Convert a tokenized Unicode stream to ASCII armor. 
-- 
-- The armor is designed to preserve lexical validity: thus, the
-- armored version of a valid Haskell lower-case identifier,
-- upper-case identifier, or operator is again a valid identifier or
-- operator of the same kind. This makes it possible to use armored
-- Unicode in source code as well as documentation comments.
-- 
-- The armoring is further designed to use only symbols that will not
-- confuse GHC or Haddock. See <#ARMOR ASCII armor> for a description
-- of the format.
to_armor :: [Token] -> String
to_armor [] = []
to_armor (Unicode c:cs)
  | isAscii c = c : to_armor cs
  | isUpper c =
    ("UNI_x_" ++ (show (ord c)) ++ "_x_") ++ to_armor cs
  | isSymbol c || isPunctuation c =
    ("==|" ++ (encode (show (ord c))) ++ "|==") ++ to_armor cs
  | otherwise =
    ("uni_x_" ++ (show (ord c)) ++ "_x_") ++ to_armor cs
to_armor (Invalid c:cs) = c : to_armor cs

-- | Encode a string of decimal digits as a string of symbols. See
-- <#ARMOR ASCII armor> for the specific mapping used.
encode :: String -> String
encode = map encode_digit where
  encode_digit '1' = '!'
  encode_digit '2' = '?'
  encode_digit '3' = '~'
  encode_digit '4' = '$'
  encode_digit '5' = '%'
  encode_digit '6' = '^'
  encode_digit '7' = '+'
  encode_digit '8' = '*'
  encode_digit '9' = '-'
  encode_digit '0' = '.'
  encode_digit _ = error "encode_digit"