Distribution/Superdoc/Markup.hs

-- | This module provides conversions between various different markup
-- formats. In principle, it provides two different conversions:
-- 
-- 1. Converting the Superdoc markup language to HTML.
-- 
-- 2. Converting Unicode streams to HTML.
-- 
-- Conversion 1 is exposed by the function 'markup'. This is used by
-- the post-Haddock hook.
-- 
-- Conversion 2 is provided by the 'to_html' function. It is used by
-- the post-HsColour hook.

module Distribution.Superdoc.Markup where

import Distribution.Superdoc.UTF8

import Data.Char
import qualified Data.Set as Set
import Data.Set (Set)
import System.IO
import Text.ParserCombinators.ReadP

-- ----------------------------------------------------------------------
-- * Format definitions

-- $ The Superdoc markup language provides tags for superscripts,
-- subscripts, and more. The following markup is recognized:
-- 
-- \[bold Markup:]
-- 
-- * [bold [literal [super /text/]]]: superscript.
-- 
-- * [bold [literal [sup /text/]]]: superscript.  A synonym for 
--   [literal [super /text/]].
-- 
-- * [bold [literal [sub /text/]]]: subscript.
-- 
-- * [bold [literal [exp /text/]]]: exponential function.
-- 
-- * [bold [literal [bold /text/]]]: bold.
-- 
-- * [bold [literal [center /text/]]]: centered.
-- 
-- * [bold [literal [nobr /text/]]]: inhibit line breaks.
-- 
-- * [bold [literal [image /filename/]]]: insert image.
-- 
-- * [bold [literal [uni /nnnn/]]]: Unicode character.
-- 
-- * [bold [literal [literal text]]]: literal text. Brackets \'[\' and
-- \']\' may only occur in nested pairs.

-- ----------------------------------------------------------------------
-- * Filters

-- | A filter is basically a function from strings to strings. Ideally
-- a filter is lazy, so that the input string is consumed
-- incrementally; however, this is not strictly necessary.  A filter
-- may also return another result in addition to a string.
type Filter a = String -> (String, a)

-- | The identity filter.
filter_id :: Filter ()
filter_id s = (s, ())

-- | Run a filter by reading from one handle and writing to another.
-- The handles are set to binary mode.
filter_handles :: Filter a -> Handle -> Handle -> IO a
filter_handles filter fin fout = do
  hSetBinaryMode fin True
  hSetBinaryMode fout True
  input <- hGetContents fin
  let (output, a) = filter input
  hPutStr fout output
  return a

-- | Run a filter by reading from a file and writing to another file.
-- We do not assume that the two files are necessarily distinct, so
-- special care is taken not to overwrite the output file until after
-- the input file has been read.
filter_file :: Filter a -> FilePath -> FilePath -> IO a
filter_file filter infile outfile = do
  h <- openBinaryFile infile ReadMode
  input <- hGetContents h
  -- make sure we read it all
  case length input of
    0 -> return ()
    _ -> return ()
  hClose h
  let (output, a) = filter input
  withBinaryFile outfile WriteMode $ \h -> do
    hPutStr h output
  return a

-- | Run a filter on a number of files, overwriting each file in
-- place.
filter_files :: Filter a -> [FilePath] -> IO [a]
filter_files filter files = do
  sequence [ filter_file filter f f | f <- files ]

-- ----------------------------------------------------------------------
-- * Markup parser

-- $ This section defines a simple grammar and parser for the Superdoc
-- markup language, translating it to HTML. This is used to
-- post-process Haddock's output.

-- ----------------------------------------------------------------------
-- ** Top-level function

-- | The top-level parser for Superdoc markup, expressed as a
-- filter. In addition to producing HTML output, this filter also
-- returns the set of all image files that were linked to.
markup :: Filter (Set FilePath)
markup input = case readP_to_S markup_top input of
  ((output, images), "") : _ -> (output, images)
  _ -> error "markup: parse error" -- this should not happen

-- ----------------------------------------------------------------------
-- ** Grammar definition

-- | Top-level parser for Superdoc markup.
-- 
-- /top/ ::= (/other/ | /tag/ | /char/)*.
markup_top :: ReadP (String, Set FilePath)
markup_top = do
  lst <- many (lift markup_other <++ markup_tag <++ lift markup_char)
  eof
  return (concat (map fst lst), Set.unions (map snd lst))

-- | Lift a parser returning a string to a parser returning a string and an empty set.
lift :: ReadP String -> ReadP (String, Set FilePath)
lift p = do
  s <- p
  return (s, Set.empty)

-- | Like 'markup', but only permit brackets \"[\" and \"]\" to occur
-- in nested pairs.
-- 
-- /nested/ ::= (/other/ | /tag/ | /bracketed/ | /underscore/)*.
markup_nested :: ReadP (String, Set FilePath)
markup_nested = do
  lst <- many (lift markup_other <++ markup_tag <++ markup_bracketed <++ lift markup_nonbracket)
  return (concat (map fst lst), Set.unions (map snd lst))

-- | Parse bracketed text.
-- 
-- /bracketed/ ::= \"[\" /nested/ \"]\".
markup_bracketed :: ReadP (String, Set FilePath)
markup_bracketed = do
  char '['
  (s, images) <- markup_nested
  char ']'
  return ("[" ++ s ++ "]", images)

-- | Parse any single character except \'[\' and \']\'.
-- 
-- /nonbracket/ ::= any character besides \'[\', \']\'.
markup_nonbracket :: ReadP String
markup_nonbracket = do
  c <- satisfy (\x -> x /= '[' && x /= ']')
  return [c]

-- | Parse any sequence of non-special characters: anything but \'[\' and \']\'.
-- 
-- /other/ ::= (any character besides \'[\', \']\')+.
markup_other :: ReadP String
markup_other = do
  s <- munch1 (\x -> not (x `elem` ['[', ']']))
  return s

-- | Parse any one character.
-- 
-- /char/ ::= any character.
markup_char :: ReadP String
markup_char = do
  c <- get
  return [c]

-- | Parse a tag.
-- 
-- /tag/ ::= \"[\" /keyword/ /body/ \"]\".
markup_tag :: ReadP (String, Set FilePath)
markup_tag = do
  char '['
  name <- markup_keyword
  munch1 isSpace
  res <- markup_body name
  char ']'
  return res

-- | Parse a keyword.
-- 
-- /keyword/ ::= \"sup\" | \"super\" | \"sub\" | \"exp\" | \"bold\" | \"center\" | \"nobr\" | \"image\" | \"uni\" | \"literal\".
markup_keyword :: ReadP String
markup_keyword =
  choice [ string name | name <- keywords ]
  where
    keywords = ["sup", "super", "sub", "exp", "bold", "center", "nobr", "image", "uni", "literal"]

-- | Parse any text with balanced brackets.
-- 
-- /literal/ ::= (/nonbracket/ | /bracketed_literal/)*.
markup_literal :: ReadP String
markup_literal = do
  lst <- many (markup_nonbracket <++ markup_bracketed_literal)
  return (concat lst)
  
-- | Parse any bracketed text with balanced brackets.
-- 
-- /bracketed_literal/ ::= \"[\" /literal/ \"]\".
markup_bracketed_literal :: ReadP String
markup_bracketed_literal = do
  char '['
  s <- markup_literal
  char ']'
  return ("[" ++ s ++ "]")

-- | Parse a tag's body. What to do depends on the tag name.
-- 
-- /body/ ::= /nested/ (for /keyword/ = sup, super, sub, exp, bold, center, nobr),
-- 
-- /body/ ::= /filename/ (for /keyword/ = image),
-- 
-- /body/ ::= /digit/+ (for /keyword/ = uni).
-- 
-- /body/ ::= /literal/ (for /keyword/ = literal).
markup_body :: String -> ReadP (String, Set FilePath)
markup_body "sup" = do
  (s, images) <- markup_nested
  return ("<sup>" ++ s ++ "</sup>", images)
markup_body "super" = do
  (s, images) <- markup_nested
  return ("<sup>" ++ s ++ "</sup>", images)
markup_body "sub" = do
  (s, images) <- markup_nested
  return ("<sub>" ++ s ++ "</sub>", images)
markup_body "exp" = do
  (s, images) <- markup_nested
  return ("<i>e</i><sup>" ++ s ++ "</sup>", images)
markup_body "bold" = do
  (s, images) <- markup_nested
  return ("<b>" ++ s ++ "</b>", images)
markup_body "center" = do
  (s, images) <- markup_nested
  return ("<center>" ++ s ++ "</center>", images)
markup_body "nobr" = do
  (s, images) <- markup_nested
  return ("<nobr>" ++ s ++ "</nobr>", images)
markup_body "image" = do
  filename <- munch1 (\x -> x /= ']' && not (isSpace x))
  return ("<img src=\"" ++ filename ++ "\">", Set.singleton filename)
markup_body "uni" = do
  n <- munch1 isDigit
  skipSpaces
  return ("&#" ++ n ++ ";", Set.empty)
markup_body "literal" = do
  n <- markup_literal
  return (n, Set.empty)
markup_body _ = do
  error "markup_body: unknown tag"

-- ----------------------------------------------------------------------
-- * Unicode to HTML conversion

-- | Convert a tokenized Unicode stream into HTML entities. Non-ASCII
-- characters are represented as HTML entities of the form @&#@/nnnn/@;@.
-- Any invalid characters are simply copied to the output.
to_html :: [Token] -> String
to_html [] = []
to_html (Unicode c:cs)
  | isAscii c = c : to_html cs
  | otherwise = ("&#" ++ (show (ord c)) ++ ";") ++ to_html cs
to_html (Invalid c:cs) = c : to_html cs