{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Regex.PCRE.Rex
-- Copyright   :  (c) Michael Sloan 2011
--
-- Maintainer  :  Michael Sloan (mgsloan@gmail.com)
-- Stability   :  unstable
-- Portability :  unportable
--
-- This module provides a template Haskell quasiquoter for regular
-- expressions, which provides the following features:
--
-- 1) Compile-time checking that the regular expression is valid.
--
-- 2) Arity of resulting tuple based on the number of selected capture patterns
-- in the regular expression.
--
-- 3) Allows for the inline interpolation of mapping functions :: String -> a.
--
-- 4) Precompiles the regular expression at compile time, by calling into the
-- PCRE library and storing a 'ByteString' literal representation of its state.
--
-- 5) Compile-time configurable to use different PCRE options, turn off
-- precompilation, use 'ByteString's, or set a default mapping expression.
--
-- Since this is a quasiquoter library that generates code using view patterns,
-- the following extensions are required:
--
-- > {-# LANGUAGE TemplateHaskell, QuasiQuotes, ViewPatterns #-}
--
-- Here's a silly example which parses peano numbers of the form Z, S Z,
-- S S Z, etc.  The \s+ means that it is not sensitive to the quantity or type
-- of seperating whitespace. (these examples can also be found in Test.hs)
--
-- > peano :: String -> Maybe Int
-- > peano = [rex|^(?{ length . filter (=='S') } \s* (?:S\s+)*Z)\s*$|]
--
-- > *Main> peano "Z"
-- > Just 0
-- > *Main> peano "S Z"
-- > Just 1
-- > *Main> peano "S   S Z"
-- > Just 2
-- > *Main> peano "S S S Z"
-- > Just 3
-- > *Main> peano "invalid"
-- > Nothing
--
-- The token \"(?{\" introduces a capture group which has a mapping applied to
-- the -- result - in this case \"length . filter (=='S')\".  If the ?{ ... }
-- are omitted, then the capture group is not taken as part of the results of
-- the match.  If the contents of the ?{ ... } is omitted, then 'id' is assumed:
--
-- > parsePair :: String -> Maybe (String, String)
-- > parsePair = [rex|^<\s* (?{ }[^\s,>]+) \s*,\s* (?{ }[^\s,>]+) \s*>$|]
--
-- The following example is derived from
-- http://www.regular-expressions.info/dates.html
--
-- > parseDate :: String -> Maybe (Int, Int, Int)
-- > parseDate [rex|^(?{ read -> y }(?:19|20)\d\d)[- /.]
-- >                 (?{ read -> m }0[1-9]|1[012])[- /.]
-- >                 (?{ read -> d }0[1-9]|[12][0-9]|3[01])$|]
-- >   |  (d > 30 && (m `elem` [4, 6, 9, 11]))
-- >   || (m == 2 &&
-- >       (d == 29 && not (mod y 4 == 0 && (mod y 100 /= 0 || mod y 400 == 0)))
-- >      || (d > 29)) = Nothing
-- >   | otherwise = Just (y, m, d)
-- > parseDate _ = Nothing
--
-- The above example makes use of the regex quasi-quoter as a pattern matcher.
-- The interpolated Haskell patterns are used to construct an implicit view
-- pattern out of the inlined ones.  The above pattern is expanded to the
-- equivalent:
--
-- > parseDate ([rex|^(?{ read }(?:19|20)\d\d)[- /.]
-- >                  (?{ read }0[1-9]|1[012])[- /.]
-- >                  (?{ read }0[1-9]|[12][0-9]|3[01])$|]
-- >           -> Just (y, m, d))
--
--
-- There are also a few other inelegances:
--
-- 1) PCRE captures, unlike .NET regular expressions, yield the last capture
-- made by a particular pattern.  So, for example, (...)*, will only yield one
-- match for '...'.  Ideally these would be detected and yield an implicit [a].
--
-- 2) Patterns with disjunction between captures ((?{f}a) | (?{g}b)) will
-- provide the empty string to one of f / g.  In the case of pattern
-- expressions, it would be convenient to be able to map multiple captures into
-- a single variable / pattern, preferring the first non-empty option.  The
-- general logic for this is a bit complicated, and postponed for a later
-- release.
--
-- Since pcre-light is a wrapper over a C API, the most efficient interface is
-- ByteStrings, as it does not natively speak Haskell lists.  The [rex| ... ]
-- quasiquoter implicitely packs the input into a bystestring, and unpacks the
-- results to strings before providing them to your mappers.  The 'brex'
-- 'QuasiQuoter' is provided for this purpose.  You can also define your own
-- 'QuasiQuoter' - the definitions of the default configurations are as follows:
--
-- > rex  = rexWithConf $ defaultRexConf
-- > brex = rexWithConf $ defaultRexConf { rexByteString = True }
-- >
-- > defaultRexConf = RexConf False True "id" [PCRE.extended] []
--
-- The first @False@ specifies to use @String@ rather than 'ByteString'.  The
-- @True@ argument specifies to use precompilation.  --  The
-- string following is the default mapping expression, used when omitted.
-- Due to GHC staging restrictions, your configuration will need to be in a
-- different module than its usage.
--
-- Inspired by Matt Morrow's regexqq package:
-- <http://hackage.haskell.org/packages/archive/regexqq/latest/doc/html/src/Text-Regex-PCRE-QQ.html>
--
-- And code from Erik Charlebois's interpolatedstring-qq package:
-- <http://hackage.haskell.org/packages/archive/interpolatedstring-qq/latest/doc/html/Text-InterpolatedString-QQ.html>
--
-----------------------------------------------------------------------------

module Text.Regex.PCRE.Rex
  (
  -- * Quasiquoters
    rex, brex
  -- * Configurable QuasiQuoter
  , rexWithConf, RexConf(..), defaultRexConf
  -- * Utility
  , makeQuasiMultiline
  -- * Used by Generated Code
  , maybeRead, padRight
  ) where

import Text.Regex.PCRE.Precompile

import qualified Text.Regex.PCRE.Light as PCRE

import Control.Applicative   ( (<$>) )
import Control.Arrow         ( first )
import Data.ByteString.Char8 ( pack, unpack, empty )
import Data.Either           ( partitionEithers )
import Data.Maybe            ( catMaybes, listToMaybe, fromJust, isJust )
import Data.Char             ( isSpace )
import System.IO.Unsafe      ( unsafePerformIO )

import Language.Haskell.TH
import Language.Haskell.TH.Quote
import Language.Haskell.Meta (toExp,toPat)
import Language.Haskell.Exts.Extension (Extension(..), KnownExtension(..))
import Language.Haskell.Exts (parseExpWithMode, parsePatWithMode,
                              ParseMode, defaultParseMode, extensions,
                              ParseResult(..))

{- TODO:
  * Target Text.Regex.Base ?
  * Add unit tests
-}

data RexConf = RexConf
  { rexByteString :: Bool
  , rexCompiled :: Bool
  , rexView :: String
  , rexPCREOpts :: [PCRE.PCREOption]
  , rexPCREExecOpts :: [PCRE.PCREExecOption]
  }

-- | Default regular expression quasiquoter for 'String's and 'ByteString's,
-- respectively.
rex, brex :: QuasiQuoter
rex  = rexWithConf $ defaultRexConf
brex = rexWithConf $ defaultRexConf { rexByteString = True }

-- | This is a 'QuasiQuoter' transformer, which allows for a whitespace-
--   sensitive quasi-quoter to be broken over multiple lines.  The default 'rex'
--   and 'brex' functions do not need this as they are already whitespace
--   insensitive. However, if you create your own configuration, which omits the
--   'PCRE.extended' parameter, then this could be useful. The leading space of
--   each line is ignored, and all newlines removed.
makeQuasiMultiline :: QuasiQuoter -> QuasiQuoter
makeQuasiMultiline (QuasiQuoter a b c d) =
    QuasiQuoter (a . pre) (b . pre) (c . pre) (d . pre)
  where
    pre = concat . (\(x:xs) -> x : map (dropWhile isSpace) xs) . lines

-- | Default rex configuration, which specifies that the regexes operate on
--   strings, don't postprocess the matched patterns, and use 'PCRE.extended'.
--   This setting causes whitespace to be nonsemantic, and ignores # comments.
defaultRexConf :: RexConf
defaultRexConf = RexConf False False "id" [PCRE.extended] []

-- | A configureable regular-expression QuasiQuoter.  Takes the options to pass
--   to the PCRE engine, along with 'Bool's to flag 'ByteString' usage and
--   non-compilation respecively.  The provided 'String' indicates which mapping
--   function to use, when one is omitted - \"(?{} ...)\".
rexWithConf :: RexConf -> QuasiQuoter
rexWithConf conf =
  QuasiQuoter
    (makeExp conf . parseRex)
    (makePat conf . parseRex)
    undefined
    undefined

-- Template Haskell Code Generation
--------------------------------------------------------------------------------

-- Creates the template haskell Exp which corresponds to the parsed interpolated
-- regex.  This particular code mainly just handles making "read" the
-- default for captures which lack a parser definition, and defaulting to making
-- the parser that doesn't exist
makeExp :: RexConf -> ParseChunks -> ExpQ
makeExp conf (cnt, pat, exs) = buildExp conf cnt pat $
  map (fmap $ forceEitherMsg "makeExp" . parseExp conf) exs

-- Creates the template haskell Pat which corresponds to the parsed interpolated
-- regex. As well as handling the aforementioned defaulting considerations, this
-- turns per-capture view patterns into a single tuple-resulting view pattern.
--
-- E.g. [reg| ... (?{e1 -> v1} ...) ... (?{e2 -> v2} ...) ... |] becomes
--      [reg| ... (?{e1} ...) ... (?{e2} ...) ... |] -> (v1, v2)
makePat :: RexConf -> ParseChunks -> PatQ
makePat conf (cnt, pat, exs) = do
  viewExp <- buildExp conf cnt pat $ map (fmap fst) views
  return . ViewP viewExp
         . (\xs -> ConP 'Just [TupP xs])
         . map snd $ catMaybes views
 where
  views :: [Maybe (Exp, Pat)]
  views = map (fmap processView) exs

  processView :: String -> (Exp, Pat)
  processView xs = case processPat ("("++xs++")") of
    ParseOk (ParensP (ViewP e p)) -> (e,p)
    ParseOk p -> (forceEitherMsg "impossible" (parseExp conf ""), p)
    ParseFailed _ b -> error b

-- Here's where the main meat of the template haskell is generated.  Given the
-- number of captures, the pattern string, and a list of capture expressions,
-- yields the template Haskell Exp which parses a string into a tuple.
buildExp :: RexConf -> Int -> String -> [Maybe Exp] -> ExpQ
buildExp RexConf{..} cnt pat xs =
    [| let r = $(get_regex) in
       $(process) . (flip $ PCRE.match r) $(liftRS rexPCREExecOpts)
     . $(if rexByteString then [| id |] else [| pack |]) |]
  where
    liftRS x = [| read shown |] where shown = show x

    --TODO: make sure this takes advantage of bytestring fusion stuff - is
    -- the right pack / unpack. Or use XOverloadedStrings
    get_regex
      | rexCompiled = [| unsafePerformIO (regexFromTable $! $(table_bytes)) |]
      | otherwise = [| PCRE.compile (pack pat) $(liftRS pcreOpts) |]
    table_bytes = [| pack $(LitE . StringL . unpack <$> runIO table_string) |]
    table_string =
      forceMaybeMsg "Error while getting PCRE compiled representation\n" <$>
      precompile (pack pat) pcreOpts
    pcreOpts = rexPCREOpts

    process = case (null vs, rexByteString) of
      (True, _)  -> [| fmap ( const () ) |]
      (_, False) -> [| fmap ($(return maps) . padRight "" pad . map unpack) |]
      (_, True)  -> [| fmap ($(return maps) . padRight empty pad) |]
    pad = cnt + 2
    maps = LamE [ListP . (WildP:) $ map VarP vs]
         . TupE . map (uncurry AppE)
         -- filter out all "Nothing" exprs
         . map (first fromJust) . filter (isJust . fst)
         -- [(Expr, Variable applied to)]
         . zip xs $ map VarE vs
    vs = [mkName $ "v" ++ show i | i <- [0..cnt]]

-- Parse a Haskell expression into a template Haskell Exp
parseExp :: RexConf -> String -> ParseResult Exp
parseExp conf xs
  = fmap toExp
  . parseExpWithMode rexParseMode
  $ onSpace xs (rexView conf) id

-- probably the quasiquote should have access to the pragmas in the current
-- file, but for now just enable some common extensions that do not steal
-- much syntax
rexParseMode :: ParseMode
rexParseMode = defaultParseMode { extensions = map EnableExtension exts }
  where
    exts =
      [ ViewPatterns
      , ImplicitParams
      , RecordPuns
      , RecordWildCards
      , ScopedTypeVariables
      , TupleSections
      , TypeFamilies
      , TypeOperators
      ]

-- Parse a Haskell pattern match into a template Haskell Pat, yielding Nothing
-- for patterns which consist of just whitespace.
processPat :: String -> ParseResult Pat
processPat xs = fmap toPat $ parsePatWithMode rexParseMode xs

-- Parsing
--------------------------------------------------------------------------------

type ParseChunk = Either String (Maybe String)
type ParseChunks = (Int, String, [Maybe String])

-- Postprocesses the results of the chunk-wise parse output, into the pattern to
-- be pased to the regex engine, with the interpolated patterns / expressions.
parseRex :: String -> ParseChunks
parseRex xs = (cnt, concat chunks, quotes)
  where
    (chunks, quotes) = partitionEithers results
    (cnt, results) = parseRegex (filter (`notElem` "\r\n") xs) "" (-1)

-- A pair of mutually-recursive functions, one for processing the quotation
-- and the other for the anti-quotation.

-- TODO: add check for erroneous { }

parseRegex :: String -> String -> Int -> (Int, [ParseChunk])
parseRegex inp s ix = case inp of
  -- Disallow branch-reset capture.
  ('(':'?':'|':_) ->
    error "Branch reset pattern (?| not allowed in quasi-quoted regex."

  -- Ignore non-capturing parens / handle backslash escaping.
  ('\\':'\\'  :xs) -> parseRegex xs ("\\\\" ++ s) ix
  ('\\':'('   :xs) -> parseRegex xs (")\\"  ++ s) ix
  ('\\':')'   :xs) -> parseRegex xs ("(\\"  ++ s) ix
  ('(':'?':':':xs) -> parseRegex xs (":?("  ++ s) ix

  -- Anti-quote for processing a capture group.
  ('(':'?':'{':xs) -> mapSnd ((Left $ reverse ('(':s)) :)
                    $ parseAntiquote xs "" (ix + 1)

  -- Keep track of how many capture groups we've seen.
  ('(':xs) -> mapSnd (Right Nothing :)
            $ parseRegex xs ('(':s) (ix + 1)

  -- Consume the regular expression contents.
  (x:xs) -> parseRegex xs (x:s) ix
  [] -> (ix, [Left $ reverse s])

parseAntiquote :: String -> String -> Int -> (Int, [ParseChunk])
parseAntiquote inp s ix = case inp of
  -- Escape } in the Haskell splice using a backslash.
  ('\\':'}':xs) -> parseAntiquote xs ('}':s) ix

  -- Capture accumulated antiquote, and continue parsing regex literal.
  ('}':xs) -> mapSnd ((Right (Just (reverse s))):)
            $ parseRegex xs "" ix

  -- Consume the antiquoute contents, appending to a reverse accumulator.
  (x:xs) -> parseAntiquote xs (x:s) ix
  [] -> error "Rex haskell splice terminator, }, never found"

-- Utils
--------------------------------------------------------------------------------

-- | A possibly useful utility function - yields 'Just' x when there is a
-- valid parse, and 'Nothing' otherwise.
maybeRead :: (Read a) => String -> Maybe a
maybeRead = fmap fst . listToMaybe . reads

onSpace :: String -> a -> (String -> a) -> a
onSpace s x f | all isSpace s = x
              | otherwise = f s

-- | Given a desired list-length, if the passed list is too short, it is padded
-- with the given element.  Otherwise, it trims.
padRight :: a -> Int -> [a] -> [a]
padRight _ 0 _ = []
padRight v i [] = replicate i v
padRight v i (x:xs) = x : padRight v (i-1) xs

mapSnd :: (t -> t2) -> (t1, t) -> (t1, t2)
mapSnd f (x, y) = (x, f y)

-- From MissingH

{- | Like 'forceMaybe', but lets you customize the error message raised if
Nothing is supplied. -}
forceMaybeMsg :: String -> Maybe a -> a
forceMaybeMsg msg Nothing = error msg
forceMaybeMsg _ (Just x) = x

{- | Like 'forceEither', but can raise a specific message with the error. -}
forceEitherMsg :: String -> ParseResult a -> a
forceEitherMsg msg (ParseFailed x _) = error $ msg ++ ": " ++ show x
forceEitherMsg _ (ParseOk x) = x