--------------------------------------------------------------------
-- |
-- Module   : Text.Regex.PCRE.Light.Char8
-- Copyright: Copyright (c) 2007-2008, Don Stewart
-- License  : BSD3
--
-- Maintainer:  Don Stewart <dons@galois.com>
-- Stability :  experimental
-- Portability: H98 + FFI
--
--------------------------------------------------------------------
--
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via 8-bit latin1 Strings.
--

module Text.Regex.PCRE.Light.Char8 (

        -- * The abstract PCRE Regex type
          Regex

        -- * String interface
        , compile, compileM
        , match
        , S.captureCount
        , captureNames

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any     -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import qualified Data.ByteString.Char8 as S
import qualified Text.Regex.PCRE.Light as S
import Text.Regex.PCRE.Light hiding (match, compile, compileM, captureNames)

-- | 'compile'
--
-- Compile a perl-compatible regular expression, given as a 'String'.
-- The arguments are:
--
-- * 'pat': A 'String' containing the regular expression to be compiled.
-- It is packed into a strict Char8 ByteString before compilation.
--
-- * 'flags', a list of compile-time bit flags. Pass the empty list to use the defaults.
--
-- Valid compile-time flags are:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
--
-- * 'bsr_unicode'     - \\R matches all Unicode line endings
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - '$' not to match newline at end
--
-- * 'dotall'          - matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be before newline
--
-- * 'multiline'       - '^' and '$' match newlines within data
--
-- * 'newline_any'     - Recognize any Unicode newline sequence
--
-- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- On success, an abstract type representing the compiled regular
-- expression is returned. 'compile' returns a bare 'Regex', so it has no
-- way to report a failure in its result; use 'compileM' to receive the
-- error string in a 'Left' when compilation of the pattern fails.
-- The regex is allocated via malloc on the C side, and will be
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- As regexes are often defined statically, GHC will compile them
-- to null-terminated, strict C strings, enabling compilation of the
-- pattern without copying. This may be useful for very large patterns.
--
-- See man pcreapi for more details.
--
compile :: String -> [PCREOption] -> Regex
-- Pack the pattern into a strict Char8 ByteString and hand it, together
-- with the unchanged option list, to the ByteString-based 'S.compile'.
compile pat opts = S.compile (S.pack pat) opts
{-# INLINE compile #-}

-- | 'compileM'
--
-- A safe version of 'compile' with failure lifted into an 'Either'.
--
-- The pattern 'String' is packed into a strict Char8 ByteString and
-- passed, together with the option list, to the ByteString-based
-- @compileM@; its @Either String Regex@ result is returned unchanged.
compileM :: String -> [PCREOption] -> Either String Regex
compileM pat opts = S.compileM (S.pack pat) opts
{-# INLINE compileM #-}


-- | 'match'
--
-- Matches a compiled regular expression against a given subject string,
-- using a matching algorithm that is similar to Perl's. If the subject
-- string doesn't match the regular expression, 'Nothing' is returned,
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- The arguments are:
--
-- * 'regex', a PCRE regular expression value produced by compile
--
-- * 'subject', the subject string to match against
--
-- * 'options', an optional set of exec-time flags to exec.
--
-- Available runtime options are:
--
-- * 'exec_anchored'      - Match only at the first position
--
-- * 'bsr_anycrlf'        - '\\R' matches only CR, LF, or CRLF
--
-- * 'bsr_unicode'        - '\\R' matches all Unicode line endings
--
-- * 'newline_any'        - Recognize any Unicode newline sequence
--
-- * 'newline_anycrlf'    - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'exec_newline_cr'    - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'  - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'    - Set LF as the newline sequence
--
-- * 'exec_notbol'        - Subject is not the beginning of a line
--
-- * 'exec_noteol'        - Subject is not the end of a line
--
-- * 'exec_notempty'      - An empty string is not a valid match
--
-- * 'exec_no_utf8_check' - Do not check the subject for UTF-8
--
-- * 'exec_partial'       - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The result value, and any captured subpatterns, are returned.
-- If the regex is invalid, or the subject string is empty, Nothing
-- is returned.
--
match :: Regex -> String -> [PCREExecOption] -> Maybe [String]
-- The subject is packed into a strict Char8 ByteString for the
-- ByteString-based 'S.match'. A 'Nothing' result is passed through; on
-- 'Just', every returned ByteString is unpacked back into a 'String'.
match r subject os =
    fmap (map S.unpack) (S.match r (S.pack subject) os)
{-# INLINE match #-}


-- | 'captureNames'
--
-- Returns the names and numbers of all named subpatterns in the regular
-- expression. Groups are zero-indexed. Unnamed groups are counted, but
-- don't appear in the result list.
--
-- Each name comes back from the ByteString-based @captureNames@ as a
-- ByteString and is unpacked into a 'String'; the group numbers are
-- passed through unchanged.
--
-- >>> captureNames (compile "(?<one>abc) (def) (?<three>ghi)" [])
-- [("one",0),("three",2)]
captureNames :: Regex -> [(String, Int)]
captureNames r = map (\(n, i) -> (S.unpack n, i)) $ S.captureNames r