{-# OPTIONS_GHC -fno-warn-missing-methods #-}
{-# LANGUAGE CPP                   #-}
{-# LANGUAGE DeriveFunctor         #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE StandaloneDeriving    #-}
{-# LANGUAGE TypeSynonymInstances  #-}
module Skylighting.Tokenizer (
    tokenize
  , TokenizerConfig(..)
  ) where

import Control.Applicative
import Control.Monad.Except
import Control.Monad.Reader
import Control.Monad.State.Strict
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.UTF8 as UTF8
import Data.CaseInsensitive (mk)
import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isPrint, isSpace, ord)
import qualified Data.Map as Map
import Data.Maybe (catMaybes)
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Debug.Trace
import Skylighting.Regex
import Skylighting.Types
import Text.Printf (printf)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup
#endif

newtype ContextStack = ContextStack{ ContextStack -> [Context]
unContextStack :: [Context] }
  deriving (Int -> ContextStack -> ShowS
[ContextStack] -> ShowS
ContextStack -> String
(Int -> ContextStack -> ShowS)
-> (ContextStack -> String)
-> ([ContextStack] -> ShowS)
-> Show ContextStack
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [ContextStack] -> ShowS
$cshowList :: [ContextStack] -> ShowS
show :: ContextStack -> String
$cshow :: ContextStack -> String
showsPrec :: Int -> ContextStack -> ShowS
$cshowsPrec :: Int -> ContextStack -> ShowS
Show)

data TokenizerState = TokenizerState{
    TokenizerState -> ByteString
input               :: ByteString
  , TokenizerState -> Bool
endline             :: Bool
  , TokenizerState -> Char
prevChar            :: Char
  , TokenizerState -> ContextStack
contextStack        :: ContextStack
  , TokenizerState -> [ByteString]
captures            :: [ByteString]
  , TokenizerState -> Int
column              :: Int
  , TokenizerState -> Bool
lineContinuation    :: Bool
  , TokenizerState -> Maybe Int
firstNonspaceColumn :: Maybe Int
  , TokenizerState -> Map RE Regex
compiledRegexes     :: Map.Map RE Regex
}

-- | Configuration options for 'tokenize'.
data TokenizerConfig = TokenizerConfig{
    TokenizerConfig -> SyntaxMap
syntaxMap   :: SyntaxMap  -- ^ Syntax map to use
  , TokenizerConfig -> Bool
traceOutput :: Bool       -- ^ Generate trace output for debugging
} deriving (Int -> TokenizerConfig -> ShowS
[TokenizerConfig] -> ShowS
TokenizerConfig -> String
(Int -> TokenizerConfig -> ShowS)
-> (TokenizerConfig -> String)
-> ([TokenizerConfig] -> ShowS)
-> Show TokenizerConfig
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [TokenizerConfig] -> ShowS
$cshowList :: [TokenizerConfig] -> ShowS
show :: TokenizerConfig -> String
$cshow :: TokenizerConfig -> String
showsPrec :: Int -> TokenizerConfig -> ShowS
$cshowsPrec :: Int -> TokenizerConfig -> ShowS
Show)

data Result e a = Success a
                | Failure
                | Error e
     deriving (a -> Result e b -> Result e a
(a -> b) -> Result e a -> Result e b
(forall a b. (a -> b) -> Result e a -> Result e b)
-> (forall a b. a -> Result e b -> Result e a)
-> Functor (Result e)
forall a b. a -> Result e b -> Result e a
forall a b. (a -> b) -> Result e a -> Result e b
forall e a b. a -> Result e b -> Result e a
forall e a b. (a -> b) -> Result e a -> Result e b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
<$ :: a -> Result e b -> Result e a
$c<$ :: forall e a b. a -> Result e b -> Result e a
fmap :: (a -> b) -> Result e a -> Result e b
$cfmap :: forall e a b. (a -> b) -> Result e a -> Result e b
Functor)

deriving instance (Show a, Show e) => Show (Result e a)

data TokenizerM a = TM { TokenizerM a
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result String a)
runTokenizerM :: TokenizerConfig
                                       -> TokenizerState
                                       -> (TokenizerState, Result String a) }

mapsnd :: (a -> b) -> (c, a) -> (c, b)
mapsnd :: (a -> b) -> (c, a) -> (c, b)
mapsnd a -> b
f (c
x, a
y) = (c
x, a -> b
f a
y)

instance Functor TokenizerM where
  fmap :: (a -> b) -> TokenizerM a -> TokenizerM b
fmap a -> b
f (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
g) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String b))
-> TokenizerM b
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> (Result String a -> Result String b)
-> (TokenizerState, Result String a)
-> (TokenizerState, Result String b)
forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd ((a -> b) -> Result String a -> Result String b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
f) (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
g TokenizerConfig
c TokenizerState
s))

instance Applicative TokenizerM where
  pure :: a -> TokenizerM a
pure a
x = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, a -> Result String a
forall e a. a -> Result e a
Success a
x))
  (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String (a -> b))
f) <*> :: TokenizerM (a -> b) -> TokenizerM a -> TokenizerM b
<*> (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
y) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String b))
-> TokenizerM b
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                           case (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String (a -> b))
f TokenizerConfig
c TokenizerState
s) of
                              (TokenizerState
s', Result String (a -> b)
Failure   ) -> (TokenizerState
s', Result String b
forall e a. Result e a
Failure)
                              (TokenizerState
s', Error String
e   ) -> (TokenizerState
s', String -> Result String b
forall e a. e -> Result e a
Error String
e)
                              (TokenizerState
s', Success a -> b
f') ->
                                  case (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
y TokenizerConfig
c TokenizerState
s') of
                                    (TokenizerState
s'', Result String a
Failure   ) -> (TokenizerState
s'', Result String b
forall e a. Result e a
Failure)
                                    (TokenizerState
s'', Error String
e'  ) -> (TokenizerState
s'', String -> Result String b
forall e a. e -> Result e a
Error String
e')
                                    (TokenizerState
s'', Success a
y') -> (TokenizerState
s'', b -> Result String b
forall e a. a -> Result e a
Success (a -> b
f' a
y')))


instance Monad TokenizerM where
  return :: a -> TokenizerM a
return = a -> TokenizerM a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
  (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x) >>= :: TokenizerM a -> (a -> TokenizerM b) -> TokenizerM b
>>= a -> TokenizerM b
f = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String b))
-> TokenizerM b
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                       case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x TokenizerConfig
c TokenizerState
s of
                            (TokenizerState
s', Result String a
Failure   ) -> (TokenizerState
s', Result String b
forall e a. Result e a
Failure)
                            (TokenizerState
s', Error String
e   ) -> (TokenizerState
s', String -> Result String b
forall e a. e -> Result e a
Error String
e)
                            (TokenizerState
s', Success a
x') -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String b)
g TokenizerConfig
c TokenizerState
s'
                              where TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String b)
g = a -> TokenizerM b
f a
x')

instance Alternative TokenizerM where
  empty :: TokenizerM a
empty = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, Result String a
forall e a. Result e a
Failure))
  <|> :: TokenizerM a -> TokenizerM a -> TokenizerM a
(<|>) (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x) (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
y) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                           case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x TokenizerConfig
c TokenizerState
s of
                                (TokenizerState
_, Result String a
Failure   )  -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
y TokenizerConfig
c TokenizerState
s
                                (TokenizerState
s', Error String
e   ) -> (TokenizerState
s', String -> Result String a
forall e a. e -> Result e a
Error String
e)
                                (TokenizerState
s', Success a
x') -> (TokenizerState
s', a -> Result String a
forall e a. a -> Result e a
Success a
x'))
  many :: TokenizerM a -> TokenizerM [a]
many (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String [a]))
-> TokenizerM [a]
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                    case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x TokenizerConfig
c TokenizerState
s of
                       (TokenizerState
_, Result String a
Failure   )  -> (TokenizerState
s, [a] -> Result String [a]
forall e a. a -> Result e a
Success [])
                       (TokenizerState
s', Error String
e   ) -> (TokenizerState
s', String -> Result String [a]
forall e a. e -> Result e a
Error String
e)
                       (TokenizerState
s', Success a
x') -> (Result String [a] -> Result String [a])
-> (TokenizerState, Result String [a])
-> (TokenizerState, Result String [a])
forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd (([a] -> [a]) -> Result String [a] -> Result String [a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (a
x'a -> [a] -> [a]
forall a. a -> [a] -> [a]
:)) (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String [a])
g TokenizerConfig
c TokenizerState
s')
                         where TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String [a])
g = TokenizerM a -> TokenizerM [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many ((TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x))
  some :: TokenizerM a -> TokenizerM [a]
some TokenizerM a
x = (:) (a -> [a] -> [a]) -> TokenizerM a -> TokenizerM ([a] -> [a])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TokenizerM a
x TokenizerM ([a] -> [a]) -> TokenizerM [a] -> TokenizerM [a]
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> TokenizerM a -> TokenizerM [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many TokenizerM a
x

instance MonadPlus TokenizerM where
  mzero :: TokenizerM a
mzero = TokenizerM a
forall (f :: * -> *) a. Alternative f => f a
empty
  mplus :: TokenizerM a -> TokenizerM a -> TokenizerM a
mplus = TokenizerM a -> TokenizerM a -> TokenizerM a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
(<|>)

instance MonadReader TokenizerConfig TokenizerM where
  ask :: TokenizerM TokenizerConfig
ask = (TokenizerConfig
 -> TokenizerState
 -> (TokenizerState, Result String TokenizerConfig))
-> TokenizerM TokenizerConfig
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> (TokenizerState
s, TokenizerConfig -> Result String TokenizerConfig
forall e a. a -> Result e a
Success TokenizerConfig
c))
  local :: (TokenizerConfig -> TokenizerConfig)
-> TokenizerM a -> TokenizerM a
local TokenizerConfig -> TokenizerConfig
f (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x (TokenizerConfig -> TokenizerConfig
f TokenizerConfig
c) TokenizerState
s)

instance MonadState TokenizerState TokenizerM where
  get :: TokenizerM TokenizerState
get = (TokenizerConfig
 -> TokenizerState
 -> (TokenizerState, Result String TokenizerState))
-> TokenizerM TokenizerState
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, TokenizerState -> Result String TokenizerState
forall e a. a -> Result e a
Success TokenizerState
s))
  put :: TokenizerState -> TokenizerM ()
put TokenizerState
x = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String ()))
-> TokenizerM ()
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
_ -> (TokenizerState
x, () -> Result String ()
forall e a. a -> Result e a
Success ()))

instance MonadError String TokenizerM where
  throwError :: String -> TokenizerM a
throwError String
e = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, String -> Result String a
forall e a. e -> Result e a
Error String
e))
  catchError :: TokenizerM a -> (String -> TokenizerM a) -> TokenizerM a
catchError (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x) String -> TokenizerM a
f = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result String a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
x TokenizerConfig
c TokenizerState
s of
                                      (TokenizerState
_, Error String
e) -> let TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
y = String -> TokenizerM a
f String
e in TokenizerConfig
-> TokenizerState -> (TokenizerState, Result String a)
y TokenizerConfig
c TokenizerState
s
                                      (TokenizerState, Result String a)
z            -> (TokenizerState, Result String a)
z)

-- | Tokenize some text using 'Syntax'.
tokenize :: TokenizerConfig -> Syntax -> Text -> Either String [SourceLine]
tokenize :: TokenizerConfig -> Syntax -> Text -> Either String [SourceLine]
tokenize TokenizerConfig
config Syntax
syntax Text
inp =
  case TokenizerM [SourceLine]
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result String [SourceLine])
forall a.
TokenizerM a
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result String a)
runTokenizerM TokenizerM [SourceLine]
action TokenizerConfig
config TokenizerState
initState of
       (TokenizerState
_, Success [SourceLine]
ls) -> [SourceLine] -> Either String [SourceLine]
forall a b. b -> Either a b
Right [SourceLine]
ls
       (TokenizerState
_, Error String
e)    -> String -> Either String [SourceLine]
forall a b. a -> Either a b
Left String
e
       (TokenizerState
_, Result String [SourceLine]
Failure)    -> String -> Either String [SourceLine]
forall a b. a -> Either a b
Left String
"Could not tokenize code"
  where
    action :: TokenizerM [SourceLine]
action = ((ByteString, Int) -> TokenizerM SourceLine)
-> [(ByteString, Int)] -> TokenizerM [SourceLine]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (ByteString, Int) -> TokenizerM SourceLine
tokenizeLine ([ByteString] -> [Int] -> [(ByteString, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip (ByteString -> [ByteString]
BS.lines (Text -> ByteString
encodeUtf8 Text
inp)) [Int
1..])
    initState :: TokenizerState
initState = TokenizerState
startingState{ endline :: Bool
endline = Text -> Bool
Text.null Text
inp
                             , contextStack :: ContextStack
contextStack =
                                   case Text -> Syntax -> Maybe Context
lookupContext
                                         (Syntax -> Text
sStartingContext Syntax
syntax) Syntax
syntax of
                                         Just Context
c  -> [Context] -> ContextStack
ContextStack [Context
c]
                                         Maybe Context
Nothing -> [Context] -> ContextStack
ContextStack [] }


info :: String -> TokenizerM ()
info :: String -> TokenizerM ()
info String
s = do
  Bool
tr <- (TokenizerConfig -> Bool) -> TokenizerM Bool
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> Bool
traceOutput
  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
tr (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String -> TokenizerM () -> TokenizerM ()
forall a. String -> a -> a
trace String
s (() -> TokenizerM ()
forall (m :: * -> *) a. Monad m => a -> m a
return ())

infoContextStack :: TokenizerM ()
infoContextStack :: TokenizerM ()
infoContextStack = do
  Bool
tr <- (TokenizerConfig -> Bool) -> TokenizerM Bool
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> Bool
traceOutput
  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
tr (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ do
    ContextStack [Context]
stack <- (TokenizerState -> ContextStack) -> TokenizerM ContextStack
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
    String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"CONTEXT STACK " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Text] -> String
forall a. Show a => a -> String
show ((Context -> Text) -> [Context] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Context -> Text
cName [Context]
stack)

popContextStack :: TokenizerM ()
popContextStack :: TokenizerM ()
popContextStack = do
  ContextStack [Context]
cs <- (TokenizerState -> ContextStack) -> TokenizerM ContextStack
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  case [Context]
cs of
       []     -> String -> TokenizerM ()
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError String
"Empty context stack (the impossible happened)"
       -- programming error
       (Context
_:[]) -> () -> TokenizerM ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
       (Context
_:[Context]
rest) -> do
         (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (\TokenizerState
st -> TokenizerState
st{ contextStack :: ContextStack
contextStack = [Context] -> ContextStack
ContextStack [Context]
rest })
         TokenizerM Context
currentContext TokenizerM Context -> (Context -> TokenizerM ()) -> TokenizerM ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Context -> TokenizerM ()
checkLineEnd
         TokenizerM ()
infoContextStack

pushContextStack :: Context -> TokenizerM ()
pushContextStack :: Context -> TokenizerM ()
pushContextStack Context
cont = do
  (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (\TokenizerState
st -> TokenizerState
st{ contextStack :: ContextStack
contextStack =
                      [Context] -> ContextStack
ContextStack (Context
cont Context -> [Context] -> [Context]
forall a. a -> [a] -> [a]
: ContextStack -> [Context]
unContextStack (TokenizerState -> ContextStack
contextStack TokenizerState
st)) } )
  -- not sure why we need this in pop but not here, but if we
  -- put it here we can get loops...
  -- checkLineEnd cont
  TokenizerM ()
infoContextStack

currentContext :: TokenizerM Context
currentContext :: TokenizerM Context
currentContext = do
  ContextStack [Context]
cs <- (TokenizerState -> ContextStack) -> TokenizerM ContextStack
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  case [Context]
cs of
       []    -> String -> TokenizerM Context
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError String
"Empty context stack" -- programming error
       (Context
c:[Context]
_) -> Context -> TokenizerM Context
forall (m :: * -> *) a. Monad m => a -> m a
return Context
c

doContextSwitch :: ContextSwitch -> TokenizerM ()
doContextSwitch :: ContextSwitch -> TokenizerM ()
doContextSwitch ContextSwitch
Pop = TokenizerM ()
popContextStack
doContextSwitch (Push (Text
syn,Text
c)) = do
  SyntaxMap
syntaxes <- (TokenizerConfig -> SyntaxMap) -> TokenizerM SyntaxMap
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> SyntaxMap
syntaxMap
  case Text -> SyntaxMap -> Maybe Syntax
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
syn SyntaxMap
syntaxes Maybe Syntax -> (Syntax -> Maybe Context) -> Maybe Context
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Text -> Syntax -> Maybe Context
lookupContext Text
c of
       Just Context
con -> Context -> TokenizerM ()
pushContextStack Context
con
       Maybe Context
Nothing  -> String -> TokenizerM ()
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"Unknown syntax or context: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ (Text, Text) -> String
forall a. Show a => a -> String
show (Text
syn, Text
c)

doContextSwitches :: [ContextSwitch] -> TokenizerM ()
doContextSwitches :: [ContextSwitch] -> TokenizerM ()
doContextSwitches [] = () -> TokenizerM ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
doContextSwitches [ContextSwitch]
xs = do
  (ContextSwitch -> TokenizerM ())
-> [ContextSwitch] -> TokenizerM ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ ContextSwitch -> TokenizerM ()
doContextSwitch [ContextSwitch]
xs

lookupContext :: Text -> Syntax -> Maybe Context
lookupContext :: Text -> Syntax -> Maybe Context
lookupContext Text
name Syntax
syntax | Text -> Bool
Text.null Text
name =
  if Text -> Bool
Text.null (Syntax -> Text
sStartingContext Syntax
syntax)
     then Maybe Context
forall a. Maybe a
Nothing
     else Text -> Syntax -> Maybe Context
lookupContext (Syntax -> Text
sStartingContext Syntax
syntax) Syntax
syntax
lookupContext Text
name Syntax
syntax = Text -> Map Text Context -> Maybe Context
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
name (Map Text Context -> Maybe Context)
-> Map Text Context -> Maybe Context
forall a b. (a -> b) -> a -> b
$ Syntax -> Map Text Context
sContexts Syntax
syntax

startingState :: TokenizerState
startingState :: TokenizerState
startingState =
  TokenizerState :: ByteString
-> Bool
-> Char
-> ContextStack
-> [ByteString]
-> Int
-> Bool
-> Maybe Int
-> Map RE Regex
-> TokenizerState
TokenizerState{ input :: ByteString
input = ByteString
BS.empty
                , endline :: Bool
endline = Bool
True
                , prevChar :: Char
prevChar = Char
'\n'
                , contextStack :: ContextStack
contextStack = [Context] -> ContextStack
ContextStack []
                , captures :: [ByteString]
captures = []
                , column :: Int
column = Int
0
                , lineContinuation :: Bool
lineContinuation = Bool
False
                , firstNonspaceColumn :: Maybe Int
firstNonspaceColumn = Maybe Int
forall a. Maybe a
Nothing
                , compiledRegexes :: Map RE Regex
compiledRegexes = Map RE Regex
forall k a. Map k a
Map.empty
                }

tokenizeLine :: (ByteString, Int) -> TokenizerM [Token]
tokenizeLine :: (ByteString, Int) -> TokenizerM SourceLine
tokenizeLine (ByteString
ln, Int
linenum) = do
  (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input :: ByteString
input = ByteString
ln, endline :: Bool
endline = ByteString -> Bool
BS.null ByteString
ln, prevChar :: Char
prevChar = Char
'\n' }
  Context
cur <- TokenizerM Context
currentContext
  Bool
lineCont <- (TokenizerState -> Bool) -> TokenizerM Bool
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
lineContinuation
  if Bool
lineCont
     then (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ lineContinuation :: Bool
lineContinuation = Bool
False }
     else do
       (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ column :: Int
column = Int
0
                         , firstNonspaceColumn :: Maybe Int
firstNonspaceColumn =
                              (Char -> Bool) -> ByteString -> Maybe Int
BS.findIndex (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) ByteString
ln }
       [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineBeginContext Context
cur)
  if ByteString -> Bool
BS.null ByteString
ln
     then [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineEmptyContext Context
cur)
     else [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineBeginContext Context
cur)
  SourceLine
ts <- SourceLine -> SourceLine
normalizeHighlighting (SourceLine -> SourceLine)
-> ([Maybe Token] -> SourceLine) -> [Maybe Token] -> SourceLine
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Maybe Token] -> SourceLine
forall a. [Maybe a] -> [a]
catMaybes ([Maybe Token] -> SourceLine)
-> TokenizerM [Maybe Token] -> TokenizerM SourceLine
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TokenizerM (Maybe Token) -> TokenizerM [Maybe Token]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many TokenizerM (Maybe Token)
getToken
  Bool
eol <- (TokenizerState -> Bool) -> TokenizerM Bool
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline
  if Bool
eol
     then do
       TokenizerM Context
currentContext TokenizerM Context -> (Context -> TokenizerM ()) -> TokenizerM ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Context -> TokenizerM ()
checkLineEnd
       SourceLine -> TokenizerM SourceLine
forall (m :: * -> *) a. Monad m => a -> m a
return SourceLine
ts
     else do  -- fail if we haven't consumed whole line
       Int
col <- (TokenizerState -> Int) -> TokenizerM Int
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column
       String -> TokenizerM SourceLine
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError (String -> TokenizerM SourceLine)
-> String -> TokenizerM SourceLine
forall a b. (a -> b) -> a -> b
$ String
"Could not match anything at line " String -> ShowS
forall a. [a] -> [a] -> [a]
++
         Int -> String
forall a. Show a => a -> String
show Int
linenum String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" column " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
col

getToken :: TokenizerM (Maybe Token)
getToken :: TokenizerM (Maybe Token)
getToken = do
  ByteString
inp <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  (TokenizerState -> Bool) -> TokenizerM Bool
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline TokenizerM Bool -> (Bool -> TokenizerM ()) -> TokenizerM ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> (Bool -> Bool) -> Bool -> TokenizerM ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not
  Context
context <- TokenizerM Context
currentContext
  [TokenizerM (Maybe Token)] -> TokenizerM (Maybe Token)
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum ((Rule -> TokenizerM (Maybe Token))
-> [Rule] -> [TokenizerM (Maybe Token)]
forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule Rule
r ByteString
inp) (Context -> [Rule]
cRules Context
context)) TokenizerM (Maybe Token)
-> TokenizerM (Maybe Token) -> TokenizerM (Maybe Token)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
     if Context -> Bool
cFallthrough Context
context
        then do
          let fallthroughContext :: [ContextSwitch]
fallthroughContext = case Context -> [ContextSwitch]
cFallthroughContext Context
context of
                                        [] -> [ContextSwitch
Pop]
                                        [ContextSwitch]
cs -> [ContextSwitch]
cs
          [ContextSwitch] -> TokenizerM ()
doContextSwitches [ContextSwitch]
fallthroughContext
          TokenizerM (Maybe Token)
getToken
        else do
          Text
t <- TokenizerM Text
normalChunk
          let mbtok :: Maybe Token
mbtok = Token -> Maybe Token
forall a. a -> Maybe a
Just (Context -> TokenType
cAttribute Context
context, Text
t)
          String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"FALLTHROUGH " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Maybe Token -> String
forall a. Show a => a -> String
show Maybe Token
mbtok
          Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Token
mbtok

takeChars :: Int -> TokenizerM Text
takeChars :: Int -> TokenizerM Text
takeChars Int
0 = TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
takeChars Int
numchars = do
  ByteString
inp <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  let (ByteString
bs,ByteString
rest) = Int -> ByteString -> (ByteString, ByteString)
UTF8.splitAt Int
numchars ByteString
inp
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (ByteString -> Bool
BS.null ByteString
bs)
  Text
t <- ByteString -> TokenizerM Text
decodeBS ByteString
bs
  (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input :: ByteString
input = ByteString
rest,
                      endline :: Bool
endline = ByteString -> Bool
BS.null ByteString
rest,
                      prevChar :: Char
prevChar = Text -> Char
Text.last Text
t,
                      column :: Int
column = TokenizerState -> Int
column TokenizerState
st Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
numchars }
  Text -> TokenizerM Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
t

tryRule :: Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule :: Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule Rule
_    ByteString
""  = TokenizerM (Maybe Token)
forall (m :: * -> *) a. MonadPlus m => m a
mzero
tryRule Rule
rule ByteString
inp = do
  String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"Trying rule " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Rule -> String
forall a. Show a => a -> String
show Rule
rule
  case Rule -> Maybe Int
rColumn Rule
rule of
       Maybe Int
Nothing -> () -> TokenizerM ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
       Just Int
n  -> (TokenizerState -> Int) -> TokenizerM Int
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column TokenizerM Int -> (Int -> TokenizerM ()) -> TokenizerM ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> (Int -> Bool) -> Int -> TokenizerM ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
n)

  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Rule -> Bool
rFirstNonspace Rule
rule) (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ do
    Maybe Int
firstNonspace <- (TokenizerState -> Maybe Int) -> TokenizerM (Maybe Int)
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Maybe Int
firstNonspaceColumn
    Int
col <- (TokenizerState -> Int) -> TokenizerM Int
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column
    Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Maybe Int
firstNonspace Maybe Int -> Maybe Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int -> Maybe Int
forall a. a -> Maybe a
Just Int
col)

  Maybe TokenizerState
oldstate <- if Rule -> Bool
rLookahead Rule
rule
                 then TokenizerState -> Maybe TokenizerState
forall a. a -> Maybe a
Just (TokenizerState -> Maybe TokenizerState)
-> TokenizerM TokenizerState -> TokenizerM (Maybe TokenizerState)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TokenizerM TokenizerState
forall s (m :: * -> *). MonadState s m => m s
get -- needed for lookahead rules
                 else Maybe TokenizerState -> TokenizerM (Maybe TokenizerState)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe TokenizerState
forall a. Maybe a
Nothing

  let attr :: TokenType
attr = Rule -> TokenType
rAttribute Rule
rule
  Maybe Token
mbtok <- case Rule -> Matcher
rMatcher Rule
rule of
                DetectChar Char
c -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ Bool -> Char -> ByteString -> TokenizerM Text
detectChar (Rule -> Bool
rDynamic Rule
rule) Char
c ByteString
inp
                Detect2Chars Char
c Char
d -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$
                                      Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars (Rule -> Bool
rDynamic Rule
rule) Char
c Char
d ByteString
inp
                AnyChar String
cs -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ String -> ByteString -> TokenizerM Text
anyChar String
cs ByteString
inp
                RangeDetect Char
c Char
d -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ Char -> Char -> ByteString -> TokenizerM Text
rangeDetect Char
c Char
d ByteString
inp
                RegExpr RE
re -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ Bool -> RE -> ByteString -> TokenizerM Text
regExpr (Rule -> Bool
rDynamic Rule
rule) RE
re ByteString
inp
                Matcher
Int -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseInt ByteString
inp
                Matcher
HlCOct -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseOct ByteString
inp
                Matcher
HlCHex -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseHex ByteString
inp
                Matcher
HlCStringChar -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseCStringChar ByteString
inp
                Matcher
HlCChar -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseCChar ByteString
inp
                Matcher
Float -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseFloat ByteString
inp
                Keyword KeywordAttr
kwattr WordSet Text
kws -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword KeywordAttr
kwattr WordSet Text
kws ByteString
inp
                StringDetect Text
s -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$
                                    Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect (Rule -> Bool
rDynamic Rule
rule) (Rule -> Bool
rCaseSensitive Rule
rule)
                                                 Text
s ByteString
inp
                WordDetect Text
s -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$
                                    Bool -> Text -> ByteString -> TokenizerM Text
wordDetect (Rule -> Bool
rCaseSensitive Rule
rule) Text
s ByteString
inp
                Matcher
LineContinue -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
lineContinue ByteString
inp
                Matcher
DetectSpaces -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
detectSpaces ByteString
inp
                Matcher
DetectIdentifier -> TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe Token))
-> TokenizerM Text -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
detectIdentifier ByteString
inp
                IncludeRules (Text, Text)
cname -> Maybe TokenType
-> (Text, Text) -> ByteString -> TokenizerM (Maybe Token)
includeRules
                   (if Rule -> Bool
rIncludeAttribute Rule
rule then TokenType -> Maybe TokenType
forall a. a -> Maybe a
Just TokenType
attr else Maybe TokenType
forall a. Maybe a
Nothing)
                   (Text, Text)
cname ByteString
inp
  Maybe Token
mbchildren <- do
    ByteString
inp' <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
    [TokenizerM (Maybe Token)] -> TokenizerM (Maybe Token)
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum ((Rule -> TokenizerM (Maybe Token))
-> [Rule] -> [TokenizerM (Maybe Token)]
forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule Rule
r ByteString
inp') (Rule -> [Rule]
rChildren Rule
rule)) TokenizerM (Maybe Token)
-> TokenizerM (Maybe Token) -> TokenizerM (Maybe Token)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Token
forall a. Maybe a
Nothing

  Maybe Token
mbtok' <- case Maybe Token
mbtok of
                 Maybe Token
Nothing -> Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Token
forall a. Maybe a
Nothing
                 Just (TokenType
tt, Text
s)
                   | Rule -> Bool
rLookahead Rule
rule -> do
                     (ByteString
oldinput, Bool
oldendline, Char
oldprevChar, Int
oldColumn) <-
                         case Maybe TokenizerState
oldstate of
                              Maybe TokenizerState
Nothing -> String -> TokenizerM (ByteString, Bool, Char, Int)
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError
                                    String
"oldstate not saved with lookahead rule"
                              Just TokenizerState
st -> (ByteString, Bool, Char, Int)
-> TokenizerM (ByteString, Bool, Char, Int)
forall (m :: * -> *) a. Monad m => a -> m a
return
                                    (TokenizerState -> ByteString
input TokenizerState
st, TokenizerState -> Bool
endline TokenizerState
st,
                                     TokenizerState -> Char
prevChar TokenizerState
st, TokenizerState -> Int
column TokenizerState
st)
                     (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input :: ByteString
input = ByteString
oldinput
                                       , endline :: Bool
endline = Bool
oldendline
                                       , prevChar :: Char
prevChar = Char
oldprevChar
                                       , column :: Int
column = Int
oldColumn }
                     Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Token
forall a. Maybe a
Nothing
                   | Bool
otherwise -> do
                     case Maybe Token
mbchildren of
                          Maybe Token
Nothing -> Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Token -> TokenizerM (Maybe Token))
-> Maybe Token -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ Token -> Maybe Token
forall a. a -> Maybe a
Just (TokenType
tt, Text
s)
                          Just (TokenType
_, Text
cresult) -> Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Token -> TokenizerM (Maybe Token))
-> Maybe Token -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ Token -> Maybe Token
forall a. a -> Maybe a
Just (TokenType
tt, Text
s Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
cresult)

  String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> ShowS
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
' ') (Matcher -> String
forall a. Show a => a -> String
show (Rule -> Matcher
rMatcher Rule
rule)) String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" MATCHED " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Maybe Token -> String
forall a. Show a => a -> String
show Maybe Token
mbtok'
  [ContextSwitch] -> TokenizerM ()
doContextSwitches (Rule -> [ContextSwitch]
rContextSwitch Rule
rule)
  Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Token
mbtok'

withAttr :: TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr :: TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr TokenType
tt TokenizerM Text
p = do
  Text
res <- TokenizerM Text
p
  if Text -> Bool
Text.null Text
res
     then Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Token
forall a. Maybe a
Nothing
     else Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Token -> TokenizerM (Maybe Token))
-> Maybe Token -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ Token -> Maybe Token
forall a. a -> Maybe a
Just (TokenType
tt, Text
res)

wordDetect :: Bool -> Text -> ByteString -> TokenizerM Text
wordDetect :: Bool -> Text -> ByteString -> TokenizerM Text
wordDetect Bool
caseSensitive Text
s ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  Text
t <- ByteString -> TokenizerM Text
decodeBS (ByteString -> TokenizerM Text) -> ByteString -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
UTF8.take (Text -> Int
Text.length Text
s) ByteString
inp
  -- we assume here that the case fold will not change length,
  -- which is safe for ASCII keywords and the like...
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ if Bool
caseSensitive
             then Text
s Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
t
             else Text -> CI Text
forall s. FoldCase s => s -> CI s
mk Text
s CI Text -> CI Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> CI Text
forall s. FoldCase s => s -> CI s
mk Text
t
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Text -> Bool
Text.null Text
t)
  let c :: Char
c = Text -> Char
Text.last Text
t
  let rest :: ByteString
rest = Int -> ByteString -> ByteString
UTF8.drop (Text -> Int
Text.length Text
s) ByteString
inp
  let d :: Char
d = case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
rest of
               Maybe (Char, ByteString)
Nothing    -> Char
'\n'
               Just (Char
x,ByteString
_) -> Char
x
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Char -> Char -> Bool
isWordBoundary Char
c Char
d
  Int -> TokenizerM Text
takeChars (Text -> Int
Text.length Text
t)

stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect Bool
dynamic Bool
caseSensitive Text
s ByteString
inp = do
  Text
s' <- if Bool
dynamic
        then do
          Text
dynStr <- Text -> TokenizerM Text
subDynamicText Text
s
          String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"Dynamic string: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Text -> String
forall a. Show a => a -> String
show Text
dynStr
          Text -> TokenizerM Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
dynStr
        else Text -> TokenizerM Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
s
  Text
t <- ByteString -> TokenizerM Text
decodeBS (ByteString -> TokenizerM Text) -> ByteString -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
UTF8.take (Text -> Int
Text.length Text
s') ByteString
inp
  -- we assume here that the case fold will not change length,
  -- which is safe for ASCII keywords and the like...
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ if Bool
caseSensitive
             then Text
s' Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
t
             else Text -> CI Text
forall s. FoldCase s => s -> CI s
mk Text
s' CI Text -> CI Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> CI Text
forall s. FoldCase s => s -> CI s
mk Text
t
  Int -> TokenizerM Text
takeChars (Text -> Int
Text.length Text
s')

subDynamicText :: Text -> TokenizerM Text
subDynamicText :: Text -> TokenizerM Text
subDynamicText Text
t = do
  let substitute :: Text -> TokenizerM Text
substitute Text
x = case Text -> Maybe (Char, Text)
Text.uncons Text
x of
        Just (Char
c, Text
rest) | Char -> Bool
isDigit Char
c -> let capNum :: Int
capNum = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
                                      in (Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
rest) (Text -> Text) -> TokenizerM Text -> TokenizerM Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> TokenizerM Text
getCapture Int
capNum
        Maybe (Char, Text)
_ -> Text -> TokenizerM Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> TokenizerM Text) -> Text -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Char -> Text -> Text
Text.cons Char
'%' Text
x
  case (Char -> Bool) -> Text -> [Text]
Text.split (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'%') Text
t of
    []     -> Text -> TokenizerM Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
Text.empty
    Text
x:[Text]
rest -> (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<>) (Text -> Text) -> ([Text] -> Text) -> [Text] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> Text
Text.concat ([Text] -> Text) -> TokenizerM [Text] -> TokenizerM Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Text -> TokenizerM Text) -> [Text] -> TokenizerM [Text]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM Text -> TokenizerM Text
substitute [Text]
rest

-- This assumes that nothing significant will happen
-- in the middle of a string of spaces or a string
-- of alphanumerics.  This seems true  for all normal
-- programming languages, and the optimization speeds
-- things up a lot, relative to just parsing one char.
normalChunk :: TokenizerM Text
normalChunk :: TokenizerM Text
normalChunk = do
  ByteString
inp <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Maybe (Char, ByteString)
Nothing -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
    Just (Char
c, ByteString
_)
      | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ' ->
        let bs :: ByteString
bs = (Char -> Bool) -> ByteString -> ByteString
BS.takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ') ByteString
inp
        in  Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
bs)
      | Char -> Bool
isAscii Char
c Bool -> Bool -> Bool
&& Char -> Bool
isAlphaNum Char
c ->
        let (ByteString
bs, ByteString
_) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.span Char -> Bool
isAlphaNum ByteString
inp
        in  Int -> TokenizerM Text
takeChars (ByteString -> Int
UTF8.length ByteString
bs)
      | Bool
otherwise -> Int -> TokenizerM Text
takeChars Int
1

includeRules :: Maybe TokenType -> ContextName -> ByteString
             -> TokenizerM (Maybe Token)
includeRules :: Maybe TokenType
-> (Text, Text) -> ByteString -> TokenizerM (Maybe Token)
includeRules Maybe TokenType
mbattr (Text
syn, Text
con) ByteString
inp = do
  SyntaxMap
syntaxes <- (TokenizerConfig -> SyntaxMap) -> TokenizerM SyntaxMap
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> SyntaxMap
syntaxMap
  case Text -> SyntaxMap -> Maybe Syntax
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
syn SyntaxMap
syntaxes Maybe Syntax -> (Syntax -> Maybe Context) -> Maybe Context
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Text -> Syntax -> Maybe Context
lookupContext Text
con of
       Maybe Context
Nothing  -> do
          Context
cur <- TokenizerM Context
currentContext
          String -> TokenizerM (Maybe Token)
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError (String -> TokenizerM (Maybe Token))
-> String -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ String
"IncludeRules in " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Text -> String
Text.unpack (Context -> Text
cSyntax Context
cur) String -> ShowS
forall a. [a] -> [a] -> [a]
++
           String
" requires undefined context " String -> ShowS
forall a. [a] -> [a] -> [a]
++
           Text -> String
Text.unpack Text
con String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"##" String -> ShowS
forall a. [a] -> [a] -> [a]
++ Text -> String
Text.unpack Text
syn
       Just Context
c   -> do
         Maybe Token
mbtok <- [TokenizerM (Maybe Token)] -> TokenizerM (Maybe Token)
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum ((Rule -> TokenizerM (Maybe Token))
-> [Rule] -> [TokenizerM (Maybe Token)]
forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule Rule
r ByteString
inp) (Context -> [Rule]
cRules Context
c))
         Maybe Token -> TokenizerM (Maybe Token)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Token -> TokenizerM (Maybe Token))
-> Maybe Token -> TokenizerM (Maybe Token)
forall a b. (a -> b) -> a -> b
$ case (Maybe Token
mbtok, Maybe TokenType
mbattr) of
                    (Just (TokenType
NormalTok, Text
xs), Just TokenType
attr) -> Token -> Maybe Token
forall a. a -> Maybe a
Just (TokenType
attr, Text
xs)
                    (Maybe Token, Maybe TokenType)
_                                 -> Maybe Token
mbtok

checkLineEnd :: Context -> TokenizerM ()
checkLineEnd :: Context -> TokenizerM ()
checkLineEnd Context
c = do
  if [ContextSwitch] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Context -> [ContextSwitch]
cLineEndContext Context
c)
     then () -> TokenizerM ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
     else do
       Bool
eol <- (TokenizerState -> Bool) -> TokenizerM Bool
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline
       String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"checkLineEnd for " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Text -> String
forall a. Show a => a -> String
show (Context -> Text
cName Context
c) String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" eol = " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Bool -> String
forall a. Show a => a -> String
show Bool
eol String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" cLineEndContext = " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [ContextSwitch] -> String
forall a. Show a => a -> String
show (Context -> [ContextSwitch]
cLineEndContext Context
c)
       Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
eol (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ do
         Bool
lineCont' <- (TokenizerState -> Bool) -> TokenizerM Bool
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
lineContinuation
         Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless Bool
lineCont' (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$
           [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineEndContext Context
c)

detectChar :: Bool -> Char -> ByteString -> TokenizerM Text
detectChar :: Bool -> Char -> ByteString -> TokenizerM Text
detectChar Bool
dynamic Char
c ByteString
inp = do
  Char
c' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
c
           else Char -> TokenizerM Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Just (Char
x,ByteString
_) | Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c' -> Int -> TokenizerM Text
takeChars Int
1
    Maybe (Char, ByteString)
_          -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

getDynamicChar :: Char -> TokenizerM Char
getDynamicChar :: Char -> TokenizerM Char
getDynamicChar Char
c = do
  let capNum :: Int
capNum = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
  Text
res <- Int -> TokenizerM Text
getCapture Int
capNum
  case Text -> Maybe (Char, Text)
Text.uncons Text
res of
       Maybe (Char, Text)
Nothing    -> TokenizerM Char
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Just (Char
d,Text
_) -> Char -> TokenizerM Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
d

detect2Chars :: Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars :: Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars Bool
dynamic Char
c Char
d ByteString
inp = do
  Char
c' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
c
           else Char -> TokenizerM Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
  Char
d' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
d Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
d Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
d
           else Char -> TokenizerM Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
d
  if (Text -> ByteString
encodeUtf8 (String -> Text
Text.pack [Char
c',Char
d'])) ByteString -> ByteString -> Bool
`BS.isPrefixOf` ByteString
inp
     then Int -> TokenizerM Text
takeChars Int
2
     else TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

rangeDetect :: Char -> Char -> ByteString -> TokenizerM Text
rangeDetect :: Char -> Char -> ByteString -> TokenizerM Text
rangeDetect Char
c Char
d ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Just (Char
x, ByteString
rest)
      | Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c -> case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.span (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
d) ByteString
rest of
                       (ByteString
in_t, ByteString
out_t)
                         | ByteString -> Bool
BS.null ByteString
out_t -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
                         | Bool
otherwise -> do
                              Text
t <- ByteString -> TokenizerM Text
decodeBS ByteString
in_t
                              Int -> TokenizerM Text
takeChars (Text -> Int
Text.length Text
t Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)
    Maybe (Char, ByteString)
_ -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

-- NOTE: currently limited to ASCII
detectSpaces :: ByteString -> TokenizerM Text
detectSpaces :: ByteString -> TokenizerM Text
detectSpaces ByteString
inp = do
  case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.span (\Char
c -> Char -> Bool
isSpace Char
c) ByteString
inp of
       (ByteString
t, ByteString
_)
         | ByteString -> Bool
BS.null ByteString
t -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
         | Bool
otherwise -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
t)

-- NOTE: limited to ASCII as per kate documentation
detectIdentifier :: ByteString -> TokenizerM Text
detectIdentifier :: ByteString -> TokenizerM Text
detectIdentifier ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
inp of
    Just (Char
c, ByteString
t) | Char -> Bool
isLetter Char
c Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'_' ->
      Int -> TokenizerM Text
takeChars (Int -> TokenizerM Text) -> Int -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Int
1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int -> (Int -> Int) -> Maybe Int -> Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (ByteString -> Int
BS.length ByteString
t) Int -> Int
forall a. a -> a
id
                ((Char -> Bool) -> ByteString -> Maybe Int
BS.findIndex (\Char
d -> Bool -> Bool
not (Char -> Bool
isAlphaNum Char
d Bool -> Bool -> Bool
|| Char
d Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'_')) ByteString
t)
    Maybe (Char, ByteString)
_ -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

lineContinue :: ByteString -> TokenizerM Text
lineContinue :: ByteString -> TokenizerM Text
lineContinue ByteString
inp = do
  if ByteString
inp ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"\\"
     then do
       (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ lineContinuation :: Bool
lineContinuation = Bool
True }
       Int -> TokenizerM Text
takeChars Int
1
     else TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

anyChar :: [Char] -> ByteString -> TokenizerM Text
anyChar :: String -> ByteString -> TokenizerM Text
anyChar String
cs ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
     Just (Char
x, ByteString
_) | Char
x Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
cs -> Int -> TokenizerM Text
takeChars Int
1
     Maybe (Char, ByteString)
_           -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr Bool
dynamic RE
re ByteString
inp = do
  ByteString
reStr <- if Bool
dynamic
              then do
                ByteString
reStr' <- ByteString -> TokenizerM ByteString
subDynamic (RE -> ByteString
reString RE
re)
                String -> TokenizerM ()
info (String -> TokenizerM ()) -> String -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ String
"Dynamic regex: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> String
forall a. Show a => a -> String
show ByteString
reStr'
                ByteString -> TokenizerM ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
reStr'
              else ByteString -> TokenizerM ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (RE -> ByteString
reString RE
re)
  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int -> ByteString -> ByteString
BS.take Int
2 ByteString
reStr ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"\\b") (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  Regex
regex <- if Bool
dynamic
              then Regex -> TokenizerM Regex
forall (m :: * -> *) a. Monad m => a -> m a
return (Regex -> TokenizerM Regex) -> Regex -> TokenizerM Regex
forall a b. (a -> b) -> a -> b
$ Bool -> ByteString -> Regex
compileRegex (RE -> Bool
reCaseSensitive RE
re) ByteString
reStr
              else do
                Map RE Regex
compiledREs <- (TokenizerState -> Map RE Regex) -> TokenizerM (Map RE Regex)
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Map RE Regex
compiledRegexes
                case RE -> Map RE Regex -> Maybe Regex
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup RE
re Map RE Regex
compiledREs of
                     Maybe Regex
Nothing -> do
                       let cre :: Regex
cre = Bool -> ByteString -> Regex
compileRegex (RE -> Bool
reCaseSensitive RE
re) ByteString
reStr
                       (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ compiledRegexes :: Map RE Regex
compiledRegexes =
                             RE -> Regex -> Map RE Regex -> Map RE Regex
forall k a. Ord k => k -> a -> Map k a -> Map k a
Map.insert RE
re Regex
cre (TokenizerState -> Map RE Regex
compiledRegexes TokenizerState
st) }
                       Regex -> TokenizerM Regex
forall (m :: * -> *) a. Monad m => a -> m a
return Regex
cre
                     Just Regex
cre -> Regex -> TokenizerM Regex
forall (m :: * -> *) a. Monad m => a -> m a
return Regex
cre
  case Regex -> ByteString -> Maybe [ByteString]
matchRegex Regex
regex ByteString
inp of
       Just (ByteString
match:[ByteString]
capts) -> do
         Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless ([ByteString] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [ByteString]
capts) (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$
           (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ captures :: [ByteString]
captures = [ByteString]
capts }
         Int -> TokenizerM Text
takeChars (ByteString -> Int
UTF8.length ByteString
match)
       Maybe [ByteString]
_ -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

wordBoundary :: ByteString -> TokenizerM ()
wordBoundary :: ByteString -> TokenizerM ()
wordBoundary ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
       Maybe (Char, ByteString)
Nothing -> () -> TokenizerM ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
       Just (Char
d, ByteString
_) -> do
         Char
c <- (TokenizerState -> Char) -> TokenizerM Char
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Char
prevChar
         Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Char -> Char -> Bool
isWordBoundary Char
c Char
d

-- TODO is this right?
isWordBoundary :: Char -> Char -> Bool
isWordBoundary :: Char -> Char -> Bool
isWordBoundary Char
c Char
d =
  (Char -> Bool
isAlphaNum Char
c Bool -> Bool -> Bool
&& Bool -> Bool
not (Char -> Bool
isAlphaNum Char
d))
  Bool -> Bool -> Bool
|| (Char -> Bool
isAlphaNum Char
d Bool -> Bool -> Bool
&& Bool -> Bool
not (Char -> Bool
isAlphaNum Char
c))
  Bool -> Bool -> Bool
|| (Char -> Bool
isSpace Char
d Bool -> Bool -> Bool
&& Bool -> Bool
not (Char -> Bool
isSpace Char
c))
  Bool -> Bool -> Bool
|| (Char -> Bool
isSpace Char
c Bool -> Bool -> Bool
&& Bool -> Bool
not (Char -> Bool
isSpace Char
d))


decodeBS :: ByteString -> TokenizerM Text
decodeBS :: ByteString -> TokenizerM Text
decodeBS ByteString
bs = case ByteString -> Either UnicodeException Text
decodeUtf8' ByteString
bs of
                    Left UnicodeException
_ -> String -> TokenizerM Text
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError (String
"ByteString " String -> ShowS
forall a. [a] -> [a] -> [a]
++
                                ByteString -> String
forall a. Show a => a -> String
show ByteString
bs String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"is not UTF8")
                    Right Text
t -> Text -> TokenizerM Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
t

-- Substitute out %1, %2, etc. in regex string, escaping
-- appropriately..
subDynamic :: ByteString -> TokenizerM ByteString
subDynamic :: ByteString -> TokenizerM ByteString
subDynamic ByteString
bs =
  case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'%') ByteString
bs of
       (ByteString
y,ByteString
z)
         | ByteString -> Bool
BS.null ByteString
z -> ByteString -> TokenizerM ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
y
         | Bool
otherwise -> (ByteString
y ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<>) (ByteString -> ByteString)
-> TokenizerM ByteString -> TokenizerM ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
             case ByteString -> String
BS.unpack (Int -> ByteString -> ByteString
BS.take Int
2 ByteString
z) of
                  [Char
'%',Char
x] | Char
x Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
x Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9' -> do
                     let capNum :: Int
capNum = Char -> Int
ord Char
x Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
                     Text
replacement <- Int -> TokenizerM Text
getCapture Int
capNum
                     (ByteString -> ByteString
escapeRegex (Text -> ByteString
encodeUtf8 Text
replacement) ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<>) (ByteString -> ByteString)
-> TokenizerM ByteString -> TokenizerM ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
                         ByteString -> TokenizerM ByteString
subDynamic (Int -> ByteString -> ByteString
BS.drop Int
2 ByteString
z)
                  String
_ -> Char -> ByteString -> ByteString
BS.cons Char
'%' (ByteString -> ByteString)
-> TokenizerM ByteString -> TokenizerM ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> TokenizerM ByteString
subDynamic (Int -> ByteString -> ByteString
BS.drop Int
1 ByteString
z))

escapeRegex :: BS.ByteString -> BS.ByteString
escapeRegex :: ByteString -> ByteString
escapeRegex = (Char -> ByteString) -> ByteString -> ByteString
BS.concatMap Char -> ByteString
escapeRegexChar

escapeRegexChar :: Char -> BS.ByteString
escapeRegexChar :: Char -> ByteString
escapeRegexChar Char
'^' = ByteString
"\\^"
escapeRegexChar Char
'$' = ByteString
"\\$"
escapeRegexChar Char
'\\' = ByteString
"\\\\"
escapeRegexChar Char
'[' = ByteString
"\\["
escapeRegexChar Char
']' = ByteString
"\\]"
escapeRegexChar Char
'(' = ByteString
"\\("
escapeRegexChar Char
')' = ByteString
"\\)"
escapeRegexChar Char
'{' = ByteString
"\\{"
escapeRegexChar Char
'}' = ByteString
"\\}"
escapeRegexChar Char
'*' = ByteString
"\\*"
escapeRegexChar Char
'+' = ByteString
"\\+"
escapeRegexChar Char
'.' = ByteString
"\\."
escapeRegexChar Char
'?' = ByteString
"\\?"
escapeRegexChar Char
c
  | Char -> Bool
isAscii Char
c Bool -> Bool -> Bool
&& Char -> Bool
isPrint Char
c = Char -> ByteString
BS.singleton Char
c
  | Bool
otherwise              = String -> ByteString
BS.pack (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Int -> String
forall r. PrintfType r => String -> r
printf String
"\\x{%x}" (Char -> Int
ord Char
c)

getCapture :: Int -> TokenizerM Text
getCapture :: Int -> TokenizerM Text
getCapture Int
capnum = do
  [ByteString]
capts <- (TokenizerState -> [ByteString]) -> TokenizerM [ByteString]
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> [ByteString]
captures
  if [ByteString] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [ByteString]
capts Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
capnum
     then TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
     else ByteString -> TokenizerM Text
decodeBS (ByteString -> TokenizerM Text) -> ByteString -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ [ByteString]
capts [ByteString] -> Int -> ByteString
forall a. [a] -> Int -> a
!! (Int
capnum Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)

keyword :: KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword :: KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword KeywordAttr
kwattr WordSet Text
kws ByteString
inp = do
  Char
prev <- (TokenizerState -> Char) -> TokenizerM Char
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Char
prevChar
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Char
prev Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`Set.member` (KeywordAttr -> Set Char
keywordDelims KeywordAttr
kwattr)
  let (ByteString
w,ByteString
_) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.break (Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`Set.member` (KeywordAttr -> Set Char
keywordDelims KeywordAttr
kwattr)) ByteString
inp
  Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> Bool -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (ByteString -> Bool
BS.null ByteString
w)
  Text
w' <- ByteString -> TokenizerM Text
decodeBS ByteString
w
  let numchars :: Int
numchars = Text -> Int
Text.length Text
w'
  if Text
w' Text -> WordSet Text -> Bool
forall a. (FoldCase a, Ord a) => a -> WordSet a -> Bool
`inWordSet` WordSet Text
kws
     then Int -> TokenizerM Text
takeChars Int
numchars
     else TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero

normalizeHighlighting :: [Token] -> [Token]
normalizeHighlighting :: SourceLine -> SourceLine
normalizeHighlighting [] = []
normalizeHighlighting ((TokenType
t,Text
x):SourceLine
xs)
  | Text -> Bool
Text.null Text
x = SourceLine -> SourceLine
normalizeHighlighting SourceLine
xs
  | Bool
otherwise =
    (TokenType
t, [Text] -> Text
Text.concat (Text
x Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: (Token -> Text) -> SourceLine -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Token -> Text
forall a b. (a, b) -> b
snd SourceLine
matches)) Token -> SourceLine -> SourceLine
forall a. a -> [a] -> [a]
: SourceLine -> SourceLine
normalizeHighlighting SourceLine
rest
    where (SourceLine
matches, SourceLine
rest) = (Token -> Bool) -> SourceLine -> (SourceLine, SourceLine)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span (\(TokenType
z,Text
_) -> TokenType
z TokenType -> TokenType -> Bool
forall a. Eq a => a -> a -> Bool
== TokenType
t) SourceLine
xs

parseCStringChar :: ByteString -> TokenizerM Text
parseCStringChar :: ByteString -> TokenizerM Text
parseCStringChar ByteString
inp = do
  case Parser (ByteString, ())
-> ByteString -> Either String (ByteString, ())
forall a. Parser a -> ByteString -> Either String a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pCStringChar) ByteString
inp of
       Left String
_      -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pCStringChar :: A.Parser ()
pCStringChar :: Parser ()
pCStringChar = do
  Char
_ <- Char -> Parser Char
A.char Char
'\\'
  Char
next <- Parser Char
A.anyChar
  case Char
next of
       Char
c | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'x' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'X' -> () () -> Parser ByteString ByteString -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 (String -> Char -> Bool
A.inClass String
"0-9a-fA-F")
         | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'0' -> () () -> Parser ByteString ByteString -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile (String -> Char -> Bool
A.inClass String
"0-7")
         | String -> Char -> Bool
A.inClass String
"abefnrtv\"'?\\" Char
c -> () -> Parser ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
         | Bool
otherwise -> Parser ()
forall (m :: * -> *) a. MonadPlus m => m a
mzero

parseCChar :: ByteString -> TokenizerM Text
parseCChar :: ByteString -> TokenizerM Text
parseCChar ByteString
inp = do
  case Parser (ByteString, ())
-> ByteString -> Either String (ByteString, ())
forall a. Parser a -> ByteString -> Either String a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pCChar) ByteString
inp of
       Left String
_      -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pCChar :: A.Parser ()
pCChar :: Parser ()
pCChar = do
  () () -> Parser Char -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'\''
  Parser ()
pCStringChar Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () () -> Parser Char -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\'' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\\')
  () () -> Parser Char -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'\''

parseInt :: ByteString -> TokenizerM Text
parseInt :: ByteString -> TokenizerM Text
parseInt ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either String (ByteString, ())
forall a. Parser a -> ByteString -> Either String a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match (Parser ()
pHex Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
pOct Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
pDec)) ByteString
inp of
       Left String
_      -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pDec :: A.Parser ()
pDec :: Parser ()
pDec = do
  Parser ()
mbMinus
  ByteString
_ <- (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 (String -> Char -> Bool
A.inClass String
"0-9")
  Parser ()
guardWordBoundary

parseOct :: ByteString -> TokenizerM Text
parseOct :: ByteString -> TokenizerM Text
parseOct ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either String (ByteString, ())
forall a. Parser a -> ByteString -> Either String a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pHex) ByteString
inp of
       Left String
_      -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pOct :: A.Parser ()
pOct :: Parser ()
pOct = do
  Parser ()
mbMinus
  Char
_ <- Char -> Parser Char
A.char Char
'0'
  Char
_ <- (Char -> Bool) -> Parser Char
A.satisfy (String -> Char -> Bool
A.inClass String
"Oo")
  ByteString
_ <- (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 (String -> Char -> Bool
A.inClass String
"0-7")
  Parser ()
guardWordBoundary

parseHex :: ByteString -> TokenizerM Text
parseHex :: ByteString -> TokenizerM Text
parseHex ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either String (ByteString, ())
forall a. Parser a -> ByteString -> Either String a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pHex) ByteString
inp of
       Left String
_      -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pHex :: A.Parser ()
pHex :: Parser ()
pHex = do
  Parser ()
mbMinus
  Char
_ <- Char -> Parser Char
A.char Char
'0'
  Char
_ <- (Char -> Bool) -> Parser Char
A.satisfy (String -> Char -> Bool
A.inClass String
"Xx")
  ByteString
_ <- (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 (String -> Char -> Bool
A.inClass String
"0-9a-fA-F")
  Parser ()
guardWordBoundary

guardWordBoundary :: A.Parser ()
guardWordBoundary :: Parser ()
guardWordBoundary = do
  Maybe Char
mbw <- Parser (Maybe Char)
A.peekChar
  case Maybe Char
mbw of
       Just Char
c  ->  Bool -> Parser ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Parser ()) -> Bool -> Parser ()
forall a b. (a -> b) -> a -> b
$ Char -> Char -> Bool
isWordBoundary Char
'0' Char
c
       Maybe Char
Nothing -> () -> Parser ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()

mbMinus :: A.Parser ()
mbMinus :: Parser ()
mbMinus = (() () -> Parser Char -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'-') Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()

mbPlusMinus :: A.Parser ()
mbPlusMinus :: Parser ()
mbPlusMinus = () () -> Parser Char -> Parser ()
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy (String -> Char -> Bool
A.inClass String
"+-") Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()

parseFloat :: ByteString -> TokenizerM Text
parseFloat :: ByteString -> TokenizerM Text
parseFloat ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either String (ByteString, ())
forall a. Parser a -> ByteString -> Either String a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pFloat) ByteString
inp of
       Left String
_      -> TokenizerM Text
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r)  -- assumes all ascii
  where pFloat :: A.Parser ()
        pFloat :: Parser ()
pFloat = do
          let digits :: Parser ByteString ByteString
digits = (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 (String -> Char -> Bool
A.inClass String
"0-9")
          Parser ()
mbPlusMinus
          Bool
before <- Bool -> Parser ByteString Bool -> Parser ByteString Bool
forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False (Parser ByteString Bool -> Parser ByteString Bool)
-> Parser ByteString Bool -> Parser ByteString Bool
forall a b. (a -> b) -> a -> b
$ Bool
True Bool -> Parser ByteString ByteString -> Parser ByteString Bool
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Parser ByteString ByteString
digits
          Bool
dot <- Bool -> Parser ByteString Bool -> Parser ByteString Bool
forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False (Parser ByteString Bool -> Parser ByteString Bool)
-> Parser ByteString Bool -> Parser ByteString Bool
forall a b. (a -> b) -> a -> b
$ Bool
True Bool -> Parser Char -> Parser ByteString Bool
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy (String -> Char -> Bool
A.inClass String
".")
          Bool
after <- Bool -> Parser ByteString Bool -> Parser ByteString Bool
forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False (Parser ByteString Bool -> Parser ByteString Bool)
-> Parser ByteString Bool -> Parser ByteString Bool
forall a b. (a -> b) -> a -> b
$ Bool
True Bool -> Parser ByteString ByteString -> Parser ByteString Bool
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Parser ByteString ByteString
digits
          Bool
e <- Bool -> Parser ByteString Bool -> Parser ByteString Bool
forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False (Parser ByteString Bool -> Parser ByteString Bool)
-> Parser ByteString Bool -> Parser ByteString Bool
forall a b. (a -> b) -> a -> b
$ Bool
True Bool -> Parser ByteString ByteString -> Parser ByteString Bool
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ ((Char -> Bool) -> Parser Char
A.satisfy (String -> Char -> Bool
A.inClass String
"Ee") Parser Char -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
                                         Parser ()
mbPlusMinus Parser ()
-> Parser ByteString ByteString -> Parser ByteString ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString ByteString
digits)
          Maybe Char
mbnext <- Parser (Maybe Char)
A.peekChar
          case Maybe Char
mbnext of
               Maybe Char
Nothing -> () -> Parser ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
               Just Char
c  -> Bool -> Parser ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ String -> Char -> Bool
A.inClass String
"." Char
c)
          Bool -> Parser ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Parser ()) -> Bool -> Parser ()
forall a b. (a -> b) -> a -> b
$ (Bool
before Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
dot Bool -> Bool -> Bool
&& Bool
e)     -- 5e2
               Bool -> Bool -> Bool
|| (Bool
before Bool -> Bool -> Bool
&& Bool
dot Bool -> Bool -> Bool
&& (Bool
after Bool -> Bool -> Bool
|| Bool -> Bool
not Bool
e)) -- 5.2e2 or 5.2 or 5.
               Bool -> Bool -> Bool
|| (Bool -> Bool
not Bool
before Bool -> Bool -> Bool
&& Bool
dot Bool -> Bool -> Bool
&& Bool
after) -- .23 or .23e2