{-# OPTIONS_GHC -fno-warn-missing-methods #-}
{-# LANGUAGE StrictData            #-}
{-# LANGUAGE BangPatterns          #-}
{-# LANGUAGE CPP                   #-}
{-# LANGUAGE DeriveFunctor         #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE StandaloneDeriving    #-}
{-# LANGUAGE TypeSynonymInstances  #-}
module Skylighting.Tokenizer (
    tokenize
  , TokenizerConfig(..)
  ) where

import Control.Applicative
import Control.Monad
import Control.Monad.Except
import Control.Monad.Reader
import Control.Monad.State.Strict
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.UTF8 as UTF8
import Data.CaseInsensitive (mk)
import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isSpace, ord)
import qualified Data.Map as Map
import qualified Data.IntMap as IntMap
import Data.Maybe (catMaybes)
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Debug.Trace
import Skylighting.Regex
import Skylighting.Types
import Skylighting.Parser (resolveKeywords)
import Data.List.NonEmpty (NonEmpty((:|)), (<|), toList)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup
#endif

newtype Captures = Captures{ Captures -> IntMap ByteString
unCaptures :: IntMap.IntMap ByteString }
  deriving (Int -> Captures -> ShowS
[Captures] -> ShowS
Captures -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Captures] -> ShowS
$cshowList :: [Captures] -> ShowS
show :: Captures -> [Char]
$cshow :: Captures -> [Char]
showsPrec :: Int -> Captures -> ShowS
$cshowsPrec :: Int -> Captures -> ShowS
Show)

newtype ContextStack =
  ContextStack{ ContextStack -> NonEmpty (Context, Captures)
unContextStack :: NonEmpty (Context, Captures) }
  deriving (Int -> ContextStack -> ShowS
[ContextStack] -> ShowS
ContextStack -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [ContextStack] -> ShowS
$cshowList :: [ContextStack] -> ShowS
show :: ContextStack -> [Char]
$cshow :: ContextStack -> [Char]
showsPrec :: Int -> ContextStack -> ShowS
$cshowsPrec :: Int -> ContextStack -> ShowS
Show)

data TokenizerState = TokenizerState{
    TokenizerState -> ByteString
input               :: ByteString
  , TokenizerState -> Bool
endline             :: Bool
  , TokenizerState -> Char
prevChar            :: Char
  , TokenizerState -> ContextStack
contextStack        :: ContextStack
  , TokenizerState -> Captures
captures            :: Captures
  , TokenizerState -> Int
column              :: Int
  , TokenizerState -> Bool
lineContinuation    :: Bool
  , TokenizerState -> Maybe Int
firstNonspaceColumn :: Maybe Int
  , TokenizerState -> Map RE Regex
compiledRegexes     :: Map.Map RE Regex
}

-- | Configuration options for 'tokenize'.
data TokenizerConfig = TokenizerConfig{
    TokenizerConfig -> SyntaxMap
syntaxMap   :: SyntaxMap  -- ^ Syntax map to use
  , TokenizerConfig -> Bool
traceOutput :: Bool       -- ^ Generate trace output for debugging
} deriving (Int -> TokenizerConfig -> ShowS
[TokenizerConfig] -> ShowS
TokenizerConfig -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [TokenizerConfig] -> ShowS
$cshowList :: [TokenizerConfig] -> ShowS
show :: TokenizerConfig -> [Char]
$cshow :: TokenizerConfig -> [Char]
showsPrec :: Int -> TokenizerConfig -> ShowS
$cshowsPrec :: Int -> TokenizerConfig -> ShowS
Show)

data Result e a = Success a
                | Failure
                | Error e
     deriving (forall a b. a -> Result e b -> Result e a
forall a b. (a -> b) -> Result e a -> Result e b
forall e a b. a -> Result e b -> Result e a
forall e a b. (a -> b) -> Result e a -> Result e b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
<$ :: forall a b. a -> Result e b -> Result e a
$c<$ :: forall e a b. a -> Result e b -> Result e a
fmap :: forall a b. (a -> b) -> Result e a -> Result e b
$cfmap :: forall e a b. (a -> b) -> Result e a -> Result e b
Functor)

deriving instance (Show a, Show e) => Show (Result e a)

newtype TokenizerM a = TM { forall a.
TokenizerM a
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result [Char] a)
runTokenizerM :: TokenizerConfig
                                          -> TokenizerState
                                          -> (TokenizerState, Result String a) }

mapsnd :: (a -> b) -> (c, a) -> (c, b)
mapsnd :: forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd a -> b
f (c
x, a
y) = (c
x, a -> b
f a
y)

instance Functor TokenizerM where
  fmap :: forall a b. (a -> b) -> TokenizerM a -> TokenizerM b
fmap a -> b
f (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
g) = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
f) (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
g TokenizerConfig
c TokenizerState
s))

instance Applicative TokenizerM where
  pure :: forall a. a -> TokenizerM a
pure a
x = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, forall e a. a -> Result e a
Success a
x))
  (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] (a -> b))
f) <*> :: forall a b. TokenizerM (a -> b) -> TokenizerM a -> TokenizerM b
<*> (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y) = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                           case (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] (a -> b))
f TokenizerConfig
c TokenizerState
s) of
                              (TokenizerState
s', Result [Char] (a -> b)
Failure   ) -> (TokenizerState
s', forall e a. Result e a
Failure)
                              (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', forall e a. e -> Result e a
Error [Char]
e)
                              (TokenizerState
s', Success a -> b
f') ->
                                  case (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y TokenizerConfig
c TokenizerState
s') of
                                    (TokenizerState
s'', Result [Char] a
Failure   ) -> (TokenizerState
s'', forall e a. Result e a
Failure)
                                    (TokenizerState
s'', Error [Char]
e'  ) -> (TokenizerState
s'', forall e a. e -> Result e a
Error [Char]
e')
                                    (TokenizerState
s'', Success a
y') -> (TokenizerState
s'', forall e a. a -> Result e a
Success (a -> b
f' a
y')))


instance Monad TokenizerM where
  return :: forall a. a -> TokenizerM a
return = forall (f :: * -> *) a. Applicative f => a -> f a
pure
  (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) >>= :: forall a b. TokenizerM a -> (a -> TokenizerM b) -> TokenizerM b
>>= a -> TokenizerM b
f = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                       case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                            (TokenizerState
s', Result [Char] a
Failure   ) -> (TokenizerState
s', forall e a. Result e a
Failure)
                            (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', forall e a. e -> Result e a
Error [Char]
e)
                            (TokenizerState
s', Success a
x') -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] b)
g TokenizerConfig
c TokenizerState
s'
                              where TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] b)
g = a -> TokenizerM b
f a
x')

instance Alternative TokenizerM where
  empty :: forall a. TokenizerM a
empty = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, forall e a. Result e a
Failure))
  <|> :: forall a. TokenizerM a -> TokenizerM a -> TokenizerM a
(<|>) (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y) = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                           case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                                (TokenizerState
_, Result [Char] a
Failure   )  -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y TokenizerConfig
c TokenizerState
s
                                (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', forall e a. e -> Result e a
Error [Char]
e)
                                (TokenizerState
s', Success a
x') -> (TokenizerState
s', forall e a. a -> Result e a
Success a
x'))
  many :: forall a. TokenizerM a -> TokenizerM [a]
many (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                    case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                       (TokenizerState
_, Result [Char] a
Failure   )  -> (TokenizerState
s, forall e a. a -> Result e a
Success [])
                       (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', forall e a. e -> Result e a
Error [Char]
e)
                       (TokenizerState
s', Success a
x') -> forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (a
x'forall a. a -> [a] -> [a]
:)) (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] [a])
g TokenizerConfig
c TokenizerState
s')
                         where TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] [a])
g = forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x))
  some :: forall a. TokenizerM a -> TokenizerM [a]
some TokenizerM a
x = (:) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TokenizerM a
x forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> forall (f :: * -> *) a. Alternative f => f a -> f [a]
many TokenizerM a
x

instance MonadPlus TokenizerM where
  mzero :: forall a. TokenizerM a
mzero = forall (f :: * -> *) a. Alternative f => f a
empty
  mplus :: forall a. TokenizerM a -> TokenizerM a -> TokenizerM a
mplus = forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
(<|>)

instance MonadReader TokenizerConfig TokenizerM where
  ask :: TokenizerM TokenizerConfig
ask = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> (TokenizerState
s, forall e a. a -> Result e a
Success TokenizerConfig
c))
  local :: forall a.
(TokenizerConfig -> TokenizerConfig)
-> TokenizerM a -> TokenizerM a
local TokenizerConfig -> TokenizerConfig
f (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x forall b c a. (b -> c) -> (a -> b) -> a -> c
. TokenizerConfig -> TokenizerConfig
f)

instance MonadState TokenizerState TokenizerM where
  get :: TokenizerM TokenizerState
get = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, forall e a. a -> Result e a
Success TokenizerState
s))
  put :: TokenizerState -> TokenizerM ()
put TokenizerState
x = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
_ -> (TokenizerState
x, forall e a. a -> Result e a
Success ()))

instance MonadError String TokenizerM where
  throwError :: forall a. [Char] -> TokenizerM a
throwError [Char]
e = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, forall e a. e -> Result e a
Error [Char]
e))
  catchError :: forall a. TokenizerM a -> ([Char] -> TokenizerM a) -> TokenizerM a
catchError (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) [Char] -> TokenizerM a
f = forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                                      (TokenizerState
_, Error [Char]
e) -> let TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y = [Char] -> TokenizerM a
f [Char]
e in TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y TokenizerConfig
c TokenizerState
s
                                      (TokenizerState, Result [Char] a)
z            -> (TokenizerState, Result [Char] a)
z)

-- | Tokenize some text using 'Syntax'.
tokenize :: TokenizerConfig -> Syntax -> Text -> Either String [SourceLine]
tokenize :: TokenizerConfig -> Syntax -> Text -> Either [Char] [SourceLine]
tokenize TokenizerConfig
config Syntax
syntax Text
inp =
  Either [Char] ContextStack
eitherStack forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \(!ContextStack
stack) ->
    case forall a.
TokenizerM a
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result [Char] a)
runTokenizerM TokenizerM [SourceLine]
action
            TokenizerConfig
config{ syntaxMap :: SyntaxMap
syntaxMap = forall a b k. (a -> b) -> Map k a -> Map k b
Map.map (SyntaxMap -> Syntax -> Syntax
resolveKeywords (TokenizerConfig -> SyntaxMap
syntaxMap TokenizerConfig
config))
                                          (TokenizerConfig -> SyntaxMap
syntaxMap TokenizerConfig
config) }
            (ContextStack -> TokenizerState
startingState ContextStack
stack) of
       (TokenizerState
_, Success [SourceLine]
ls) -> forall a b. b -> Either a b
Right [SourceLine]
ls
       (TokenizerState
_, Error [Char]
e)    -> forall a b. a -> Either a b
Left [Char]
e
       (TokenizerState
_, Result [Char] [SourceLine]
Failure)    -> forall a b. a -> Either a b
Left [Char]
"Could not tokenize code"
  where
    action :: TokenizerM [SourceLine]
action = forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (ByteString, Int) -> TokenizerM SourceLine
tokenizeLine (forall a b. [a] -> [b] -> [(a, b)]
zip (ByteString -> [ByteString]
BS.lines (Text -> ByteString
encodeUtf8 Text
inp)) [Int
1..])
    eitherStack :: Either [Char] ContextStack
eitherStack = case Text -> Syntax -> Maybe Context
lookupContext (Syntax -> Text
sStartingContext Syntax
syntax)
                         (SyntaxMap -> Syntax -> Syntax
resolveKeywords (TokenizerConfig -> SyntaxMap
syntaxMap TokenizerConfig
config) Syntax
syntax) of
                    Just Context
c  -> forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ NonEmpty (Context, Captures) -> ContextStack
ContextStack ((Context
c, IntMap ByteString -> Captures
Captures forall a. Monoid a => a
mempty) forall a. a -> [a] -> NonEmpty a
:| [])
                    Maybe Context
Nothing -> forall a b. a -> Either a b
Left [Char]
"No starting context specified"
    startingState :: ContextStack -> TokenizerState
startingState ContextStack
stack =
      TokenizerState{ input :: ByteString
input = ByteString
BS.empty
                    , endline :: Bool
endline = Text -> Bool
Text.null Text
inp
                    , prevChar :: Char
prevChar = Char
'\n'
                    , contextStack :: ContextStack
contextStack = ContextStack
stack
                    , captures :: Captures
captures = IntMap ByteString -> Captures
Captures forall a. Monoid a => a
mempty
                    , column :: Int
column = Int
0
                    , lineContinuation :: Bool
lineContinuation = Bool
False
                    , firstNonspaceColumn :: Maybe Int
firstNonspaceColumn = forall a. Maybe a
Nothing
                    , compiledRegexes :: Map RE Regex
compiledRegexes = forall k a. Map k a
Map.empty
                    }

info :: String -> TokenizerM ()
info :: [Char] -> TokenizerM ()
info [Char]
s = do
  Bool
tr <- forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> Bool
traceOutput
  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
tr forall a b. (a -> b) -> a -> b
$ forall a. [Char] -> a -> a
trace [Char]
s (forall (m :: * -> *) a. Monad m => a -> m a
return ())

infoContextStack :: TokenizerM ()
infoContextStack :: TokenizerM ()
infoContextStack = do
  Bool
tr <- forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> Bool
traceOutput
  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
tr forall a b. (a -> b) -> a -> b
$ do
    ContextStack NonEmpty (Context, Captures)
stack <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
    [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"CONTEXT STACK " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall a b. (a -> b) -> [a] -> [b]
map (Context -> Text
cName forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst) forall a b. (a -> b) -> a -> b
$ forall a. NonEmpty a -> [a]
toList NonEmpty (Context, Captures)
stack)

popContextStack :: TokenizerM ()
popContextStack :: TokenizerM ()
popContextStack = do
  ContextStack NonEmpty (Context, Captures)
cs <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  case NonEmpty (Context, Captures)
cs of
       ((Context, Captures)
_ :| []) -> [Char] -> TokenizerM ()
info [Char]
"WARNING: Tried to pop only element on context stack!"
       ((Context, Captures)
_ :| ((Context, Captures)
x:[(Context, Captures)]
xs)) -> do
         forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (\TokenizerState
st -> TokenizerState
st{ contextStack :: ContextStack
contextStack = NonEmpty (Context, Captures) -> ContextStack
ContextStack ((Context, Captures)
x forall a. a -> [a] -> NonEmpty a
:| [(Context, Captures)]
xs) })
         TokenizerM ()
infoContextStack

pushContextStack :: Context -> TokenizerM ()
pushContextStack :: Context -> TokenizerM ()
pushContextStack Context
cont = do
  forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (\TokenizerState
st -> TokenizerState
st{ contextStack :: ContextStack
contextStack =
                      NonEmpty (Context, Captures) -> ContextStack
ContextStack
                       (((Context
cont, IntMap ByteString -> Captures
Captures forall a. Monoid a => a
mempty) forall a. a -> NonEmpty a -> NonEmpty a
<|) forall b c a. (b -> c) -> (a -> b) -> a -> c
. ContextStack -> NonEmpty (Context, Captures)
unContextStack
                         forall a b. (a -> b) -> a -> b
$ TokenizerState -> ContextStack
contextStack TokenizerState
st) } )
  TokenizerM ()
infoContextStack

currentContext :: TokenizerM Context
currentContext :: TokenizerM Context
currentContext = do
  ContextStack ((Context
c,Captures
_) :| [(Context, Captures)]
_) <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  forall (m :: * -> *) a. Monad m => a -> m a
return Context
c

doContextSwitch :: ContextSwitch -> TokenizerM ()
doContextSwitch :: ContextSwitch -> TokenizerM ()
doContextSwitch ContextSwitch
Pop = TokenizerM ()
popContextStack
doContextSwitch (Push (!Text
syn,!Text
c)) = do
  SyntaxMap
syntaxes <- forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> SyntaxMap
syntaxMap
  case forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
syn SyntaxMap
syntaxes forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Text -> Syntax -> Maybe Context
lookupContext Text
c of
       Just !Context
con -> Context -> TokenizerM ()
pushContextStack Context
con
       Maybe Context
Nothing   -> forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError forall a b. (a -> b) -> a -> b
$ [Char]
"Unknown syntax or context: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (Text
syn, Text
c)

doContextSwitches :: [ContextSwitch] -> TokenizerM ()
doContextSwitches :: [ContextSwitch] -> TokenizerM ()
doContextSwitches = forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ ContextSwitch -> TokenizerM ()
doContextSwitch

addCaptures :: TokenizerM ()
addCaptures :: TokenizerM ()
addCaptures = do
  Captures
capts <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Captures
captures
  if forall a. IntMap a -> Bool
IntMap.null (Captures -> IntMap ByteString
unCaptures Captures
capts)
     then forall (m :: * -> *) a. Monad m => a -> m a
return ()
     else do
       ContextStack ((Context
c,Captures
_) :| [(Context, Captures)]
cs) <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
       [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"Adding captures to " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> [Char]
show (Context -> Text
cName Context
c) forall a. Semigroup a => a -> a -> a
<> [Char]
": " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> [Char]
show Captures
capts
       forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ contextStack :: ContextStack
contextStack = NonEmpty (Context, Captures) -> ContextStack
ContextStack ((Context
c,Captures
capts) forall a. a -> [a] -> NonEmpty a
:| [(Context, Captures)]
cs) }

getCapture :: Int -> TokenizerM Text
getCapture :: Int -> TokenizerM Text
getCapture Int
capnum = do
  ContextStack ((Context
_,Captures IntMap ByteString
capts) :| [(Context, Captures)]
_) <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"Retrieving capture " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> [Char]
show Int
capnum
  Text
res <- case forall a. Int -> IntMap a -> Maybe a
IntMap.lookup Int
capnum IntMap ByteString
capts of
          Maybe ByteString
Nothing -> do
            [Char] -> TokenizerM ()
info [Char]
"Not found"
            forall (m :: * -> *) a. MonadPlus m => m a
mzero
          Just ByteString
x  -> ByteString -> TokenizerM Text
decodeBS ByteString
x
  [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"Got " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> [Char]
show Text
res
  forall (m :: * -> *) a. Monad m => a -> m a
return Text
res

lookupContext :: Text -> Syntax -> Maybe Context
lookupContext :: Text -> Syntax -> Maybe Context
lookupContext Text
name Syntax
syntax | Text -> Bool
Text.null Text
name =
  if Text -> Bool
Text.null (Syntax -> Text
sStartingContext Syntax
syntax)
     then forall a. Maybe a
Nothing
     else Text -> Syntax -> Maybe Context
lookupContext (Syntax -> Text
sStartingContext Syntax
syntax) Syntax
syntax
lookupContext Text
name Syntax
syntax = forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
name forall a b. (a -> b) -> a -> b
$ Syntax -> Map Text Context
sContexts Syntax
syntax

tokenizeLine :: (ByteString, Int) -> TokenizerM [Token]
tokenizeLine :: (ByteString, Int) -> TokenizerM SourceLine
tokenizeLine (!ByteString
ln, !Int
linenum) = do
  forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input :: ByteString
input = ByteString
ln, endline :: Bool
endline = ByteString -> Bool
BS.null ByteString
ln, prevChar :: Char
prevChar = Char
'\n' }
  Context
cur <- TokenizerM Context
currentContext
  Bool
lineCont <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
lineContinuation
  if Bool
lineCont
     then forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ lineContinuation :: Bool
lineContinuation = Bool
False }
     else do
       let !mbFirstNonspace :: Maybe Int
mbFirstNonspace = (Char -> Bool) -> ByteString -> Maybe Int
BS.findIndex (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) forall a b. (a -> b) -> a -> b
$! ByteString
ln
       forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ column :: Int
column = Int
0
                         , firstNonspaceColumn :: Maybe Int
firstNonspaceColumn = Maybe Int
mbFirstNonspace }
       [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineBeginContext Context
cur)
  if ByteString -> Bool
BS.null ByteString
ln
     then [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineEmptyContext Context
cur)
     else [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineBeginContext Context
cur)
  SourceLine
ts <- SourceLine -> SourceLine
normalizeHighlighting forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. [Maybe a] -> [a]
catMaybes forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (f :: * -> *) a. Alternative f => f a -> f [a]
many TokenizerM (Maybe (TokenType, Text))
getToken
  Bool
eol <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline
  if Bool
eol
     then do
       TokenizerM Context
currentContext forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Context -> TokenizerM ()
checkLineEnd
       forall (m :: * -> *) a. Monad m => a -> m a
return SourceLine
ts
     else do  -- fail if we haven't consumed whole line
       Int
col <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column
       forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError forall a b. (a -> b) -> a -> b
$ [Char]
"Could not match anything at line " forall a. [a] -> [a] -> [a]
++
         forall a. Show a => a -> [Char]
show Int
linenum forall a. [a] -> [a] -> [a]
++ [Char]
" column " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
col

getToken :: TokenizerM (Maybe Token)
getToken :: TokenizerM (Maybe (TokenType, Text))
getToken = do
  ByteString
inp <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not
  !Context
context <- TokenizerM Context
currentContext
  forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum (forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
r ByteString
inp) (Context -> [Rule]
cRules Context
context)) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
     case Context -> [ContextSwitch]
cFallthroughContext Context
context of
           [] | Context -> Bool
cFallthrough Context
context -> forall a. Maybe a
Nothing forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ [ContextSwitch] -> TokenizerM ()
doContextSwitches [ContextSwitch
Pop]
              | Bool
otherwise -> do
                  Text
t <- TokenizerM Text
normalChunk
                  let mbtok :: Maybe (TokenType, Text)
mbtok = forall a. a -> Maybe a
Just (Context -> TokenType
cAttribute Context
context, Text
t)
                  [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"FALLTHROUGH " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Maybe (TokenType, Text)
mbtok
                  forall (m :: * -> *) a. Monad m => a -> m a
return Maybe (TokenType, Text)
mbtok
           [ContextSwitch]
cs -> forall a. Maybe a
Nothing forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ [ContextSwitch] -> TokenizerM ()
doContextSwitches [ContextSwitch]
cs

takeChars :: Int -> TokenizerM Text
takeChars :: Int -> TokenizerM Text
takeChars Int
0 = forall (m :: * -> *) a. MonadPlus m => m a
mzero
takeChars Int
numchars = do
  ByteString
inp <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  let (ByteString
bs,ByteString
rest) = Int -> ByteString -> (ByteString, ByteString)
UTF8.splitAt Int
numchars ByteString
inp
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (ByteString -> Bool
BS.null ByteString
bs)
  !Text
t <- ByteString -> TokenizerM Text
decodeBS ByteString
bs
  forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input :: ByteString
input = ByteString
rest,
                      endline :: Bool
endline = ByteString -> Bool
BS.null ByteString
rest,
                      prevChar :: Char
prevChar = Text -> Char
Text.last Text
t,
                      column :: Int
column = TokenizerState -> Int
column TokenizerState
st forall a. Num a => a -> a -> a
+ Int
numchars }
  forall (m :: * -> *) a. Monad m => a -> m a
return Text
t

tryRule :: Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule :: Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
_    ByteString
""  = forall (m :: * -> *) a. MonadPlus m => m a
mzero
tryRule Rule
rule ByteString
inp = do
  [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"Trying rule " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Rule
rule
  case Rule -> Maybe Int
rColumn Rule
rule of
       Maybe Int
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
       Just Int
n  -> forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall b c a. (b -> c) -> (a -> b) -> a -> c
. (forall a. Eq a => a -> a -> Bool
== Int
n)

  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Rule -> Bool
rFirstNonspace Rule
rule) forall a b. (a -> b) -> a -> b
$ do
    !Maybe Int
firstNonspace <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Maybe Int
firstNonspaceColumn
    !Int
col <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column
    forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Maybe Int
firstNonspace forall a. Eq a => a -> a -> Bool
== forall a. a -> Maybe a
Just Int
col)

  Maybe TokenizerState
oldstate <- if Rule -> Bool
rLookahead Rule
rule
                 then forall a. a -> Maybe a
Just forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall s (m :: * -> *). MonadState s m => m s
get -- needed for lookahead rules
                 else forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing

  -- reset regex captures
  forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ captures :: Captures
captures = IntMap ByteString -> Captures
Captures forall a. Monoid a => a
mempty }

  let attr :: TokenType
attr = Rule -> TokenType
rAttribute Rule
rule
  Maybe (TokenType, Text)
mbtok <- case Rule -> Matcher
rMatcher Rule
rule of
                DetectChar Char
c -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ Bool -> Char -> ByteString -> TokenizerM Text
detectChar (Rule -> Bool
rDynamic Rule
rule) Char
c ByteString
inp
                Detect2Chars Char
c Char
d -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$
                                      Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars (Rule -> Bool
rDynamic Rule
rule) Char
c Char
d ByteString
inp
                AnyChar Set Char
cs -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ Set Char -> ByteString -> TokenizerM Text
anyChar Set Char
cs ByteString
inp
                RangeDetect Char
c Char
d -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ Char -> Char -> ByteString -> TokenizerM Text
rangeDetect Char
c Char
d ByteString
inp
                RegExpr RE
re -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ Bool -> RE -> ByteString -> TokenizerM Text
regExpr (Rule -> Bool
rDynamic Rule
rule) RE
re ByteString
inp
                Matcher
Int -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseInt ByteString
inp
                Matcher
HlCOct -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseOct ByteString
inp
                Matcher
HlCHex -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseHex ByteString
inp
                Matcher
HlCStringChar -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseCStringChar ByteString
inp
                Matcher
HlCChar -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseCChar ByteString
inp
                Matcher
Float -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseFloat ByteString
inp
                Keyword KeywordAttr
_kwattr (Left Text
listname) ->
                  forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError forall a b. (a -> b) -> a -> b
$ [Char]
"Keyword with unresolved list " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> [Char]
show Text
listname
                Keyword KeywordAttr
kwattr (Right WordSet Text
kws) ->
                  TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword KeywordAttr
kwattr WordSet Text
kws ByteString
inp
                StringDetect Text
s -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$
                                    Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect (Rule -> Bool
rDynamic Rule
rule) (Rule -> Bool
rCaseSensitive Rule
rule)
                                                 Text
s ByteString
inp
                WordDetect Text
s -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$
                                    Bool -> Text -> ByteString -> TokenizerM Text
wordDetect (Rule -> Bool
rCaseSensitive Rule
rule) Text
s ByteString
inp
                Matcher
LineContinue -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
lineContinue ByteString
inp
                Matcher
DetectSpaces -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
detectSpaces ByteString
inp
                Matcher
DetectIdentifier -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
detectIdentifier ByteString
inp
                IncludeRules (Text, Text)
cname -> Maybe TokenType
-> (Text, Text)
-> ByteString
-> TokenizerM (Maybe (TokenType, Text))
includeRules
                   (if Rule -> Bool
rIncludeAttribute Rule
rule then forall a. a -> Maybe a
Just TokenType
attr else forall a. Maybe a
Nothing)
                   (Text, Text)
cname ByteString
inp
  Maybe (TokenType, Text)
mbchildren <- do
    ByteString
inp' <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
    forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum (forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
r ByteString
inp') (Rule -> [Rule]
rChildren Rule
rule)) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing

  Maybe (TokenType, Text)
mbtok' <- case Maybe (TokenType, Text)
mbtok of
                 Maybe (TokenType, Text)
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
                 Just (TokenType
tt, Text
s)
                   | Rule -> Bool
rLookahead Rule
rule -> do
                     (ByteString
oldinput, Bool
oldendline, Char
oldprevChar, Int
oldColumn) <-
                         case Maybe TokenizerState
oldstate of
                              Maybe TokenizerState
Nothing -> forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError
                                    [Char]
"oldstate not saved with lookahead rule"
                              Just TokenizerState
st -> forall (m :: * -> *) a. Monad m => a -> m a
return
                                    (TokenizerState -> ByteString
input TokenizerState
st, TokenizerState -> Bool
endline TokenizerState
st,
                                     TokenizerState -> Char
prevChar TokenizerState
st, TokenizerState -> Int
column TokenizerState
st)
                     forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input :: ByteString
input = ByteString
oldinput
                                       , endline :: Bool
endline = Bool
oldendline
                                       , prevChar :: Char
prevChar = Char
oldprevChar
                                       , column :: Int
column = Int
oldColumn }
                     forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
                   | Bool
otherwise -> do
                     case Maybe (TokenType, Text)
mbchildren of
                          Maybe (TokenType, Text)
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just (TokenType
tt, Text
s)
                          Just (TokenType
_, Text
cresult) -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just (TokenType
tt, Text
s forall a. Semigroup a => a -> a -> a
<> Text
cresult)

  [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ forall a. (a -> Bool) -> [a] -> [a]
takeWhile (forall a. Eq a => a -> a -> Bool
/=Char
' ') (forall a. Show a => a -> [Char]
show (Rule -> Matcher
rMatcher Rule
rule)) forall a. [a] -> [a] -> [a]
++ [Char]
" MATCHED " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Maybe (TokenType, Text)
mbtok'
  [ContextSwitch] -> TokenizerM ()
doContextSwitches (Rule -> [ContextSwitch]
rContextSwitch Rule
rule)
  -- Add any captures to the context on top of the stack
  TokenizerM ()
addCaptures
  forall (m :: * -> *) a. Monad m => a -> m a
return Maybe (TokenType, Text)
mbtok'


withAttr :: TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr :: TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
tt TokenizerM Text
p = do
  Text
res <- TokenizerM Text
p
  if Text -> Bool
Text.null Text
res
     then forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
     else forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just (TokenType
tt, Text
res)

wordDetect :: Bool -> Text -> ByteString -> TokenizerM Text
wordDetect :: Bool -> Text -> ByteString -> TokenizerM Text
wordDetect Bool
caseSensitive Text
s ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  Text
t <- ByteString -> TokenizerM Text
decodeBS forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
UTF8.take (Text -> Int
Text.length Text
s) ByteString
inp
  -- we assume here that the case fold will not change length,
  -- which is safe for ASCII keywords and the like...
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ if Bool
caseSensitive
             then Text
s forall a. Eq a => a -> a -> Bool
== Text
t
             else forall s. FoldCase s => s -> CI s
mk Text
s forall a. Eq a => a -> a -> Bool
== forall s. FoldCase s => s -> CI s
mk Text
t
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Text -> Bool
Text.null Text
t)
  let c :: Char
c = Text -> Char
Text.last Text
t
  let rest :: ByteString
rest = Int -> ByteString -> ByteString
UTF8.drop (Text -> Int
Text.length Text
s) ByteString
inp
  let d :: Char
d = case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
rest of
               Maybe (Char, ByteString)
Nothing    -> Char
'\n'
               Just (Char
x,ByteString
_) -> Char
x
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Char -> Char -> Bool
isWordBoundary Char
c Char
d
  Int -> TokenizerM Text
takeChars (Text -> Int
Text.length Text
t)

stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect Bool
dynamic Bool
caseSensitive Text
s ByteString
inp = do
  Text
s' <- if Bool
dynamic
        then do
          Text
dynStr <- Text -> TokenizerM Text
subDynamicText Text
s
          [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"Dynamic string: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Text
dynStr
          forall (m :: * -> *) a. Monad m => a -> m a
return Text
dynStr
        else forall (m :: * -> *) a. Monad m => a -> m a
return Text
s
  Text
t <- ByteString -> TokenizerM Text
decodeBS forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
UTF8.take (Text -> Int
Text.length Text
s') ByteString
inp
  -- we assume here that the case fold will not change length,
  -- which is safe for ASCII keywords and the like...
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ if Bool
caseSensitive
             then Text
s' forall a. Eq a => a -> a -> Bool
== Text
t
             else forall s. FoldCase s => s -> CI s
mk Text
s' forall a. Eq a => a -> a -> Bool
== forall s. FoldCase s => s -> CI s
mk Text
t
  Int -> TokenizerM Text
takeChars (Text -> Int
Text.length Text
s')

subDynamicText :: Text -> TokenizerM Text
subDynamicText :: Text -> TokenizerM Text
subDynamicText Text
t = do
  let substitute :: Text -> TokenizerM Text
substitute Text
x = case Text -> Maybe (Char, Text)
Text.uncons Text
x of
        Just (Char
c, Text
rest) | Char -> Bool
isDigit Char
c -> let capNum :: Int
capNum = Char -> Int
ord Char
c forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
                                      in (forall a. Semigroup a => a -> a -> a
<> Text
rest) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> TokenizerM Text
getCapture Int
capNum
        Maybe (Char, Text)
_ -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Char -> Text -> Text
Text.cons Char
'%' Text
x
  case (Char -> Bool) -> Text -> [Text]
Text.split (forall a. Eq a => a -> a -> Bool
== Char
'%') Text
t of
    []     -> forall (m :: * -> *) a. Monad m => a -> m a
return Text
Text.empty
    Text
x:[Text]
rest -> (Text
x forall a. Semigroup a => a -> a -> a
<>) forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> Text
Text.concat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM Text -> TokenizerM Text
substitute [Text]
rest

-- This assumes that nothing significant will happen
-- in the middle of a string of spaces or a string
-- of alphanumerics.  This seems true  for all normal
-- programming languages, and the optimization speeds
-- things up a lot, relative to just parsing one char.
normalChunk :: TokenizerM Text
normalChunk :: TokenizerM Text
normalChunk = do
  ByteString
inp <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Maybe (Char, ByteString)
Nothing -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
    Just (Char
c, ByteString
_)
      | Char
c forall a. Eq a => a -> a -> Bool
== Char
' ' ->
        let bs :: ByteString
bs = (Char -> Bool) -> ByteString -> ByteString
BS.takeWhile (forall a. Eq a => a -> a -> Bool
==Char
' ') ByteString
inp
        in  Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
bs)
      | Char -> Bool
isAscii Char
c Bool -> Bool -> Bool
&& Char -> Bool
isAlphaNum Char
c ->
        let (ByteString
bs, ByteString
_) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.span Char -> Bool
isAlphaNum ByteString
inp
        in  Int -> TokenizerM Text
takeChars (ByteString -> Int
UTF8.length ByteString
bs)
      | Bool
otherwise -> Int -> TokenizerM Text
takeChars Int
1

includeRules :: Maybe TokenType -> ContextName -> ByteString
             -> TokenizerM (Maybe Token)
includeRules :: Maybe TokenType
-> (Text, Text)
-> ByteString
-> TokenizerM (Maybe (TokenType, Text))
includeRules Maybe TokenType
mbattr (Text
syn, Text
con) ByteString
inp = do
  SyntaxMap
syntaxes <- forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> SyntaxMap
syntaxMap
  case forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
syn SyntaxMap
syntaxes forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Text -> Syntax -> Maybe Context
lookupContext Text
con of
       Maybe Context
Nothing  -> do
          Context
cur <- TokenizerM Context
currentContext
          forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError forall a b. (a -> b) -> a -> b
$ [Char]
"IncludeRules in " forall a. [a] -> [a] -> [a]
++ Text -> [Char]
Text.unpack (Context -> Text
cSyntax Context
cur) forall a. [a] -> [a] -> [a]
++
           [Char]
" requires undefined context " forall a. [a] -> [a] -> [a]
++
           Text -> [Char]
Text.unpack Text
con forall a. [a] -> [a] -> [a]
++ [Char]
"##" forall a. [a] -> [a] -> [a]
++ Text -> [Char]
Text.unpack Text
syn
       Just Context
c   -> do
         Maybe (TokenType, Text)
mbtok <- forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum (forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
r ByteString
inp) (Context -> [Rule]
cRules Context
c))
         forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ captures :: Captures
captures = IntMap ByteString -> Captures
Captures forall a. Monoid a => a
mempty }
         forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case (Maybe (TokenType, Text)
mbtok, Maybe TokenType
mbattr) of
                    (Just (TokenType
NormalTok, Text
xs), Just TokenType
attr) -> forall a. a -> Maybe a
Just (TokenType
attr, Text
xs)
                    (Maybe (TokenType, Text), Maybe TokenType)
_                                 -> Maybe (TokenType, Text)
mbtok

checkLineEnd :: Context -> TokenizerM ()
checkLineEnd :: Context -> TokenizerM ()
checkLineEnd Context
c = do
  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Context -> [ContextSwitch]
cLineEndContext Context
c)) forall a b. (a -> b) -> a -> b
$ do
    Bool
eol <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline
    [Char] -> TokenizerM ()
info forall a b. (a -> b) -> a -> b
$ [Char]
"checkLineEnd for " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (Context -> Text
cName Context
c) forall a. [a] -> [a] -> [a]
++ [Char]
" eol = " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Bool
eol forall a. [a] -> [a] -> [a]
++ [Char]
" cLineEndContext = " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (Context -> [ContextSwitch]
cLineEndContext Context
c)
    forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
eol forall a b. (a -> b) -> a -> b
$ do
      Bool
lineCont' <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
lineContinuation
      forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless Bool
lineCont' forall a b. (a -> b) -> a -> b
$ do
        [ContextSwitch] -> TokenizerM ()
doContextSwitches (Context -> [ContextSwitch]
cLineEndContext Context
c)
        Context
c' <- TokenizerM Context
currentContext
        forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Context
c forall a. Eq a => a -> a -> Bool
== Context
c') forall a b. (a -> b) -> a -> b
$ Context -> TokenizerM ()
checkLineEnd Context
c'

detectChar :: Bool -> Char -> ByteString -> TokenizerM Text
detectChar :: Bool -> Char -> ByteString -> TokenizerM Text
detectChar Bool
dynamic Char
c ByteString
inp = do
  Char
c' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
c
           else forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Just (Char
x,ByteString
_) | Char
x forall a. Eq a => a -> a -> Bool
== Char
c' -> Int -> TokenizerM Text
takeChars Int
1
    Maybe (Char, ByteString)
_          -> forall (m :: * -> *) a. MonadPlus m => m a
mzero

getDynamicChar :: Char -> TokenizerM Char
getDynamicChar :: Char -> TokenizerM Char
getDynamicChar Char
c = do
  let capNum :: Int
capNum = Char -> Int
ord Char
c forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
  Text
res <- Int -> TokenizerM Text
getCapture Int
capNum
  case Text -> Maybe (Char, Text)
Text.uncons Text
res of
       Maybe (Char, Text)
Nothing    -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Just (Char
d,Text
_) -> forall (m :: * -> *) a. Monad m => a -> m a
return Char
d

detect2Chars :: Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars :: Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars Bool
dynamic Char
c Char
d ByteString
inp = do
  Char
c' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
c
           else forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
  Char
d' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
d forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
d forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
d
           else forall (m :: * -> *) a. Monad m => a -> m a
return Char
d
  if (Text -> ByteString
encodeUtf8 ([Char] -> Text
Text.pack [Char
c',Char
d'])) ByteString -> ByteString -> Bool
`BS.isPrefixOf` ByteString
inp
     then Int -> TokenizerM Text
takeChars Int
2
     else forall (m :: * -> *) a. MonadPlus m => m a
mzero

rangeDetect :: Char -> Char -> ByteString -> TokenizerM Text
rangeDetect :: Char -> Char -> ByteString -> TokenizerM Text
rangeDetect Char
c Char
d ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Just (Char
x, ByteString
rest)
      | Char
x forall a. Eq a => a -> a -> Bool
== Char
c -> case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.span (forall a. Eq a => a -> a -> Bool
/= Char
d) ByteString
rest of
                       (ByteString
in_t, ByteString
out_t)
                         | ByteString -> Bool
BS.null ByteString
out_t -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
                         | Bool
otherwise -> do
                              Text
t <- ByteString -> TokenizerM Text
decodeBS ByteString
in_t
                              Int -> TokenizerM Text
takeChars (Text -> Int
Text.length Text
t forall a. Num a => a -> a -> a
+ Int
2)
    Maybe (Char, ByteString)
_ -> forall (m :: * -> *) a. MonadPlus m => m a
mzero

-- NOTE: currently limited to ASCII
detectSpaces :: ByteString -> TokenizerM Text
detectSpaces :: ByteString -> TokenizerM Text
detectSpaces ByteString
inp = do
  case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.span (\Char
c -> Char -> Bool
isSpace Char
c) ByteString
inp of
       (ByteString
t, ByteString
_)
         | ByteString -> Bool
BS.null ByteString
t -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
         | Bool
otherwise -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
t)

-- NOTE: limited to ASCII as per kate documentation
detectIdentifier :: ByteString -> TokenizerM Text
detectIdentifier :: ByteString -> TokenizerM Text
detectIdentifier ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
inp of
    Just (Char
c, ByteString
t) | (Char -> Bool
isAscii Char
c Bool -> Bool -> Bool
&& Char -> Bool
isLetter Char
c) Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'_' ->
      Int -> TokenizerM Text
takeChars forall a b. (a -> b) -> a -> b
$ Int
1 forall a. Num a => a -> a -> a
+ forall b a. b -> (a -> b) -> Maybe a -> b
maybe (ByteString -> Int
BS.length ByteString
t) forall a. a -> a
id
                ((Char -> Bool) -> ByteString -> Maybe Int
BS.findIndex (\Char
d -> Bool -> Bool
not (Char -> Bool
isAscii Char
d) Bool -> Bool -> Bool
||
                                     Bool -> Bool
not (Char -> Bool
isAlphaNum Char
d Bool -> Bool -> Bool
|| Char
d forall a. Eq a => a -> a -> Bool
== Char
'_')) ByteString
t)
    Maybe (Char, ByteString)
_ -> forall (m :: * -> *) a. MonadPlus m => m a
mzero

lineContinue :: ByteString -> TokenizerM Text
lineContinue :: ByteString -> TokenizerM Text
lineContinue ByteString
inp = do
  if ByteString
inp forall a. Eq a => a -> a -> Bool
== ByteString
"\\"
     then do
       forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ lineContinuation :: Bool
lineContinuation = Bool
True }
       Int -> TokenizerM Text
takeChars Int
1
     else forall (m :: * -> *) a. MonadPlus m => m a
mzero

anyChar :: Set.Set Char -> ByteString -> TokenizerM Text
anyChar :: Set Char -> ByteString -> TokenizerM Text
anyChar Set Char
cs ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
     Just (Char
x, ByteString
_) | Char
x forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Char
cs -> Int -> TokenizerM Text
takeChars Int
1
     Maybe (Char, ByteString)
_           -> forall (m :: * -> *) a. MonadPlus m => m a
mzero

regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr Bool
dynamic RE
re ByteString
inp = do
  -- return $! traceShowId $! (reStr, inp)
  let reStr :: ByteString
reStr = RE -> ByteString
reString RE
re
  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int -> ByteString -> ByteString
BS.take Int
2 ByteString
reStr forall a. Eq a => a -> a -> Bool
== ByteString
"\\b") forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  Map RE Regex
compiledREs <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Map RE Regex
compiledRegexes
  Regex
regex <- case forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup RE
re Map RE Regex
compiledREs of
              Maybe Regex
Nothing -> do
                Regex
cre <- case Bool -> ByteString -> Either [Char] Regex
compileRegex (RE -> Bool
reCaseSensitive RE
re) ByteString
reStr of
                         Right Regex
r  -> forall (m :: * -> *) a. Monad m => a -> m a
return Regex
r
                         Left [Char]
e   -> forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError forall a b. (a -> b) -> a -> b
$
                           [Char]
"Error compiling regex " forall a. [a] -> [a] -> [a]
++
                            ByteString -> [Char]
UTF8.toString ByteString
reStr forall a. [a] -> [a] -> [a]
++ [Char]
": " forall a. [a] -> [a] -> [a]
++ [Char]
e
                forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ compiledRegexes :: Map RE Regex
compiledRegexes =
                      forall k a. Ord k => k -> a -> Map k a -> Map k a
Map.insert RE
re Regex
cre (TokenizerState -> Map RE Regex
compiledRegexes TokenizerState
st) }
                forall (m :: * -> *) a. Monad m => a -> m a
return Regex
cre
              Just Regex
cre -> forall (m :: * -> *) a. Monad m => a -> m a
return Regex
cre
  Regex
regex' <- if Bool
dynamic
               then Regex -> TokenizerM Regex
subDynamic Regex
regex
               else forall (m :: * -> *) a. Monad m => a -> m a
return Regex
regex
  case Regex -> ByteString -> Maybe (ByteString, IntMap (Int, Int))
matchRegex Regex
regex' ByteString
inp of
        Just (ByteString
matchedBytes, IntMap (Int, Int)
capts) -> do
          forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (forall (t :: * -> *) a. Foldable t => t a -> Bool
null IntMap (Int, Int)
capts) forall a b. (a -> b) -> a -> b
$
            forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ captures :: Captures
captures = IntMap ByteString -> Captures
Captures forall a b. (a -> b) -> a -> b
$
                                  forall a b. (a -> b) -> IntMap a -> IntMap b
IntMap.map (ByteString -> (Int, Int) -> ByteString
toSlice ByteString
inp) IntMap (Int, Int)
capts }
          Int -> TokenizerM Text
takeChars (ByteString -> Int
UTF8.length ByteString
matchedBytes)
        Maybe (ByteString, IntMap (Int, Int))
_ -> forall (m :: * -> *) a. MonadPlus m => m a
mzero

toSlice :: ByteString -> (Int, Int) -> ByteString
toSlice :: ByteString -> (Int, Int) -> ByteString
toSlice ByteString
bs (Int
off, Int
len) = Int -> ByteString -> ByteString
BS.take Int
len forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
BS.drop Int
off ByteString
bs

wordBoundary :: ByteString -> TokenizerM ()
wordBoundary :: ByteString -> TokenizerM ()
wordBoundary ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
       Maybe (Char, ByteString)
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
       Just (Char
d, ByteString
_) -> do
         Char
c <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Char
prevChar
         forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Char -> Char -> Bool
isWordBoundary Char
c Char
d

isWordBoundary :: Char -> Char -> Bool
isWordBoundary :: Char -> Char -> Bool
isWordBoundary Char
c Char
d = Char -> Bool
isWordChar Char
c forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
d

decodeBS :: ByteString -> TokenizerM Text
decodeBS :: ByteString -> TokenizerM Text
decodeBS ByteString
bs = case ByteString -> Either UnicodeException Text
decodeUtf8' ByteString
bs of
                    Left UnicodeException
_ -> forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError ([Char]
"ByteString " forall a. [a] -> [a] -> [a]
++
                                forall a. Show a => a -> [Char]
show ByteString
bs forall a. [a] -> [a] -> [a]
++ [Char]
"is not UTF8")
                    Right Text
t -> forall (m :: * -> *) a. Monad m => a -> m a
return Text
t

-- Substitute out %1, %2, etc. in regex string, escaping
-- appropriately..
subDynamic :: Regex -> TokenizerM Regex
subDynamic :: Regex -> TokenizerM Regex
subDynamic (MatchDynamic Int
capNum) = do
  Text
replacement <- Int -> TokenizerM Text
getCapture Int
capNum
  forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. Monoid a => [a] -> a
mconcat forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map ((Char -> Bool) -> Regex
MatchChar forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Eq a => a -> a -> Bool
(==)) forall a b. (a -> b) -> a -> b
$ Text -> [Char]
Text.unpack Text
replacement
subDynamic (MatchAlt Regex
r1 Regex
r2) =
  Regex -> Regex -> Regex
MatchAlt forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r1 forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Regex -> TokenizerM Regex
subDynamic Regex
r2
subDynamic (MatchConcat Regex
r1 Regex
r2) =
  Regex -> Regex -> Regex
MatchConcat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r1 forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Regex -> TokenizerM Regex
subDynamic Regex
r2
subDynamic (MatchSome Regex
r) =
  Regex -> Regex
MatchSome forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic (MatchCapture Int
i Regex
r) =
  Int -> Regex -> Regex
MatchCapture Int
i forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic (AssertPositive Direction
dir Regex
r) =
  Direction -> Regex -> Regex
AssertPositive Direction
dir forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic (AssertNegative Direction
dir Regex
r) =
  Direction -> Regex -> Regex
AssertNegative Direction
dir forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic Regex
x = forall (m :: * -> *) a. Monad m => a -> m a
return Regex
x

keyword :: KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword :: KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword KeywordAttr
kwattr WordSet Text
kws ByteString
inp = do
  Char
prev <- forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Char
prevChar
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Char
prev forall a. Ord a => a -> Set a -> Bool
`Set.member` (KeywordAttr -> Set Char
keywordDelims KeywordAttr
kwattr)
  let (ByteString
w,ByteString
_) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.break (forall a. Ord a => a -> Set a -> Bool
`Set.member` (KeywordAttr -> Set Char
keywordDelims KeywordAttr
kwattr)) ByteString
inp
  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (ByteString -> Bool
BS.null ByteString
w)
  Text
w' <- ByteString -> TokenizerM Text
decodeBS ByteString
w
  let numchars :: Int
numchars = Text -> Int
Text.length Text
w'
  if Text
w' forall a. (FoldCase a, Ord a) => a -> WordSet a -> Bool
`inWordSet` WordSet Text
kws
     then Int -> TokenizerM Text
takeChars Int
numchars
     else forall (m :: * -> *) a. MonadPlus m => m a
mzero

normalizeHighlighting :: [Token] -> [Token]
normalizeHighlighting :: SourceLine -> SourceLine
normalizeHighlighting [] = []
normalizeHighlighting ((!TokenType
t,!Text
x):SourceLine
xs)
  | Text -> Bool
Text.null Text
x = SourceLine -> SourceLine
normalizeHighlighting SourceLine
xs
  | Bool
otherwise =
    (TokenType
t, Text
matchedText) forall a. a -> [a] -> [a]
: SourceLine -> SourceLine
normalizeHighlighting SourceLine
rest
    where (SourceLine
matches, SourceLine
rest) = forall a. (a -> Bool) -> [a] -> ([a], [a])
span (\(TokenType
z,Text
_) -> TokenType
z forall a. Eq a => a -> a -> Bool
== TokenType
t) SourceLine
xs
          !matchedText :: Text
matchedText = [Text] -> Text
Text.concat (Text
x forall a. a -> [a] -> [a]
: forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd SourceLine
matches)


parseCStringChar :: ByteString -> TokenizerM Text
parseCStringChar :: ByteString -> TokenizerM Text
parseCStringChar ByteString
inp = do
  case forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pCStringChar) ByteString
inp of
       Left [Char]
_      -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pCStringChar :: A.Parser ()
pCStringChar :: Parser ()
pCStringChar = do
  Char
_ <- Char -> Parser Char
A.char Char
'\\'
  Char
next <- Parser Char
A.anyChar
  case Char
next of
       Char
c | Char
c forall a. Eq a => a -> a -> Bool
== Char
'x' Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'X' -> () forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9a-fA-F")
         | Char
c forall a. Eq a => a -> a -> Bool
== Char
'0' -> () forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser ByteString
A.takeWhile ([Char] -> Char -> Bool
A.inClass [Char]
"0-7")
         | [Char] -> Char -> Bool
A.inClass [Char]
"abefnrtv\"'?\\" Char
c -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
         | Bool
otherwise -> forall (m :: * -> *) a. MonadPlus m => m a
mzero

parseCChar :: ByteString -> TokenizerM Text
parseCChar :: ByteString -> TokenizerM Text
parseCChar ByteString
inp = do
  case forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pCChar) ByteString
inp of
       Left [Char]
_      -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pCChar :: A.Parser ()
pCChar :: Parser ()
pCChar = do
  () forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'\''
  Parser ()
pCStringChar forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy (\Char
c -> Char
c forall a. Eq a => a -> a -> Bool
/= Char
'\'' Bool -> Bool -> Bool
&& Char
c forall a. Eq a => a -> a -> Bool
/= Char
'\\')
  () forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'\''

parseInt :: ByteString -> TokenizerM Text
parseInt :: ByteString -> TokenizerM Text
parseInt ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (forall a. Parser a -> Parser (ByteString, a)
A.match (Parser ()
pHex forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
pOct forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
pDec)) ByteString
inp of
       Left [Char]
_      -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pDec :: A.Parser ()
pDec :: Parser ()
pDec = do
  Parser ()
mbMinus
  ByteString
_ <- (Char -> Bool) -> Parser ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9")
  Parser ()
guardWordBoundary

parseOct :: ByteString -> TokenizerM Text
parseOct :: ByteString -> TokenizerM Text
parseOct ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pHex) ByteString
inp of
       Left [Char]
_      -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pOct :: A.Parser ()
pOct :: Parser ()
pOct = do
  Parser ()
mbMinus
  Char
_ <- Char -> Parser Char
A.char Char
'0'
  Char
_ <- (Char -> Bool) -> Parser Char
A.satisfy ([Char] -> Char -> Bool
A.inClass [Char]
"Oo")
  ByteString
_ <- (Char -> Bool) -> Parser ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-7")
  Parser ()
guardWordBoundary

parseHex :: ByteString -> TokenizerM Text
parseHex :: ByteString -> TokenizerM Text
parseHex ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pHex) ByteString
inp of
       Left [Char]
_      -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pHex :: A.Parser ()
pHex :: Parser ()
pHex = do
  Parser ()
mbMinus
  Char
_ <- Char -> Parser Char
A.char Char
'0'
  Char
_ <- (Char -> Bool) -> Parser Char
A.satisfy ([Char] -> Char -> Bool
A.inClass [Char]
"Xx")
  ByteString
_ <- (Char -> Bool) -> Parser ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9a-fA-F")
  Parser ()
guardWordBoundary

guardWordBoundary :: A.Parser ()
guardWordBoundary :: Parser ()
guardWordBoundary = do
  Maybe Char
mbw <- Parser (Maybe Char)
A.peekChar
  case Maybe Char
mbw of
       Just Char
c  ->  forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ Char -> Char -> Bool
isWordBoundary Char
'0' Char
c
       Maybe Char
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return ()

mbMinus :: A.Parser ()
mbMinus :: Parser ()
mbMinus = (() forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'-') forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall (m :: * -> *) a. Monad m => a -> m a
return ()

mbPlusMinus :: A.Parser ()
mbPlusMinus :: Parser ()
mbPlusMinus = () forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy ([Char] -> Char -> Bool
A.inClass [Char]
"+-") forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall (m :: * -> *) a. Monad m => a -> m a
return ()

parseFloat :: ByteString -> TokenizerM Text
parseFloat :: ByteString -> TokenizerM Text
parseFloat ByteString
inp = do
  ByteString -> TokenizerM ()
wordBoundary ByteString
inp
  case forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pFloat) ByteString
inp of
       Left [Char]
_      -> forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r)  -- assumes all ascii
  where pFloat :: A.Parser ()
        pFloat :: Parser ()
pFloat = do
          let digits :: Parser ByteString
digits = (Char -> Bool) -> Parser ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9")
          Parser ()
mbPlusMinus
          Bool
before <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False forall a b. (a -> b) -> a -> b
$ Bool
True forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Parser ByteString
digits
          Bool
dot <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False forall a b. (a -> b) -> a -> b
$ Bool
True forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy ([Char] -> Char -> Bool
A.inClass [Char]
".")
          Bool
after <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False forall a b. (a -> b) -> a -> b
$ Bool
True forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Parser ByteString
digits
          Bool
e <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False forall a b. (a -> b) -> a -> b
$ Bool
True forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ ((Char -> Bool) -> Parser Char
A.satisfy ([Char] -> Char -> Bool
A.inClass [Char]
"Ee") forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
                                         Parser ()
mbPlusMinus forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString
digits)
          Maybe Char
mbnext <- Parser (Maybe Char)
A.peekChar
          case Maybe Char
mbnext of
               Maybe Char
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
               Just Char
c  -> forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Bool
not forall a b. (a -> b) -> a -> b
$ [Char] -> Char -> Bool
A.inClass [Char]
"." Char
c)
          forall (f :: * -> *). Alternative f => Bool -> f ()
guard forall a b. (a -> b) -> a -> b
$ (Bool
before Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
dot Bool -> Bool -> Bool
&& Bool
e)     -- 5e2
               Bool -> Bool -> Bool
|| (Bool
before Bool -> Bool -> Bool
&& Bool
dot Bool -> Bool -> Bool
&& (Bool
after Bool -> Bool -> Bool
|| Bool -> Bool
not Bool
e)) -- 5.2e2 or 5.2 or 5.
               Bool -> Bool -> Bool
|| (Bool -> Bool
not Bool
before Bool -> Bool -> Bool
&& Bool
dot Bool -> Bool -> Bool
&& Bool
after) -- .23 or .23e2