{-# LANGUAGE BangPatterns #-}
module Language.PureScript.CST.Lexer
( lenient
, lex
, lexTopLevel
, lexWithState
, isUnquotedKey
) where
import Prelude hiding (lex, exp, exponent, lines)
import Control.Monad (join)
import qualified Data.Char as Char
import qualified Data.DList as DList
import Data.Foldable (foldl')
import Data.Functor (($>))
import qualified Data.Scientific as Sci
import Data.String (fromString)
import Data.Text (Text)
import qualified Data.Text as Text
import Language.PureScript.CST.Errors
import Language.PureScript.CST.Monad hiding (token)
import Language.PureScript.CST.Layout
import Language.PureScript.CST.Positions
import Language.PureScript.CST.Types
lenient :: [LexResult] -> [LexResult]
lenient = go
where
go [] = []
go (Right a : as) = Right a : go as
go (Left (st, _) : _) = do
let
pos = lexPos st
ann = TokenAnn (SourceRange pos pos) (lexLeading st) []
[Right (SourceToken ann TokEof)]
lex :: Text -> [LexResult]
lex src = do
let (leading, src') = comments src
lexWithState $ LexState
{ lexPos = advanceLeading (SourcePos 1 1) leading
, lexLeading = leading
, lexSource = src'
, lexStack = [(SourcePos 0 0, LytRoot)]
}
lexTopLevel :: Text -> [LexResult]
lexTopLevel src = do
let
(leading, src') = comments src
lexPos = advanceLeading (SourcePos 1 1) leading
hd = Right $ lytToken lexPos TokLayoutStart
tl = lexWithState $ LexState
{ lexPos = lexPos
, lexLeading = leading
, lexSource = src'
, lexStack = [(lexPos, LytWhere), (SourcePos 0 0, LytRoot)]
}
hd : tl
lexWithState :: LexState -> [LexResult]
lexWithState = go
where
Parser lexK =
tokenAndComments
go state@(LexState {..}) =
lexK lexSource onError onSuccess
where
onError lexSource' err = do
let
len1 = Text.length lexSource
len2 = Text.length lexSource'
chunk = Text.take (max 0 (len1 - len2)) lexSource
chunkDelta = textDelta chunk
pos = applyDelta lexPos chunkDelta
pure $ Left
( state { lexSource = lexSource' }
, ParserError (SourceRange pos $ applyDelta pos (0, 1)) [] lexStack err
)
onSuccess _ (TokEof, _) =
Right <$> unwindLayout lexPos lexLeading lexStack
onSuccess lexSource' (tok, (trailing, lexLeading')) = do
let
endPos = advanceToken lexPos tok
lexPos' = advanceLeading (advanceTrailing endPos trailing) lexLeading'
tokenAnn = TokenAnn
{ tokRange = SourceRange lexPos endPos
, tokLeadingComments = lexLeading
, tokTrailingComments = trailing
}
(lexStack', toks) =
insertLayout (SourceToken tokenAnn tok) lexPos' lexStack
state' = LexState
{ lexPos = lexPos'
, lexLeading = lexLeading'
, lexSource = lexSource'
, lexStack = lexStack'
}
go2 state' toks
go2 state [] = go state
go2 state (t : ts) = Right t : go2 state ts
type Lexer = ParserM ParserErrorType Text
{-# INLINE next #-}
next :: Lexer ()
next = Parser $ \inp _ ksucc ->
ksucc (Text.drop 1 inp) ()
{-# INLINE nextWhile #-}
nextWhile :: (Char -> Bool) -> Lexer Text
nextWhile p = Parser $ \inp _ ksucc -> do
let (chs, inp') = Text.span p inp
ksucc inp' chs
{-# INLINE peek #-}
peek :: Lexer (Maybe Char)
peek = Parser $ \inp _ ksucc ->
if Text.null inp
then ksucc inp Nothing
else ksucc inp $ Just $ Text.head inp
{-# INLINE restore #-}
restore :: (ParserErrorType -> Bool) -> Lexer a -> Lexer a
restore p (Parser k) = Parser $ \inp kerr ksucc ->
k inp (\inp' err -> kerr (if p err then inp else inp') err) ksucc
tokenAndComments :: Lexer (Token, ([Comment void], [Comment LineFeed]))
tokenAndComments = (,) <$> token <*> breakComments
comments :: Text -> ([Comment LineFeed], Text)
comments = \src -> k src (\_ _ -> ([], src)) (\inp (a, b) -> (a <> b, inp))
where
Parser k = breakComments
breakComments :: Lexer ([Comment void], [Comment LineFeed])
breakComments = k0 []
where
k0 acc = do
spaces <- nextWhile (== ' ')
lines <- nextWhile isLineFeed
let
acc'
| Text.null spaces = acc
| otherwise = Space (Text.length spaces) : acc
if Text.null lines
then do
mbComm <- comment
case mbComm of
Just comm -> k0 (comm : acc')
Nothing -> pure (reverse acc', [])
else
k1 acc' (goWs [] $ Text.unpack lines)
k1 trl acc = do
ws <- nextWhile (\c -> c == ' ' || isLineFeed c)
let acc' = goWs acc $ Text.unpack ws
mbComm <- comment
case mbComm of
Just comm -> k1 trl (comm : acc')
Nothing -> pure (reverse trl, reverse acc')
goWs a ('\r' : '\n' : ls) = goWs (Line CRLF : a) ls
goWs a ('\r' : ls) = goWs (Line CRLF : a) ls
goWs a ('\n' : ls) = goWs (Line LF : a) ls
goWs a (' ' : ls) = goSpace a 1 ls
goWs a _ = a
goSpace a !n (' ' : ls) = goSpace a (n + 1) ls
goSpace a !n ls = goWs (Space n : a) ls
isBlockComment = Parser $ \inp _ ksucc ->
case Text.uncons inp of
Just ('-', inp2) ->
case Text.uncons inp2 of
Just ('-', inp3) ->
ksucc inp3 $ Just False
_ ->
ksucc inp Nothing
Just ('{', inp2) ->
case Text.uncons inp2 of
Just ('-', inp3) ->
ksucc inp3 $ Just True
_ ->
ksucc inp Nothing
_ ->
ksucc inp Nothing
comment = isBlockComment >>= \case
Just True -> Just <$> blockComment "{-"
Just False -> Just <$> lineComment "--"
Nothing -> pure $ Nothing
lineComment acc = do
comm <- nextWhile (\c -> c /= '\r' && c /= '\n')
pure $ Comment (acc <> comm)
blockComment acc = do
chs <- nextWhile (/= '-')
dashes <- nextWhile (== '-')
if Text.null dashes
then pure $ Comment $ acc <> chs
else peek >>= \case
Just '}' -> next $> Comment (acc <> chs <> dashes <> "}")
_ -> blockComment (acc <> chs <> dashes)
token :: Lexer Token
token = peek >>= maybe (pure TokEof) k0
where
k0 ch1 = case ch1 of
'(' -> next *> leftParen
')' -> next $> TokRightParen
'{' -> next $> TokLeftBrace
'}' -> next $> TokRightBrace
'[' -> next $> TokLeftSquare
']' -> next $> TokRightSquare
'`' -> next $> TokTick
',' -> next $> TokComma
'∷' -> next *> orOperator1 (TokDoubleColon Unicode) ch1
'←' -> next *> orOperator1 (TokLeftArrow Unicode) ch1
'→' -> next *> orOperator1 (TokRightArrow Unicode) ch1
'⇒' -> next *> orOperator1 (TokRightFatArrow Unicode) ch1
'∀' -> next *> orOperator1 (TokForall Unicode) ch1
'|' -> next *> orOperator1 TokPipe ch1
'.' -> next *> orOperator1 TokDot ch1
'\\' -> next *> orOperator1 TokBackslash ch1
'<' -> next *> orOperator2 (TokLeftArrow ASCII) ch1 '-'
'-' -> next *> orOperator2 (TokRightArrow ASCII) ch1 '>'
'=' -> next *> orOperator2' TokEquals (TokRightFatArrow ASCII) ch1 '>'
':' -> next *> orOperator2' (TokOperator [] ":") (TokDoubleColon ASCII) ch1 ':'
'?' -> next *> hole
'\'' -> next *> char
'"' -> next *> string
_ | Char.isDigit ch1 -> restore (== ErrNumberOutOfRange) (next *> number ch1)
| Char.isUpper ch1 -> next *> upper [] ch1
| isIdentStart ch1 -> next *> lower [] ch1
| isSymbolChar ch1 -> next *> operator [] [ch1]
| otherwise -> throw $ ErrLexeme (Just [ch1]) []
{-# INLINE orOperator1 #-}
orOperator1 :: Token -> Char -> Lexer Token
orOperator1 tok ch1 = join $ Parser $ \inp _ ksucc ->
case Text.uncons inp of
Just (ch2, inp2) | isSymbolChar ch2 ->
ksucc inp2 $ operator [] [ch1, ch2]
_ ->
ksucc inp $ pure tok
{-# INLINE orOperator2 #-}
orOperator2 :: Token -> Char -> Char -> Lexer Token
orOperator2 tok ch1 ch2 = join $ Parser $ \inp _ ksucc ->
case Text.uncons inp of
Just (ch2', inp2) | ch2 == ch2' ->
case Text.uncons inp2 of
Just (ch3, inp3) | isSymbolChar ch3 ->
ksucc inp3 $ operator [] [ch1, ch2, ch3]
_ ->
ksucc inp2 $ pure tok
_ ->
ksucc inp $ operator [] [ch1]
{-# INLINE orOperator2' #-}
orOperator2' :: Token -> Token -> Char -> Char -> Lexer Token
orOperator2' tok1 tok2 ch1 ch2 = join $ Parser $ \inp _ ksucc ->
case Text.uncons inp of
Just (ch2', inp2) | ch2 == ch2' ->
case Text.uncons inp2 of
Just (ch3, inp3) | isSymbolChar ch3 ->
ksucc inp3 $ operator [] [ch1, ch2, ch3]
_ ->
ksucc inp2 $ pure tok2
Just (ch2', inp2) | isSymbolChar ch2' ->
ksucc inp2 $ operator [] [ch1, ch2']
_ ->
ksucc inp $ pure tok1
leftParen :: Lexer Token
leftParen = Parser $ \inp kerr ksucc ->
case Text.span isSymbolChar inp of
(chs, inp2)
| Text.null chs -> ksucc inp TokLeftParen
| otherwise ->
case Text.uncons inp2 of
Just (')', inp3) ->
case chs of
"→" -> ksucc inp3 $ TokSymbolArr Unicode
"->" -> ksucc inp3 $ TokSymbolArr ASCII
_ | isReservedSymbol chs -> kerr inp ErrReservedSymbol
| otherwise -> ksucc inp3 $ TokSymbolName [] chs
_ -> ksucc inp TokLeftParen
symbol :: [Text] -> Lexer Token
symbol qual = restore isReservedSymbolError $ peek >>= \case
Just ch | isSymbolChar ch ->
nextWhile isSymbolChar >>= \chs ->
peek >>= \case
Just ')'
| isReservedSymbol chs -> throw ErrReservedSymbol
| otherwise -> next $> TokSymbolName qual chs
Just ch2 -> throw $ ErrLexeme (Just [ch2]) []
Nothing -> throw ErrEof
Just ch -> throw $ ErrLexeme (Just [ch]) []
Nothing -> throw ErrEof
operator :: [Text] -> [Char] -> Lexer Token
operator qual pre = do
rest <- nextWhile isSymbolChar
pure . TokOperator (reverse qual) $ Text.pack pre <> rest
upper :: [Text] -> Char -> Lexer Token
upper qual pre = do
rest <- nextWhile isIdentChar
ch1 <- peek
let name = Text.cons pre rest
case ch1 of
Just '.' -> do
let qual' = name : qual
next *> peek >>= \case
Just '(' -> next *> symbol qual'
Just ch2
| Char.isUpper ch2 -> next *> upper qual' ch2
| isIdentStart ch2 -> next *> lower qual' ch2
| isSymbolChar ch2 -> next *> operator qual' [ch2]
| otherwise -> throw $ ErrLexeme (Just [ch2]) []
Nothing ->
throw ErrEof
_ ->
pure $ TokUpperName (reverse qual) name
lower :: [Text] -> Char -> Lexer Token
lower qual pre = do
rest <- nextWhile isIdentChar
case pre of
'_' | Text.null rest ->
if null qual
then pure TokUnderscore
else throw $ ErrLexeme (Just [pre]) []
_ ->
case Text.cons pre rest of
"forall" | null qual -> pure $ TokForall ASCII
name -> pure $ TokLowerName (reverse qual) name
hole :: Lexer Token
hole = do
name <- nextWhile isIdentChar
if Text.null name
then operator [] ['?']
else pure $ TokHole name
char :: Lexer Token
char = do
(raw, ch) <- peek >>= \case
Just '\\' -> do
(raw, ch2) <- next *> escape
pure (Text.cons '\\' raw, ch2)
Just ch ->
next $> (Text.singleton ch, ch)
Nothing ->
throw $ ErrEof
peek >>= \case
Just '\''
| fromEnum ch > 0xFFFF -> throw ErrAstralCodePointInChar
| otherwise -> next $> TokChar raw ch
Just ch2 ->
throw $ ErrLexeme (Just [ch2]) []
_ ->
throw $ ErrEof
string :: Lexer Token
string = do
quotes1 <- nextWhile (== '"')
case Text.length quotes1 of
0 -> do
let
go raw acc = do
chs <- nextWhile isNormalStringChar
let
raw' = raw <> chs
acc' = acc <> DList.fromList (Text.unpack chs)
peek >>= \case
Just '"' -> next $> TokString raw' (fromString (DList.toList acc'))
Just '\\' -> next *> goEscape (raw' <> "\\") acc'
Just _ -> throw ErrLineFeedInString
Nothing -> throw ErrEof
goEscape raw acc = do
mbCh <- peek
case mbCh of
Just ch1 | isStringGapChar ch1 -> do
gap <- nextWhile isStringGapChar
peek >>= \case
Just '"' -> next $> TokString (raw <> gap) (fromString (DList.toList acc))
Just '\\' -> next *> go (raw <> gap <> "\\") acc
Just ch -> throw $ ErrCharInGap ch
Nothing -> throw ErrEof
_ -> do
(raw', ch) <- escape
go (raw <> raw') (acc <> DList.singleton ch)
go "" mempty
1 ->
pure $ TokString "" ""
n | n >= 5 -> do
let str = Text.take 5 quotes1
pure $ TokString str (fromString (Text.unpack str))
_ -> do
let
go acc = do
chs <- nextWhile (/= '"')
quotes2 <- nextWhile (== '"')
case Text.length quotes2 of
0 -> throw ErrEof
n | n >= 3 -> pure $ TokRawString $ acc <> chs <> Text.drop 3 quotes2
_ -> go (acc <> chs <> quotes2)
go ""
escape :: Lexer (Text, Char)
escape = do
ch <- peek
case ch of
Just 't' -> next $> ("\t", '\t')
Just 'r' -> next $> ("\\r", '\r')
Just 'n' -> next $> ("\\n", '\n')
Just '"' -> next $> ("\"", '"')
Just '\'' -> next $> ("'", '\'')
Just '\\' -> next $> ("\\", '\\')
Just 'x' -> (*>) next $ Parser $ \inp kerr ksucc -> do
let
go n acc (ch' : chs)
| Char.isHexDigit ch' = go (n * 16 + Char.digitToInt ch') (ch' : acc) chs
go n acc _
| n <= 0x10FFFF =
ksucc (Text.drop (length acc) inp)
(Text.pack $ reverse acc, Char.chr n)
| otherwise =
kerr inp ErrCharEscape
go 0 [] $ Text.unpack $ Text.take 6 inp
_ -> throw ErrCharEscape
number :: Char -> Lexer Token
number ch1 = peek >>= \ch2 -> case (ch1, ch2) of
('0', Just 'x') -> next *> hexadecimal
(_, _) -> do
mbInt <- integer1 ch1
mbFraction <- fraction
case (mbInt, mbFraction) of
(Just (raw, int), Nothing) -> do
let int' = digitsToInteger int
exponent >>= \case
Just (raw', exp) ->
sciDouble (raw <> raw') $ Sci.scientific int' exp
Nothing ->
pure $ TokInt raw int'
(Just (raw, int), Just (raw', frac)) -> do
let sci = digitsToScientific int frac
exponent >>= \case
Just (raw'', exp) ->
sciDouble (raw <> raw' <> raw'') $ uncurry Sci.scientific $ (+ exp) <$> sci
Nothing ->
sciDouble (raw <> raw') $ uncurry Sci.scientific sci
(Nothing, Just (raw, frac)) -> do
let sci = digitsToScientific [] frac
exponent >>= \case
Just (raw', exp) ->
sciDouble (raw <> raw') $ uncurry Sci.scientific $ (+ exp) <$> sci
Nothing ->
sciDouble raw $ uncurry Sci.scientific sci
(Nothing, Nothing) ->
peek >>= \ch -> throw $ ErrLexeme (pure <$> ch) []
sciDouble :: Text -> Sci.Scientific -> Lexer Token
sciDouble raw sci = case Sci.toBoundedRealFloat sci of
Left _ -> throw ErrNumberOutOfRange
Right n -> pure $ TokNumber raw n
integer :: Lexer (Maybe (Text, String))
integer = peek >>= \case
Just '0' -> next *> peek >>= \case
Just ch | isNumberChar ch -> throw ErrLeadingZero
_ -> pure $ Just ("0", "0")
Just ch | isDigitChar ch -> Just <$> digits
_ -> pure $ Nothing
integer1 :: Char -> Lexer (Maybe (Text, String))
integer1 = \case
'0' -> peek >>= \case
Just ch | isNumberChar ch -> throw ErrLeadingZero
_ -> pure $ Just ("0", "0")
ch | isDigitChar ch -> do
(raw, chs) <- digits
pure $ Just (Text.cons ch raw, ch : chs)
_ -> pure $ Nothing
fraction :: Lexer (Maybe (Text, String))
fraction = Parser $ \inp _ ksucc ->
case Text.uncons inp of
Just ('.', inp')
| (raw, inp'') <- Text.span isNumberChar inp'
, not (Text.null raw) ->
ksucc inp'' $ Just ("." <> raw, filter (/= '_') $ Text.unpack raw)
_ ->
ksucc inp Nothing
digits :: Lexer (Text, String)
digits = do
raw <- nextWhile isNumberChar
pure (raw, filter (/= '_') $ Text.unpack raw)
exponent :: Lexer (Maybe (Text, Int))
exponent = peek >>= \case
Just 'e' -> do
(neg, sign) <- next *> peek >>= \case
Just '-' -> next $> (True, "-")
Just '+' -> next $> (False, "+")
_ -> pure (False, "")
integer >>= \case
Just (raw, chs) -> do
let
int | neg = negate $ digitsToInteger chs
| otherwise = digitsToInteger chs
pure $ Just ("e" <> sign <> raw, fromInteger int)
Nothing -> throw ErrExpectedExponent
_ ->
pure Nothing
hexadecimal :: Lexer Token
hexadecimal = do
chs <- nextWhile Char.isHexDigit
if Text.null chs
then throw ErrExpectedHex
else pure $ TokInt ("0x" <> chs) $ digitsToIntegerBase 16 $ Text.unpack chs
digitsToInteger :: [Char] -> Integer
digitsToInteger = digitsToIntegerBase 10
digitsToIntegerBase :: Integer -> [Char] -> Integer
digitsToIntegerBase b = foldl' (\n c -> n * b + (toInteger (Char.digitToInt c))) 0
digitsToScientific :: [Char] -> [Char] -> (Integer, Int)
digitsToScientific = go 0 . reverse
where
go !exp is [] = (digitsToInteger (reverse is), exp)
go !exp is (f : fs) = go (exp - 1) (f : is) fs
isSymbolChar :: Char -> Bool
isSymbolChar c = (c `elem` (":!#$%&*+./<=>?@\\^|-~" :: [Char])) || (not (Char.isAscii c) && Char.isSymbol c)
isReservedSymbolError :: ParserErrorType -> Bool
isReservedSymbolError = (== ErrReservedSymbol)
isReservedSymbol :: Text -> Bool
isReservedSymbol = flip elem symbols
where
symbols =
[ "::"
, "∷"
, "<-"
, "←"
, "->"
, "→"
, "=>"
, "⇒"
, "∀"
, "|"
, "."
, "\\"
, "="
]
isIdentStart :: Char -> Bool
isIdentStart c = Char.isLower c || c == '_'
isIdentChar :: Char -> Bool
isIdentChar c = Char.isAlphaNum c || c == '_' || c == '\''
isDigitChar :: Char -> Bool
isDigitChar c = c >= '0' && c <= '9'
isNumberChar :: Char -> Bool
isNumberChar c = (c >= '0' && c <= '9') || c == '_'
isNormalStringChar :: Char -> Bool
isNormalStringChar c = c /= '"' && c /= '\\' && c /= '\r' && c /= '\n'
isStringGapChar :: Char -> Bool
isStringGapChar c = c == ' ' || c == '\r' || c == '\n'
isLineFeed :: Char -> Bool
isLineFeed c = c == '\r' || c == '\n'
isUnquotedKey :: Text -> Bool
isUnquotedKey t =
case Text.uncons t of
Nothing ->
False
Just (hd, tl) ->
isIdentStart hd && Text.all isIdentChar tl