{-# LANGUAGE MagicHash, UnboxedTuples, FlexibleInstances #-}
module Scanner where
import GHC.Prim
import TokenDef
import UU.Scanner.Position
import UU.Scanner.Token
import UU.Parsing(InputState(..),Either'(..))
import Data.Maybe
import Data.List
import Data.Char
import UU.Scanner.GenToken
import Options (Options (..))
data Input = Input !Pos String (Maybe (Token, Input))
instance InputState Input Token Pos where
splitStateE input@(Input _ _ next) =
case next of
Nothing -> Right' input
Just (s, rest) -> Left' s rest
splitState (Input _ _ next) =
case next of
Nothing -> error "splitState on empty input"
Just (s, rest) -> (# s, rest #)
getPosition (Input pos _ next) = case next of
Just (s,_) -> position s
Nothing -> pos
input :: Options -> Pos -> String -> Input
input opts pos inp = Input pos
inp
(case scan opts pos inp of
Nothing -> Nothing
Just (s,p,r) -> Just (s, input opts p r)
)
type Lexer s = Pos -> String -> Maybe (s,Pos,String)
scan :: Options -> Lexer Token
scan opts p0
| column p0 == 1 = scanBeginOfLine p0
| otherwise = scan p0
where
keywords' = if lcKeywords opts
then map (map toLower) keywords
else keywords
mkKeyword s | s `elem` lowercaseKeywords = s
| otherwise = map toUpper s
scan :: Lexer Token
scan p [] = Nothing
scan p ('/':'/':xs)
| clean opts
= let (com,rest) = span (/= '\n') xs
in advc' (2+length com) p scan rest
scan p ('-':'-':xs) | null xs || not (head xs `elem` "<>!?#@:%$^&")
= let (com,rest) = span (/= '\n') xs
in advc' (2+length com) p scan rest
scan p ('{':'-':xs) = advc' 2 p (ncomment scan) xs
scan p ('/':'*':xs) | clean opts = advc' 2 p (cleancomment scan) xs
scan p ('{' :xs) = advc' 1 p codescrap xs
scan p ('\CR':xs) = case xs of
'\LF':ys -> newl' p scanBeginOfLine ys
_ -> newl' p scanBeginOfLine xs
scan p ('\LF':xs) = newl' p scanBeginOfLine xs
scan p (x:xs) | isSpace x = updPos' x p scan xs
scan p xs = Just (scan' xs)
where scan' ('.' :rs) = (reserved "." p, advc 1 p, rs)
scan' ('@' :rs) = (reserved "@" p, advc 1 p, rs)
scan' (',' :rs) = (reserved "," p, advc 1 p, rs)
scan' ('_' :rs) = (reserved "_" p, advc 1 p, rs)
scan' ('~' :rs) = (reserved "~" p, advc 1 p, rs)
scan' ('+' :rs) = (reserved "+" p, advc 1 p, rs)
scan' ('<' : '-' : rs) = (reserved "<-" p, advc 2 p, rs)
scan' ('<' : '=' : rs) = (reserved "<=" p, advc 2 p, rs)
scan' ('<' : '<' : '-' : rs) = (reserved "<<-" p, advc 3 p, rs)
scan' ('<' :rs) = (reserved "<" p, advc 1 p, rs)
scan' ('[' :rs) = (reserved "[" p, advc 1 p, rs)
scan' (']' :rs) = (reserved "]" p, advc 1 p, rs)
scan' ('(' :rs) = (reserved "(" p, advc 1 p, rs)
scan' (')' :rs) = (reserved ")" p, advc 1 p, rs)
scan' ('\"' :rs) = let isOk c = c /= '"' && c /= '\n'
(str,rest) = span isOk rs
in if null rest || head rest /= '"'
then (errToken "unterminated string literal" p
, advc (1+length str) p,rest)
else (valueToken TkString str p, advc (2+length str) p, tail rest)
scan' ('=' : '>' : rs) = (reserved "=>" p, advc 2 p, rs)
scan' ('=' :rs) = (reserved "=" p, advc 1 p, rs)
scan' (':':'=':rs) = (reserved ":=" p, advc 2 p, rs)
scan' (':':':':rs) = (reserved "::" p, advc 2 p, rs)
scan' ('∷':rs) = (reserved "::" p, advc 1 p, rs)
scan' (':' :rs) = (reserved ":" p, advc 1 p, rs)
scan' ('|' :rs) = (reserved "|" p, advc 1 p, rs)
scan' ('/':'\\':rs) = (reserved "/\\" p, advc 2 p, rs)
scan' ('-':'>' :rs) = (reserved "->" p, advc 2 p, rs)
scan' ('-' :rs) = (reserved "-" p, advc 1 p, rs)
scan' ('*' :rs) = (reserved "*" p, advc 1 p, rs)
scan' ('\'' :rs) | ocaml opts =
let (var,rest) = ident opts rs
str = '\'' : var
in (valueToken TkTextnm str p, advc (length str) p, rest)
scan' (x:rs) | isLower x = let (var,rest) = ident opts rs
str = (x:var)
tok | str `elem` keywords' = reserved (mkKeyword str)
| otherwise = valueToken TkVarid str
in (tok p, advc (length var+1) p, rest)
| isUpper x = let (var,rest) = ident opts rs
str = (x:var)
tok | str `elem` keywords' = reserved (mkKeyword str)
| otherwise = valueToken TkConid str
in (tok p, advc (length var+1) p,rest)
| otherwise = (errToken ("unexpected character " ++ show x) p, advc 1 p, rs)
scanBeginOfLine :: Lexer Token
scanBeginOfLine p ('{' : '-' : ' ' : 'L' : 'I' : 'N' : 'E' : ' ' : xs)
| isOkBegin rs && isOkEnd rs'
= scan (advc (8 + length r + 2 + length s + 4) p') (drop 4 rs')
| otherwise
= Just (errToken ("Invalid LINE pragma: " ++ show r) p, advc 8 p, xs)
where
(r,rs) = span isDigit xs
(s, rs') = span (/= '"') (drop 2 rs)
p' = Pos (read r - 1) (column p) s
isOkBegin (' ' : '"' : _) = True
isOkBegin _ = False
isOkEnd ('"' : ' ' : '-' : '}' : _) = True
isOkEnd _ = False
scanBeginOfLine p xs
= scan p xs
ident opts = span isValid
where isValid x = isAlphaNum x || x == '_' ||
(not (clean opts) && x == '\'') || (clean opts && x == '`')
lowercaseKeywords = ["loc","lhs", "inst", "optpragmas", "imports", "toplevel", "datablock", "recblock"]
keywords = lowercaseKeywords ++
[ "DATA", "RECORD", "EXT", "ATTR", "SEM","TYPE", "USE", "INCLUDE"
, "EXTENDS"
, "SET","DERIVING","FOR", "WRAPPER", "NOCATAS", "MAYBE", "EITHER", "MAP", "INTMAP"
, "PRAGMA", "SEMPRAGMA", "MODULE", "ATTACH", "UNIQUEREF", "INH", "SYN", "CHN"
, "AUGMENT", "AROUND", "MERGE", "AS", "SELF", "INTSET"
]
ncomment c p ('-':'}':xs) = advc' 2 p c xs
ncomment c p ('{':'-':xs) = advc' 2 p (ncomment (ncomment c)) xs
ncomment c p (x:xs) = updPos' x p (ncomment c) xs
ncomment c p [] = Just (errToken "unterminated nested comment" p, p,[])
cleancomment c p ('*':'/':xs) = advc' 2 p c xs
cleancomment c p ('/':'*':xs) = advc' 2 p (cleancomment (cleancomment c)) xs
cleancomment c p (x:xs) = updPos' x p (cleancomment c) xs
cleancomment c p [] = Just (errToken "unterminated nested comment" p, p,[])
codescrap p xs = let (p2,xs2,sc) = codescrap' 1 p xs
in case xs2 of
('}':rest) -> Just (valueToken TkTextln sc p,advc 1 p2,rest)
_ -> Just (errToken "unterminated codescrap" p,p2,xs2)
codescrap' d p [] = (p,[],[])
codescrap' d p ('{':xs) = let (p2,xs2,sc) = advc' 1 p (codescrap' (d+1)) xs
in (p2,xs2,'{' : sc)
codescrap' d p ('}':xs) | d == 1 = (p,'}':xs,[])
| otherwise = let (p2,xs2,sc) = advc' 1 p (codescrap' (d-1)) xs
in (p2,xs2,'}' : sc)
codescrap' d p (x :xs) = let (p2,xs2,sc) = updPos' x p (codescrap' d) xs
in (p2,xs2,x:sc)
scanLit xs = (fs, foldr insNL (const "") codeLns 1)
where insNL (n,line) r = \n1 -> replicate (n-n1) '\n' ++ line ++ r n
(fs,codeLns,_) = getBlocks ([1..] `zip` toLines xs)
getBlocks [] = ([],[],[])
getBlocks xs = let (files1,txt1,r1) = getBlock xs
(files2,txt2,r2) = getBlocks r1
in (files1++files2, txt1++txt2, r2)
getBlock = getLines . dropWhile comment
getLines [] = ([],[],[])
getLines ((n,l):ls) | "\\begin{code}" `isPrefixOf` l = let (lns,rest) = codelines ls
in ([],lns,rest)
| "\\begin{Code}" `isPrefixOf` l = let (lns,rest) = codeLines ls
in ([],lns,rest)
| "\\IN{" `isPrefixOf` l =
let name = getName l
in ([name],[],ls)
| otherwise = getBlock ls
comment = not . ("\\" `isPrefixOf`) .snd
toLines :: String -> [String]
toLines "" = []
toLines s = let (l,s') = breakLine s
in l : toLines s'
breakLine xs = case xs of
'\CR' : ys -> case ys of
'\LF' : zs -> ([],zs)
_ -> ([],ys)
'\LF' : ys -> ([], ys)
x : ys -> let (l,s) = breakLine ys
in (x:l,s)
[] -> ([],[])
codelines [] = error "Unterminated literate code block"
codelines ((n,l):ls) | "\\end{code}" `isPrefixOf` l = ([],ls)
| otherwise = let (lns,r) = codelines ls
in ((n,l):lns,r)
codeLines [] = error "Unterminated literate Code block"
codeLines ((n,l):ls) | "\\end{Code}" `isPrefixOf` l = ([],ls)
| otherwise = let (lns,r) = codeLines ls
in ((n,l):lns,r)
getName l = case r of
('}':_) -> nm
_ -> error $ "missing '}' in \\IN"
where (nm,r) = span (/='}') (drop 4 l)