module Language.Haskell.HsColour.Classify
  ( TokenType(..)
  , tokenise
  ) where

import Data.Char (isSpace, isUpper, isLower, isDigit)
import Data.List

-- | Lex Haskell source code into an annotated token stream, without
--   discarding any characters or layout.
tokenise :: String -> [(TokenType,String)]
tokenise :: String -> [(TokenType, String)]
tokenise String
str = 
    let chunks :: [String]
chunks = [String] -> [String]
glue forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> [String]
chunk forall a b. (a -> b) -> a -> b
$ String
str 
    in [(TokenType, String)] -> [(TokenType, String)]
markDefs forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map (\String
s-> (String -> TokenType
classify String
s,String
s)) [String]
chunks

markDefs :: [(TokenType, String)] -> [(TokenType, String)]
markDefs :: [(TokenType, String)] -> [(TokenType, String)]
markDefs [] = []
markDefs ((TokenType
Varid, String
s) : [(TokenType, String)]
rest) = (TokenType
Definition, String
s) forall a. a -> [a] -> [a]
: [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest
markDefs ((TokenType
Varop, String
">") : (TokenType
Space, String
" ") : (TokenType
Varid, String
d) : [(TokenType, String)]
rest) =
    (TokenType
Varop, String
">") forall a. a -> [a] -> [a]
: (TokenType
Space, String
" ") forall a. a -> [a] -> [a]
: (TokenType
Definition, String
d) forall a. a -> [a] -> [a]
: [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest
markDefs [(TokenType, String)]
rest = [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest

continue :: [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest 
    = let ([(TokenType, String)]
thisLine, [(TokenType, String)]
nextLine) = forall a. (a -> Bool) -> [a] -> ([a], [a])
span (forall a. Eq a => a -> a -> Bool
/= (TokenType
Space, String
"\n")) [(TokenType, String)]
rest
      in
        case [(TokenType, String)]
nextLine of
          [] -> [(TokenType, String)]
thisLine
          ((TokenType
Space, String
"\n"):[(TokenType, String)]
nextLine') -> ([(TokenType, String)]
thisLine forall a. [a] -> [a] -> [a]
++ ((TokenType
Space, String
"\n") forall a. a -> [a] -> [a]
: ([(TokenType, String)] -> [(TokenType, String)]
markDefs [(TokenType, String)]
nextLine')))


-- Basic Haskell lexing, except we keep whitespace.
chunk :: String -> [String]
chunk :: String -> [String]
chunk []    = []
chunk (Char
'\r':String
s) = String -> [String]
chunk String
s -- get rid of DOS newline stuff
chunk (Char
'\n':String
s) = String
"\n"forall a. a -> [a] -> [a]
: String -> [String]
chunk String
s
chunk (Char
c:String
s) | Char -> Bool
isLinearSpace Char
c
            = (Char
cforall a. a -> [a] -> [a]
:String
ss)forall a. a -> [a] -> [a]
: String -> [String]
chunk String
rest where (String
ss,String
rest) = forall a. (a -> Bool) -> [a] -> ([a], [a])
span Char -> Bool
isLinearSpace String
s
chunk (Char
'{':Char
'-':String
s) = let (String
com,String
s') = Int -> String -> (String, String)
nestcomment Int
0 String
s
                    in (Char
'{'forall a. a -> [a] -> [a]
:Char
'-'forall a. a -> [a] -> [a]
:String
com) forall a. a -> [a] -> [a]
: String -> [String]
chunk String
s'
chunk String
s = case ReadS String
Prelude.lex String
s of
              []             -> [forall a. [a] -> a
head String
s]forall a. a -> [a] -> [a]
: String -> [String]
chunk (forall a. [a] -> [a]
tail String
s) -- e.g. inside comment
              ((tok :: String
tok@(Char
'-':Char
'-':String
_),String
rest):[(String, String)]
_)
                  | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (forall a. Eq a => a -> a -> Bool
==Char
'-') String
tok -> (String
tokforall a. [a] -> [a] -> [a]
++String
com)forall a. a -> [a] -> [a]
: String -> [String]
chunk String
s'
                                       where (String
com,String
s') = String -> (String, String)
eolcomment String
rest
              ((String
tok,String
rest):[(String, String)]
_) -> String
tokforall a. a -> [a] -> [a]
: String -> [String]
chunk String
rest

isLinearSpace :: Char -> Bool
isLinearSpace Char
c = Char
c forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
" \t\f" -- " \t\xa0"

-- Glue sequences of tokens into more useful blobs
glue :: [String] -> [String]
glue (String
q:String
".":String
n:[String]
rest) | Char -> Bool
isUpper (forall a. [a] -> a
head String
q)	-- qualified names
                    = [String] -> [String]
glue ((String
qforall a. [a] -> [a] -> [a]
++String
"."forall a. [a] -> [a] -> [a]
++String
n)forall a. a -> [a] -> [a]
: [String]
rest)
glue (String
"`":[String]
rest) =				-- `varid` -> varop
  case [String] -> [String]
glue [String]
rest of
    (String
qn:String
"`":[String]
rest) -> (String
"`"forall a. [a] -> [a] -> [a]
++String
qnforall a. [a] -> [a] -> [a]
++String
"`")forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
    [String]
_             -> String
"`"forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
glue (String
s:[String]
ss)       | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (forall a. Eq a => a -> a -> Bool
==Char
'-') String
s Bool -> Bool -> Bool
&& forall (t :: * -> *) a. Foldable t => t a -> Int
length String
s forall a. Ord a => a -> a -> Bool
>=Int
2	-- eol comment
                  = (String
sforall a. [a] -> [a] -> [a]
++forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
c)forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
                  where ([String]
c,[String]
rest) = forall a. (a -> Bool) -> [a] -> ([a], [a])
break (Char
'\n'forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) [String]
ss
--glue ("{":"-":ss)  = ("{-"++c): glue rest	-- nested comment
--                  where (c,rest) = nestcomment 0 ss
glue (String
"(":[String]
ss) = case [String]
rest of
                String
")":[String]
rest -> (String
"(" forall a. [a] -> [a] -> [a]
++ forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
tuple forall a. [a] -> [a] -> [a]
++ String
")") forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
                [String]
_         -> String
"(" forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
ss
              where ([String]
tuple,[String]
rest) = forall a. (a -> Bool) -> [a] -> ([a], [a])
span (forall a. Eq a => a -> a -> Bool
==String
",") [String]
ss
glue (String
"[":String
"]":[String]
ss) = String
"[]" forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
ss
glue (String
"\n":String
"#":[String]
ss)= String
"\n" forall a. a -> [a] -> [a]
: (Char
'#'forall a. a -> [a] -> [a]
:forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
line) forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
                  where ([String]
line,[String]
rest) = forall a. (a -> Bool) -> [a] -> ([a], [a])
break (Char
'\n'forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) [String]
ss
glue (String
s:[String]
ss)       = String
sforall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
ss
glue []           = []

-- Deal with comments.
nestcomment :: Int -> String -> (String,String)
nestcomment :: Int -> String -> (String, String)
nestcomment Int
n (Char
'{':Char
'-':String
ss) | Int
nforall a. Ord a => a -> a -> Bool
>=Int
0 = ((String
"{-"forall a. [a] -> [a] -> [a]
++String
cs),String
rm)
                                  where (String
cs,String
rm) = Int -> String -> (String, String)
nestcomment (Int
nforall a. Num a => a -> a -> a
+Int
1) String
ss
nestcomment Int
n (Char
'-':Char
'}':String
ss) | Int
nforall a. Ord a => a -> a -> Bool
>Int
0  = ((String
"-}"forall a. [a] -> [a] -> [a]
++String
cs),String
rm)
                                  where (String
cs,String
rm) = Int -> String -> (String, String)
nestcomment (Int
nforall a. Num a => a -> a -> a
-Int
1) String
ss
nestcomment Int
n (Char
'-':Char
'}':String
ss) | Int
nforall a. Eq a => a -> a -> Bool
==Int
0 = (String
"-}",String
ss)
nestcomment Int
n (Char
s:String
ss)       | Int
nforall a. Ord a => a -> a -> Bool
>=Int
0 = ((Char
sforall a. a -> [a] -> [a]
:String
cs),String
rm)
                                  where (String
cs,String
rm) = Int -> String -> (String, String)
nestcomment Int
n String
ss
nestcomment Int
n [] = ([],[])

eolcomment :: String -> (String,String)
eolcomment :: String -> (String, String)
eolcomment s :: String
s@(Char
'\n':String
_) = ([], String
s)
eolcomment (Char
'\r':String
s)   = String -> (String, String)
eolcomment String
s
eolcomment (Char
c:String
s)      = (Char
cforall a. a -> [a] -> [a]
:String
cs, String
s') where (String
cs,String
s') = String -> (String, String)
eolcomment String
s
eolcomment []         = ([],[])

-- | Classification of tokens as lexical entities
data TokenType =
  Space | Keyword | Keyglyph | Layout | Comment | Conid | Varid |
  Conop | Varop   | String   | Char   | Number  | Cpp   | Error |
  Definition
  deriving (TokenType -> TokenType -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TokenType -> TokenType -> Bool
$c/= :: TokenType -> TokenType -> Bool
== :: TokenType -> TokenType -> Bool
$c== :: TokenType -> TokenType -> Bool
Eq,Int -> TokenType -> ShowS
[TokenType] -> ShowS
TokenType -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [TokenType] -> ShowS
$cshowList :: [TokenType] -> ShowS
show :: TokenType -> String
$cshow :: TokenType -> String
showsPrec :: Int -> TokenType -> ShowS
$cshowsPrec :: Int -> TokenType -> ShowS
Show)

classify :: String -> TokenType
classify :: String -> TokenType
classify s :: String
s@(Char
h:String
t)
    | Char -> Bool
isSpace Char
h              = TokenType
Space
    | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (forall a. Eq a => a -> a -> Bool
==Char
'-') String
s          = TokenType
Comment
    | String
"--" forall a. Eq a => [a] -> [a] -> Bool
`isPrefixOf` String
s
      Bool -> Bool -> Bool
&& forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any Char -> Bool
isSpace String
s       = TokenType
Comment		-- not fully correct
    | String
"{-" forall a. Eq a => [a] -> [a] -> Bool
`isPrefixOf` String
s    = TokenType
Comment
    | String
s forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [String]
keywords      = TokenType
Keyword
    | String
s forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [String]
keyglyphs     = TokenType
Keyglyph
    | String
s forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [String]
layoutchars   = TokenType
Layout
    | Char -> Bool
isUpper Char
h              = TokenType
Conid
    | String
s forall a. Eq a => a -> a -> Bool
== String
"[]"              = TokenType
Conid
    | Char
h forall a. Eq a => a -> a -> Bool
== Char
'(' Bool -> Bool -> Bool
&& String -> Bool
isTupleTail String
t = TokenType
Conid
    | Char
h forall a. Eq a => a -> a -> Bool
== Char
'#'               = TokenType
Cpp
    | Char -> Bool
isLower Char
h              = TokenType
Varid
    | Char
h forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
symbols       = TokenType
Varop
    | Char
hforall a. Eq a => a -> a -> Bool
==Char
':'                 = TokenType
Conop
    | Char
hforall a. Eq a => a -> a -> Bool
==Char
'`'                 = TokenType
Varop
    | Char
hforall a. Eq a => a -> a -> Bool
==Char
'"'                 = TokenType
String
    | Char
hforall a. Eq a => a -> a -> Bool
==Char
'\''                = TokenType
Char
    | Char -> Bool
isDigit Char
h              = TokenType
Number
    | Bool
otherwise              = TokenType
Error
classify String
_ = TokenType
Space

isTupleTail :: String -> Bool
isTupleTail [Char
')'] = Bool
True
isTupleTail (Char
',':String
xs) = String -> Bool
isTupleTail String
xs
isTupleTail String
_ = Bool
False


-- Haskell keywords
keywords :: [String]
keywords =
  [String
"case",String
"class",String
"data",String
"default",String
"deriving",String
"do",String
"else",String
"forall"
  ,String
"if",String
"import",String
"in",String
"infix",String
"infixl",String
"infixr",String
"instance",String
"let",String
"module"
  ,String
"newtype",String
"of",String
"qualified",String
"then",String
"type",String
"where",String
"_"
  ,String
"foreign",String
"ccall",String
"as",String
"safe",String
"unsafe",String
"family"]
keyglyphs :: [String]
keyglyphs =
  [String
"..",String
"::",String
"=",String
"\\",String
"|",String
"<-",String
"->",String
"@",String
"~",String
"=>",String
"[",String
"]"]
layoutchars :: [String]
layoutchars =
  forall a b. (a -> b) -> [a] -> [b]
map (forall a. a -> [a] -> [a]
:[]) String
";{}(),"
symbols :: String
symbols =
  String
"!#$%&*+./<=>?@\\^|-~"