-- | This is another hand-written lexer, this time for the Xtract
--   command-language.  The entry point is lexXtract.  You don't
--   normally need to use this module directly - the lexer is called
--   automatically by the parser.  (We only expose this interface
--   for debugging purposes.)
--
--   The Xtract command language is very like the XPath specification.

module Text.XML.HaXml.Xtract.Lex
  ( lexXtract
  , Posn(..)
  , TokenT(..)
  , Token
  ) where

import Data.Char


type Token = Either String (Posn, TokenT)

data Posn = Pn Int              -- char index only
        deriving Posn -> Posn -> Bool
(Posn -> Posn -> Bool) -> (Posn -> Posn -> Bool) -> Eq Posn
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Posn -> Posn -> Bool
== :: Posn -> Posn -> Bool
$c/= :: Posn -> Posn -> Bool
/= :: Posn -> Posn -> Bool
Eq

instance Show Posn where
      showsPrec :: Int -> Posn -> ShowS
showsPrec Int
_p (Pn Int
c) = String -> ShowS
showString String
"char pos " ShowS -> ShowS -> ShowS
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ShowS
forall a. Show a => a -> ShowS
shows Int
c

data TokenT =
      Symbol String
    | TokString String          --     begins with letter
    | TokNum Integer            --     begins with digit
    deriving TokenT -> TokenT -> Bool
(TokenT -> TokenT -> Bool)
-> (TokenT -> TokenT -> Bool) -> Eq TokenT
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: TokenT -> TokenT -> Bool
== :: TokenT -> TokenT -> Bool
$c/= :: TokenT -> TokenT -> Bool
/= :: TokenT -> TokenT -> Bool
Eq

instance Show TokenT where
    showsPrec :: Int -> TokenT -> ShowS
showsPrec Int
_p (Symbol String
s) = String -> ShowS
showString String
s
    showsPrec Int
_p (TokString String
s) = String -> ShowS
showString String
s
    showsPrec Int
_p (TokNum Integer
n) = Integer -> ShowS
forall a. Show a => a -> ShowS
shows Integer
n

emit :: TokenT -> Posn -> Token
emit :: TokenT -> Posn -> Token
emit TokenT
tok Posn
p = Posn -> Int
forcep Posn
p Int -> Token -> Token
forall a b. a -> b -> b
`seq` (Posn, TokenT) -> Token
forall a b. b -> Either a b
Right (Posn
p,TokenT
tok)
  where forcep :: Posn -> Int
forcep (Pn Int
n) = Int
n

lexerror :: String -> Posn -> [Token]
lexerror :: String -> Posn -> [Token]
lexerror String
s Posn
p = [String -> Token
forall a b. a -> Either a b
Left (String
"Lexical error in selection pattern at "String -> ShowS
forall a. [a] -> [a] -> [a]
++Posn -> String
forall a. Show a => a -> String
show Posn
pString -> ShowS
forall a. [a] -> [a] -> [a]
++String
": "
                       String -> ShowS
forall a. [a] -> [a] -> [a]
++String
sString -> ShowS
forall a. [a] -> [a] -> [a]
++String
"\n")]

addcol :: Int -> Posn -> Posn
addcol :: Int -> Posn -> Posn
addcol Int
n (Pn Int
c) = Int -> Posn
Pn (Int
cInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
n)

newline, tab :: Posn -> Posn
newline :: Posn -> Posn
newline (Pn Int
c) = Int -> Posn
Pn (Int
cInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
tab :: Posn -> Posn
tab     (Pn Int
c) = Int -> Posn
Pn (((Int
cInt -> Int -> Int
forall a. Integral a => a -> a -> a
`div`Int
8)Int -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
8)

white :: Char -> Posn -> Posn
white :: Char -> Posn -> Posn
white Char
'\t' = Posn -> Posn
tab
white Char
' '  = Int -> Posn -> Posn
addcol Int
1
white Char
'\n' = Int -> Posn -> Posn
addcol Int
1
white Char
'\r' = Int -> Posn -> Posn
addcol Int
1
white Char
'\xa0' = Int -> Posn -> Posn
addcol Int
1

blank :: (Posn->String->[Token]) -> Posn-> String-> [Token]
blank :: (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
_ Posn
_ []       = []
blank Posn -> String -> [Token]
k Posn
p (Char
' ': String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
s
blank Posn -> String -> [Token]
k Posn
p (Char
'\t':String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Posn -> Posn
tab Posn
p) String
s
blank Posn -> String -> [Token]
k Posn
p (Char
'\n':String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Posn -> Posn
newline Posn
p) String
s
blank Posn -> String -> [Token]
k Posn
p (Char
'\r':String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k Posn
p String
s
blank Posn -> String -> [Token]
k Posn
p (Char
'\xa0': String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
s
blank Posn -> String -> [Token]
k Posn
p    String
s     = Posn -> String -> [Token]
k Posn
p String
s

----
-- | First argument is a transformer for pattern strings, e.g. map toLower,
--   but only applying to parts of the pattern not in quotation marks.
--   (Needed to canonicalise HTML where tags are case-insensitive, but
--   attribute values are case sensitive.)
lexXtract :: (String->String) -> String -> [Token]
lexXtract :: ShowS -> String -> [Token]
lexXtract ShowS
f = ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn
Pn Int
1)

syms :: [Char]
syms :: String
syms = String
"/[]()@,=*&|~$+-<>"

selAny :: (String->String) -> Posn -> String -> [Token]
selAny :: ShowS -> Posn -> String -> [Token]
selAny ShowS
_ Posn
_ [] = []
selAny ShowS
f Posn
p (Char
'/':Char
'/':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
"//") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
2 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'!':Char
'=':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
"!=") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
2 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'<':Char
'=':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
"<=") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
2 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'>':Char
'=':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
">=") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
2 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'\'':String
ss)    = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
"'") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:
                          Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
'\'' (String -> TokenT
Symbol String
"'") [] Posn
p (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss
                                          (ShowS -> Posn -> String -> [Token]
selAny ShowS
f)
selAny ShowS
f Posn
p (Char
'"':String
ss)     = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
"\"") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:
                          Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
'"' (String -> TokenT
Symbol String
"\"") [] Posn
p (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss
                                          (ShowS -> Posn -> String -> [Token]
selAny ShowS
f)
selAny ShowS
f Posn
p (Char
'_':String
ss)     = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f String
"_" Posn
p (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
selAny ShowS
f Posn
p (Char
':':String
ss)     = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f String
":" Posn
p (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
selAny ShowS
f Posn
p (Char
'.':Char
'=':Char
'.':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
".=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
3 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'.':Char
'!':Char
'=':Char
'.':String
ss)
                            = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
".!=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
4 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'.':Char
'<':Char
'.':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
".<.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
3 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'.':Char
'<':Char
'=':Char
'.':String
ss)
                            = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
".<=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
4 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'.':Char
'>':Char
'.':String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
".>.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
3 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'.':Char
'>':Char
'=':Char
'.':String
ss)
                            = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
".>=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
4 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
'.':Char
'/':String
ss)     = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol String
"./") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
2 Posn
p) String
ss
selAny ShowS
f Posn
p (Char
s:String
ss)
    | Char
s Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
syms   = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol [Char
s]) Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:     ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss
    | Char -> Bool
isSpace Char
s       = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f) Posn
p (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
ss)
    | Char -> Bool
isAlpha Char
s       = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f [Char
s] Posn
p (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
    | Char -> Bool
isDigit Char
s       = String
-> Posn -> Posn -> String -> (Posn -> String -> [Token]) -> [Token]
gatherNum    [Char
s] Posn
p (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
    | Bool
otherwise       = String -> Posn -> [Token]
lexerror String
"unrecognised pattern" Posn
p

gatherName :: (String->String) -> String -> Posn -> Posn -> String
              -> (Posn->String->[Token]) -> [Token]
gatherName :: ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f String
acc Posn
pos Posn
p (Char
s:String
ss) Posn -> String -> [Token]
k
  | Char -> Bool
isAlphaNum Char
s Bool -> Bool -> Bool
|| Char
s Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
"-_:" = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss Posn -> String -> [Token]
k
gatherName ShowS
f String
acc Posn
pos Posn
p String
ss Posn -> String -> [Token]
k =
  TokenT -> Posn -> Token
emit (String -> TokenT
TokString (ShowS
f (ShowS
forall a. [a] -> [a]
reverse String
acc))) Posn
posToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: Posn -> String -> [Token]
k Posn
p String
ss

gatherNum :: String -> Posn -> Posn -> String
             -> (Posn->String->[Token]) -> [Token]
gatherNum :: String
-> Posn -> Posn -> String -> (Posn -> String -> [Token]) -> [Token]
gatherNum String
acc Posn
pos Posn
p (Char
s:String
ss) Posn -> String -> [Token]
k
  | Char -> Bool
isHexDigit Char
s = String
-> Posn -> Posn -> String -> (Posn -> String -> [Token]) -> [Token]
gatherNum (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss Posn -> String -> [Token]
k
gatherNum String
acc Posn
pos Posn
p String
ss Posn -> String -> [Token]
k =
  TokenT -> Posn -> Token
emit (Integer -> TokenT
TokNum (String -> Integer
forall a. Read a => String -> a
read (ShowS
forall a. [a] -> [a]
reverse String
acc))) Posn
posToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: Posn -> String -> [Token]
k Posn
p String
ss

accumulateUntil :: Char -> TokenT -> String -> Posn -> Posn -> String
                   -> (Posn->String->[Token]) -> [Token]
accumulateUntil :: Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
c TokenT
_tok String
_acc Posn
pos  Posn
p  [] Posn -> String -> [Token]
_k =
    String -> Posn -> [Token]
lexerror (String
"found end of pattern while looking for "String -> ShowS
forall a. [a] -> [a] -> [a]
++Char
c
              Char -> ShowS
forall a. a -> [a] -> [a]
:String
" to match opening quote at "String -> ShowS
forall a. [a] -> [a] -> [a]
++Posn -> String
forall a. Show a => a -> String
show Posn
pos) Posn
p
accumulateUntil Char
c  TokenT
tok  String
acc Posn
pos  Posn
p (Char
s:String
ss) Posn -> String -> [Token]
k
    | Char
cChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
s       = TokenT -> Posn -> Token
emit (String -> TokenT
TokString (ShowS
forall a. [a] -> [a]
reverse String
acc)) Posn
posToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:
                                  TokenT -> Posn -> Token
emit TokenT
tok Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: Posn -> String -> [Token]
k (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss
    | Char -> Bool
isSpace Char
s  = Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
c TokenT
tok (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Char -> Posn -> Posn
white Char
s Posn
p) String
ss Posn -> String -> [Token]
k
    | Bool
otherwise  = Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
c TokenT
tok (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Int -> Posn -> Posn
addcol Int
1 Posn
p) String
ss Posn -> String -> [Token]
k