-- | This is another hand-written lexer, this time for the Xtract
--   command-language.  The entry point is lexXtract.  You don't
--   normally need to use this module directly - the lexer is called
--   automatically by the parser.  (We only expose this interface
--   for debugging purposes.)
--
--   The Xtract command language is very like the XPath specification.

module Text.XML.HaXml.Xtract.Lex
  ( lexXtract
  , Posn(..)
  , TokenT(..)
  , Token
  ) where

import Data.Char


type Token = Either String (Posn, TokenT)

data Posn = Pn Int              -- char index only
        deriving Eq

instance Show Posn where
      showsPrec _p (Pn c) = showString "char pos " . shows c

data TokenT =
      Symbol String
    | TokString String          --     begins with letter
    | TokNum Integer            --     begins with digit
    deriving Eq

instance Show TokenT where
    showsPrec _p (Symbol s) = showString s
    showsPrec _p (TokString s) = showString s
    showsPrec _p (TokNum n) = shows n

emit :: TokenT -> Posn -> Token
emit tok p = forcep p `seq` Right (p,tok)
  where forcep (Pn n) = n

lexerror :: String -> Posn -> [Token]
lexerror s p = [Left ("Lexical error in selection pattern at "++show p++": "
                       ++s++"\n")]

addcol :: Int -> Posn -> Posn
addcol n (Pn c) = Pn (c+n)

newline, tab :: Posn -> Posn
newline (Pn c) = Pn (c+1)
tab     (Pn c) = Pn (((c`div`8)+1)*8)

white :: Char -> Posn -> Posn
white '\t' = tab
white ' '  = addcol 1
white '\n' = addcol 1
white '\r' = addcol 1
white '\xa0' = addcol 1

blank :: (Posn->String->[Token]) -> Posn-> String-> [Token]
blank _ _ []       = []
blank k p (' ': s) = blank k (addcol 1 p) s
blank k p ('\t':s) = blank k (tab p) s
blank k p ('\n':s) = blank k (newline p) s
blank k p ('\r':s) = blank k p s
blank k p ('\xa0': s) = blank k (addcol 1 p) s
blank k p    s     = k p s

----
-- | First argument is a transformer for pattern strings, e.g. map toLower,
--   but only applying to parts of the pattern not in quotation marks.
--   (Needed to canonicalise HTML where tags are case-insensitive, but
--   attribute values are case sensitive.)
lexXtract :: (String->String) -> String -> [Token]
lexXtract f = selAny f (Pn 1)

syms :: [Char]
syms = "/[]()@,=*&|~$+-<>"

selAny :: (String->String) -> Posn -> String -> [Token]
selAny _ _ [] = []
selAny f p ('/':'/':ss) = emit (Symbol "//") p:  selAny f (addcol 2 p) ss
selAny f p ('!':'=':ss) = emit (Symbol "!=") p:  selAny f (addcol 2 p) ss
selAny f p ('<':'=':ss) = emit (Symbol "<=") p:  selAny f (addcol 2 p) ss
selAny f p ('>':'=':ss) = emit (Symbol ">=") p:  selAny f (addcol 2 p) ss
selAny f p ('\'':ss)    = emit (Symbol "'") p:
                          accumulateUntil '\'' (Symbol "'") [] p (addcol 1 p) ss
                                          (selAny f)
selAny f p ('"':ss)     = emit (Symbol "\"") p:
                          accumulateUntil '"' (Symbol "\"") [] p (addcol 1 p) ss
                                          (selAny f)
selAny f p ('_':ss)     = gatherName f "_" p (addcol 1 p) ss (blank (selAny f))
selAny f p (':':ss)     = gatherName f ":" p (addcol 1 p) ss (blank (selAny f))
selAny f p ('.':'=':'.':ss) = emit (Symbol ".=.") p:  selAny f (addcol 3 p) ss
selAny f p ('.':'!':'=':'.':ss)
                            = emit (Symbol ".!=.") p: selAny f (addcol 4 p) ss
selAny f p ('.':'<':'.':ss) = emit (Symbol ".<.") p:  selAny f (addcol 3 p) ss
selAny f p ('.':'<':'=':'.':ss)
                            = emit (Symbol ".<=.") p: selAny f (addcol 4 p) ss
selAny f p ('.':'>':'.':ss) = emit (Symbol ".>.") p:  selAny f (addcol 3 p) ss
selAny f p ('.':'>':'=':'.':ss)
                            = emit (Symbol ".>=.") p: selAny f (addcol 4 p) ss
selAny f p ('.':'/':ss)     = emit (Symbol "./") p:  selAny f (addcol 2 p) ss
selAny f p (s:ss)
    | s `elem` syms   = emit (Symbol [s]) p:     selAny f (addcol 1 p) ss
    | isSpace s       = blank (selAny f) p (s:ss)
    | isAlpha s       = gatherName f [s] p (addcol 1 p) ss (blank (selAny f))
    | isDigit s       = gatherNum    [s] p (addcol 1 p) ss (blank (selAny f))
    | otherwise       = lexerror "unrecognised pattern" p

gatherName :: (String->String) -> String -> Posn -> Posn -> String
              -> (Posn->String->[Token]) -> [Token]
gatherName f acc pos p (s:ss) k
  | isAlphaNum s || s `elem` "-_:" = gatherName f (s:acc) pos (addcol 1 p) ss k
gatherName f acc pos p ss k =
  emit (TokString (f (reverse acc))) pos: k p ss

gatherNum :: String -> Posn -> Posn -> String
             -> (Posn->String->[Token]) -> [Token]
gatherNum acc pos p (s:ss) k
  | isHexDigit s = gatherNum (s:acc) pos (addcol 1 p) ss k
gatherNum acc pos p ss k =
  emit (TokNum (read (reverse acc))) pos: k p ss

accumulateUntil :: Char -> TokenT -> String -> Posn -> Posn -> String
                   -> (Posn->String->[Token]) -> [Token]
accumulateUntil c _tok _acc pos  p  [] _k =
    lexerror ("found end of pattern while looking for "++c
              :" to match opening quote at "++show pos) p
accumulateUntil c  tok  acc pos  p (s:ss) k
    | c==s       = emit (TokString (reverse acc)) pos:
                                  emit tok p: k (addcol 1 p) ss
    | isSpace s  = accumulateUntil c tok (s:acc) pos (white s p) ss k
    | otherwise  = accumulateUntil c tok (s:acc) pos (addcol 1 p) ss k