module Text.XML.HaXml.Xtract.Lex
( lexXtract
, Posn(..)
, TokenT(..)
, Token
) where
import Data.Char
type Token = Either String (Posn, TokenT)
data Posn = Pn Int
deriving Eq
instance Show Posn where
showsPrec _p (Pn c) = showString "char pos " . shows c
data TokenT =
Symbol String
| TokString String
| TokNum Integer
deriving Eq
instance Show TokenT where
showsPrec _p (Symbol s) = showString s
showsPrec _p (TokString s) = showString s
showsPrec _p (TokNum n) = shows n
emit :: TokenT -> Posn -> Token
emit tok p = forcep p `seq` Right (p,tok)
where forcep (Pn n) = n
lexerror :: String -> Posn -> [Token]
lexerror s p = [Left ("Lexical error in selection pattern at "++show p++": "
++s++"\n")]
addcol :: Int -> Posn -> Posn
addcol n (Pn c) = Pn (c+n)
newline, tab :: Posn -> Posn
newline (Pn c) = Pn (c+1)
tab (Pn c) = Pn (((c`div`8)+1)*8)
white :: Char -> Posn -> Posn
white '\t' = tab
white ' ' = addcol 1
white '\n' = addcol 1
white '\r' = addcol 1
white '\xa0' = addcol 1
blank :: (Posn->String->[Token]) -> Posn-> String-> [Token]
blank _ _ [] = []
blank k p (' ': s) = blank k (addcol 1 p) s
blank k p ('\t':s) = blank k (tab p) s
blank k p ('\n':s) = blank k (newline p) s
blank k p ('\r':s) = blank k p s
blank k p ('\xa0': s) = blank k (addcol 1 p) s
blank k p s = k p s
lexXtract :: (String->String) -> String -> [Token]
lexXtract f = selAny f (Pn 1)
syms :: [Char]
syms = "/[]()@,=*&|~$+-<>"
selAny :: (String->String) -> Posn -> String -> [Token]
selAny _ _ [] = []
selAny f p ('/':'/':ss) = emit (Symbol "//") p: selAny f (addcol 2 p) ss
selAny f p ('!':'=':ss) = emit (Symbol "!=") p: selAny f (addcol 2 p) ss
selAny f p ('<':'=':ss) = emit (Symbol "<=") p: selAny f (addcol 2 p) ss
selAny f p ('>':'=':ss) = emit (Symbol ">=") p: selAny f (addcol 2 p) ss
selAny f p ('\'':ss) = emit (Symbol "'") p:
accumulateUntil '\'' (Symbol "'") [] p (addcol 1 p) ss
(selAny f)
selAny f p ('"':ss) = emit (Symbol "\"") p:
accumulateUntil '"' (Symbol "\"") [] p (addcol 1 p) ss
(selAny f)
selAny f p ('_':ss) = gatherName f "_" p (addcol 1 p) ss (blank (selAny f))
selAny f p (':':ss) = gatherName f ":" p (addcol 1 p) ss (blank (selAny f))
selAny f p ('.':'=':'.':ss) = emit (Symbol ".=.") p: selAny f (addcol 3 p) ss
selAny f p ('.':'!':'=':'.':ss)
= emit (Symbol ".!=.") p: selAny f (addcol 4 p) ss
selAny f p ('.':'<':'.':ss) = emit (Symbol ".<.") p: selAny f (addcol 3 p) ss
selAny f p ('.':'<':'=':'.':ss)
= emit (Symbol ".<=.") p: selAny f (addcol 4 p) ss
selAny f p ('.':'>':'.':ss) = emit (Symbol ".>.") p: selAny f (addcol 3 p) ss
selAny f p ('.':'>':'=':'.':ss)
= emit (Symbol ".>=.") p: selAny f (addcol 4 p) ss
selAny f p ('.':'/':ss) = emit (Symbol "./") p: selAny f (addcol 2 p) ss
selAny f p (s:ss)
| s `elem` syms = emit (Symbol [s]) p: selAny f (addcol 1 p) ss
| isSpace s = blank (selAny f) p (s:ss)
| isAlpha s = gatherName f [s] p (addcol 1 p) ss (blank (selAny f))
| isDigit s = gatherNum [s] p (addcol 1 p) ss (blank (selAny f))
| otherwise = lexerror "unrecognised pattern" p
gatherName :: (String->String) -> String -> Posn -> Posn -> String
-> (Posn->String->[Token]) -> [Token]
gatherName f acc pos p (s:ss) k
| isAlphaNum s || s `elem` "-_:" = gatherName f (s:acc) pos (addcol 1 p) ss k
gatherName f acc pos p ss k =
emit (TokString (f (reverse acc))) pos: k p ss
gatherNum :: String -> Posn -> Posn -> String
-> (Posn->String->[Token]) -> [Token]
gatherNum acc pos p (s:ss) k
| isHexDigit s = gatherNum (s:acc) pos (addcol 1 p) ss k
gatherNum acc pos p ss k =
emit (TokNum (read (reverse acc))) pos: k p ss
accumulateUntil :: Char -> TokenT -> String -> Posn -> Posn -> String
-> (Posn->String->[Token]) -> [Token]
accumulateUntil c _tok _acc pos p [] _k =
lexerror ("found end of pattern while looking for "++c
:" to match opening quote at "++show pos) p
accumulateUntil c tok acc pos p (s:ss) k
| c==s = emit (TokString (reverse acc)) pos:
emit tok p: k (addcol 1 p) ss
| isSpace s = accumulateUntil c tok (s:acc) pos (white s p) ss k
| otherwise = accumulateUntil c tok (s:acc) pos (addcol 1 p) ss k