{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-}
module Text.XML.Lexer where
import Common
import Text.XML.Types
import qualified Data.Text as TS
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Numeric (readHex)
class XmlSource s where
uncons :: s -> Maybe (Char,s)
instance XmlSource String where
uncons (c:s) = Just (c,s)
uncons "" = Nothing
instance XmlSource TS.Text where
uncons = TS.uncons
instance XmlSource TL.Text where
uncons = TL.uncons
data Scanner s = Scanner (Maybe (Char,s)) (s -> Maybe (Char,s))
customScanner :: (s -> Maybe (Char,s)) -> s -> Scanner s
customScanner next s = Scanner (next s) next
instance XmlSource (Scanner s) where
uncons (Scanner this next) = do
(c,s1) <- this
return (c, Scanner (next s1) next)
type LChar = (Pos,Char)
type LString = [LChar]
data Token = TokStart !Pos QName [Attr] Bool
| TokEnd !Pos QName
| TokCRef ShortText
| TokText CData
| TokError !Pos String
deriving (Show,Data,Typeable,Generic)
instance NFData Token
eofErr :: [Token]
eofErr = [TokError (-1) "Premature EOF"]
scanXML :: XmlSource source => source -> [Token]
scanXML = tokens' . go 0
where
go !n src = case uncons src of
Just (c,src') -> (n,c) : go (n+1) src'
Nothing -> []
tokens' :: LString -> [Token]
tokens' ((_,'<') : c@(_,'!') : cs) = special c cs
tokens' ((_,'<') : cs) = tag cs
tokens' [] = []
tokens' cs@((_,_):_) = let (as,bs) = breakn ('<' ==) cs
in map cvt (decode_text as) ++ tokens' bs
where cvt (TxtBit x) = TokText CData { cdVerbatim = CDataText
, cdData = fromString x
}
cvt (CRefBit x) = case cref_to_char x of
Just c -> TokText CData { cdVerbatim = CDataText
, cdData = T.singleton c
}
Nothing -> TokCRef (fromString x)
special :: LChar -> LString -> [Token]
special (_,_) ((_,'-') : (_,'-') : cs) = skip cs
where
skip ((pos,'-') : (_,'-') : (_,x) : ds)
| x == '>' = tokens' ds
| otherwise = [TokError pos "double hyphen within comment"]
skip (_ : ds) = skip ds
skip [] = eofErr
special _ ((_,'[') : (_,'C') : (_,'D') : (_,'A') : (_,'T') : (_,'A') : (_,'[') : cs) =
let (xs,ts) = cdata cs
in TokText CData { cdVerbatim = CDataVerbatim
, cdData = fromString xs
} : tokens' ts
where cdata ((_,']') : (_,']') : (_,'>') : ds) = ([],ds)
cdata ((_,d) : ds) = let (xs,ys) = cdata ds in (d:xs,ys)
cdata [] = ([],[])
special _ cs =
let (xs,ts) = munch "" 0 cs
in TokText CData { cdVerbatim = CDataRaw
, cdData = fromString ('<':'!':reverse xs)
} : tokens' ts
where munch acc nesting ((_,'>') : ds)
| nesting == (0::Int) = ('>':acc,ds)
| otherwise = munch ('>':acc) (nesting-1) ds
munch acc nesting ((_,'<') : ds)
= munch ('<':acc) (nesting+1) ds
munch acc n ((_,x) : ds) = munch (x:acc) n ds
munch acc _ [] = (acc,[])
qualName :: LString -> (QName,LString)
qualName xs = (QName { qURI = Nothing
, qPrefix = fmap fromString q
, qLName = LName (fromString n)
}, bs)
where
(as,bs) = breakn endName xs
(q,n) = case break (':'==) as of
(q1,_:n1) -> (Just q1, n1)
_ -> (Nothing, as)
endName x = isSpace x || x == '=' || x == '>' || x == '/'
tag :: LString -> [Token]
tag ((p,'/') : cs)
= TokEnd p n : case dropSpace ds of
(_,'>') : es -> tokens' es
(p',_) : _ -> [TokError p' "expected '>'"]
[] -> eofErr
where
(n,ds) = qualName (dropSpace cs)
tag [] = eofErr
tag cs
= TokStart (fst (head cs)) n as b : ts
where
(n,ds) = qualName cs
(as,b,ts) = attribs (dropSpace ds)
attribs :: LString -> ([Attr], Bool, [Token])
attribs cs = case cs of
(_,'>') : ds -> ([], False, tokens' ds)
(_,'/') : ds -> ([], True, case ds of
(_,'>') : es -> tokens' es
(pos,_) : _ -> [TokError pos "expected '>'"]
[] -> eofErr)
(_,'?') : (_,'>') : ds -> ([], True, tokens' ds)
[] -> ([],False,eofErr)
_ -> let (a,cs1) = attrib cs
(as,b,ts) = attribs cs1
in (a:as,b,ts)
attrib :: LString -> (Attr,LString)
attrib cs = let (ks,cs1) = qualName cs
(vs,cs2) = attr_val (dropSpace cs1)
in ((Attr ks (fromString $ decode_attr vs)),dropSpace cs2)
attr_val :: LString -> (String,LString)
attr_val ((_,'=') : cs0) = string (dropSpace cs0)
where
string :: LString -> (String,LString)
string ((_,'"') : cs) = break' ('"' ==) cs
string ((_,'\'') : cs) = break' ('\'' ==) cs
string cs = breakn eos cs
where eos x = isSpace x || x == '>' || x == '/'
attr_val cs = ("",cs)
dropSpace :: LString -> LString
dropSpace = dropWhile (isSpace . snd)
isSpace :: Char -> Bool
isSpace = (`elem` "\x20\x09\x0D\x0A")
break' :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)])
break' p xs = let (as,bs) = breakn p xs
in (as, case bs of
[] -> []
_ : cs -> cs)
breakn :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)])
breakn p l = (map snd as,bs) where (as,bs) = break (p . snd) l
decode_attr :: String -> String
decode_attr cs = concatMap cvt (decode_text cs)
where cvt (TxtBit x) = x
cvt (CRefBit x) = case cref_to_char x of
Just c -> [c]
Nothing -> '&' : x ++ ";"
data Txt = TxtBit String | CRefBit String deriving Show
decode_text :: [Char] -> [Txt]
decode_text xs@('&' : cs) = case break (';' ==) cs of
(as,_:bs) -> CRefBit as : decode_text bs
_ -> [TxtBit xs]
decode_text [] = []
decode_text cs = let (as,bs) = break ('&' ==) cs
in TxtBit as : decode_text bs
cref_to_char :: [Char] -> Maybe Char
cref_to_char cs = case cs of
'#' : ds -> num_esc ds
"lt" -> Just '<'
"gt" -> Just '>'
"amp" -> Just '&'
"apos" -> Just '\''
"quot" -> Just '"'
_ -> Nothing
num_esc :: String -> Maybe Char
num_esc cs = case cs of
'x' : ds -> check (readHex ds)
_ -> check (reads cs)
where check [(n,"")] = cvt_char n
check _ = Nothing
cvt_char :: Int -> Maybe Char
cvt_char x
| fromEnum (minBound :: Char) <= x && x <= fromEnum (maxBound::Char)
= Just (toEnum x)
| otherwise = Nothing