module Text.HTML.TagSoup.HT.Parser (
CharType,
runSoup, runSoupWithPositions, runSoupWithPositionsName,
runTag, runInnerOfTag,
) where
import Text.HTML.TagSoup.HT.Parser.Combinator
(allowFail, withDefault,
char, dropSpaces, getPos,
many, manyNull, many0toN, many1toN,
many1Satisfy, readUntil,
satisfy, string,
emit, modifyEmission, )
import qualified Text.HTML.TagSoup.HT.Parser.Combinator as Parser
import qualified Text.HTML.TagSoup.HT.PositionTag as PosTag
import qualified Text.HTML.TagSoup.HT.Tag as Tag
import qualified Text.XML.Basic.Position as Position
import qualified Text.HTML.Basic.Character as HTMLChar
import qualified Text.XML.Basic.ProcessingInstruction as PI
import qualified Text.XML.Basic.Attribute as Attr
import qualified Text.XML.Basic.Name as Name
import qualified Text.XML.Basic.Tag as TagName
import qualified Text.HTML.TagSoup.HT.Parser.Stream as Stream
import qualified Text.HTML.Basic.Entity as HTMLEntity
import qualified Control.Monad.Exception.Synchronous as Exc
import Control.Monad.Fix (MonadFix, mfix, )
import Control.Monad (mplus, msum, when, liftM, )
import Data.Monoid (Monoid, mempty, mconcat, )
import qualified Data.Map as Map
import Data.Tuple.HT (mapSnd, )
import Data.Char (isAlphaNum, isAscii, chr, ord, )
import Data.Maybe (fromMaybe, )
runTag ::
(Stream.C source, StringType sink, Show sink,
Name.Attribute name, Name.Tag name, Eq name, Show name) =>
source -> Tag.T name sink
runTag str =
let tags =
fromMaybe (error "runTag: no parse at all") $
Parser.write "string" parsePosTag str
makeError = error $
"runTag: parsing results in\n" ++
unlines (map show tags)
in case tags of
[postag] ->
let tag = PosTag.tag_ postag
in if Tag.isWarning tag
then makeError
else tag
_ -> makeError
runInnerOfTag ::
(StringType sink, Show sink,
Name.Attribute name, Name.Tag name, Eq name, Show name) =>
String -> Tag.T name sink
runInnerOfTag str = runTag $ "<"++str++">"
runSoupWithPositionsName ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name, Eq name) =>
FilePath -> source -> [PosTag.T name sink]
runSoupWithPositionsName fileName =
PosTag.concatTexts .
Parser.runIdentity .
Parser.write fileName (manyNull parsePosTag)
runSoupWithPositions ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name, Eq name) =>
source -> [PosTag.T name sink]
runSoupWithPositions =
runSoupWithPositionsName "input"
runSoup ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name, Eq name) =>
source -> [Tag.T name sink]
runSoup = map PosTag.tag_ . runSoupWithPositions
type Parser name source sink a = Parser.Full source (PosTag.T name sink) a
type ParserEmit name source sink a = Parser.Emitting source (PosTag.T name sink) a
parsePosTag ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name, Eq name) =>
Parser name source sink ()
parsePosTag = do
pos <- getPos
mplus
(do char '<'
allowFail $ withDefault
(msum $
parseSpecialTag pos :
parseProcessingTag pos :
parseCloseTag pos :
parseOpenTag pos :
[])
(do emitTag pos (Tag.Text $ stringFromChar '<')
emitWarning pos "A '<', that is not part of a tag. Encode it as < please."))
(parseText pos)
parseOpenTag ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name) =>
Position.T -> Parser name source sink ()
parseOpenTag pos =
do name <- parseName
allowFail $
do dropSpaces
emittingTag pos (Tag.Open name) $
modifyEmission (restrictSoup 10) $ many parseAttribute
withDefault
(do closePos <- getPos
string "/>"
allowFail $ emitTag closePos (Tag.Close name))
(do junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen
(not $ null junk)
junkPos ("Junk in opening tag: \"" ++ junk ++ "\""))
("Unterminated open tag \"" ++ Name.toString name ++ "\"") ">")
parseCloseTag ::
(Stream.C source, Name.Tag name) =>
Position.T -> Parser name source sink ()
parseCloseTag pos =
do char '/'
name <- parseName
allowFail $
do emitTag pos (Tag.Close name)
dropSpaces
junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen
(not $ null junk)
junkPos ("Junk in closing tag: \"" ++ junk ++"\""))
("Unterminated closing tag \"" ++ Name.toString name ++"\"") ">"
parseSpecialTag ::
(Stream.C source, Name.Tag name) =>
Position.T -> Parser name source sink ()
parseSpecialTag pos =
do char '!'
msum $
(do string "--"
allowFail $ readUntilTerm
(\ cmt -> emitTag pos (Tag.Comment cmt))
"Unterminated comment" "-->") :
(do string TagName.cdataString
allowFail $ readUntilTerm
(\ cdata -> emitTag pos (Tag.Special (Name.fromString TagName.cdataString) cdata))
"Unterminated cdata" "]]>") :
(do name <- parseName
allowFail $
do dropSpaces
readUntilTerm
(\ info -> emitTag pos (Tag.Special name info))
("Unterminated special tag \"" ++ Name.toString name ++ "\"") ">") :
[]
parseProcessingTag ::
(Stream.C source, StringType sink,
Name.Attribute name, Name.Tag name, Eq name) =>
Position.T -> Parser name source sink ()
parseProcessingTag pos =
do char '?'
name <- parseName
allowFail $
do dropSpaces
emittingTag pos (Tag.Processing name) $
if Name.matchAny ["xml", "xml-stylesheet"] name
then
do attrs <- many parseAttribute
junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen (not $ null junk) junkPos
("Junk in processing info tag: \"" ++ junk ++ "\""))
("Unterminated processing info tag \"" ++ Name.toString name ++ "\"") "?>"
return $ PI.Known attrs
else readUntilTerm (return . PI.Unknown)
"Unterminated processing instruction" "?>"
parseText ::
(Stream.C source, StringType sink) =>
Position.T -> Parser name source sink ()
parseText pos =
emittingTag pos Tag.Text (parseCharAsString (const True))
parseAttribute ::
(Stream.C source, StringType sink, Name.Attribute name) =>
Parser name source sink (Attr.T name sink)
parseAttribute =
parseName >>= \name -> allowFail $
do dropSpaces
value <-
withDefault
(string "=" >> allowFail (dropSpaces >> parseValue))
(return mempty)
dropSpaces
return $ Attr.Cons name value
parseName ::
(Stream.C source, Name.C pname) =>
Parser name source sink pname
parseName =
liftM Name.fromString $
many1Satisfy (\c -> isAlphaNum c && isAscii c || c `elem` "_-.:")
parseValue ::
(Stream.C source, StringType sink) =>
ParserEmit name source sink sink
parseValue =
(msum $
parseQuoted "Unterminated doubly quoted value string" '"' :
parseQuoted "Unterminated singly quoted value string" '\'' :
[])
`withDefault`
parseUnquotedValueAsString
parseUnquotedValueChar ::
(Stream.C source) =>
ParserEmit name source String String
parseUnquotedValueChar =
let parseValueChar =
do pos <- getPos
str <- parseUnicodeChar (not . flip elem " >\"\'")
let wrong = filter (not . isValidValueChar) str
allowFail $
emitWarningWhen (not (null wrong)) pos $
"Illegal characters in unquoted value: " ++ wrong
return str
in liftM concat $ many parseValueChar
parseUnquotedValueHTMLChar ::
(Stream.C source) =>
ParserEmit name source [HTMLChar.T] [HTMLChar.T]
parseUnquotedValueHTMLChar =
let parseValueChar =
do pos <- getPos
hc <- parseHTMLChar (not . flip elem " >\"\'")
allowFail $ mapM_ (checkUnquotedChar pos) hc
return hc
in liftM concat $ many parseValueChar
checkUnquotedChar :: Position.T -> HTMLChar.T -> ParserEmit name source sink ()
checkUnquotedChar pos x =
case x of
HTMLChar.Unicode c ->
emitWarningWhen (not (isValidValueChar c)) pos $
"Illegal characters in unquoted value: '" ++ c : "'"
_ -> return ()
isValidValueChar :: Char -> Bool
isValidValueChar c = isAlphaNum c || c `elem` "_-:."
parseQuoted ::
(Stream.C source, StringType sink) =>
String -> Char -> Parser name source sink sink
parseQuoted termMsg quote =
char quote >>
(allowFail $
do str <- parseString (quote/=)
withDefault
(char quote >> return ())
(do termPos <- getPos
emitWarning termPos termMsg)
return str)
readUntilTerm ::
(Stream.C source) =>
(String -> ParserEmit name source sink a) -> String -> String -> ParserEmit name source sink a
readUntilTerm generateTag termWarning termPat =
do ~(termFound,str) <- readUntil termPat
result <- generateTag str
termPos <- getPos
emitWarningWhen (not termFound) termPos termWarning
return result
class CharType char where
fromChar :: Char -> char
parseChar :: (Stream.C source) => (Char -> Bool) -> Parser name source sink [char]
parseUnquotedValue :: (Stream.C source) => ParserEmit name source [char] [char]
instance CharType Char where
fromChar = id
parseChar = parseUnicodeChar
parseUnquotedValue = parseUnquotedValueChar
instance CharType HTMLChar.T where
fromChar = HTMLChar.Unicode
parseChar = parseHTMLChar
parseUnquotedValue = parseUnquotedValueHTMLChar
class Monoid sink => StringType sink where
stringFromChar :: Char -> sink
parseCharAsString ::
(Stream.C source) =>
(Char -> Bool) -> Parser name source sink sink
parseUnquotedValueAsString ::
(Stream.C source) =>
ParserEmit name source sink sink
instance CharType char => StringType [char] where
stringFromChar c = [fromChar c]
parseCharAsString = parseChar
parseUnquotedValueAsString = parseUnquotedValue
parseString ::
(Stream.C source, StringType sink) =>
(Char -> Bool) -> ParserEmit name source sink sink
parseString p = liftM mconcat $ many (parseCharAsString p)
parseUnicodeChar ::
(Stream.C source) =>
(Char -> Bool) -> Parser name source sink String
parseUnicodeChar p =
do pos <- getPos
x <- parseHTMLChar p
allowFail $ liftM concat $
mapM (htmlCharToString pos) x
htmlCharToString ::
Position.T -> HTMLChar.T -> ParserEmit name source sink String
htmlCharToString pos x =
let returnChar c = return $ c:[]
in case x of
HTMLChar.Unicode c -> returnChar c
HTMLChar.CharRef num -> returnChar (chr num)
HTMLChar.EntityRef name ->
maybe
(let refName = '&':name++";"
in emitWarning pos ("Unknown HTML entity " ++ refName) >>
return refName)
returnChar
(Map.lookup name HTMLEntity.mapNameToChar)
parseHTMLChar ::
(Stream.C source) =>
(Char -> Bool) -> Parser name source sink [HTMLChar.T]
parseHTMLChar p =
do pos <- getPos
c <- satisfy p
allowFail $
if c=='&'
then
withDefault
(do ent <-
mplus
(do char '#'
digits <- allowFail $ many0toN 10 (satisfy isAlphaNum)
Exc.switch
(\e ->
allowFail (emitWarning pos ("Error in numeric entity: " ++ e)) >>
return (map HTMLChar.fromUnicode ('&':'#':digits)))
(return . (:[]) . HTMLChar.CharRef . ord)
(HTMLEntity.numberToChar digits))
(liftM ((:[]) . HTMLChar.EntityRef) $
many1toN 10 (satisfy isAlphaNum))
char ';'
return ent)
(emitWarning pos "Non-terminated entity reference" >>
return [HTMLChar.Unicode '&'])
else return [HTMLChar.Unicode c]
restrictSoup :: Int -> [PosTag.T name sink] -> [PosTag.T name sink]
restrictSoup n =
uncurry (++) .
mapSnd
(\rest ->
case rest of
(PosTag.Cons pos _) : _ ->
[PosTag.Cons pos (Tag.Warning "further warnings suppressed")]
_ -> []) .
splitAt n
emitWarningWhen :: Bool -> Position.T -> String -> ParserEmit name source sink ()
emitWarningWhen cond pos msg =
when cond $ emitWarning pos msg
emitWarning :: Position.T -> String -> ParserEmit name source sink ()
emitWarning pos msg = emitTag pos (Tag.Warning msg)
emitTag :: Position.T -> Tag.T name sink -> ParserEmit name source sink ()
emitTag p t = emit (PosTag.cons p t)
emittingTag ::
(MonadFix fail) =>
Position.T ->
(a -> Tag.T name sink) ->
Parser.T source [PosTag.T name sink] fail a ->
Parser.T source [PosTag.T name sink] fail ()
emittingTag pos f x =
mfix (\r -> emit (PosTag.cons pos (f r)) >> x) >> return ()