-- | Decoding and encoding of HTML character entity references.
--
-- 'decode' replaces named references (@&amp;lt;@, @&amp;eacute;@, ...) and
-- numeric references (@&amp;#65;@, @&amp;#x41;@) by the characters they denote.
-- 'encode' escapes the markup-significant characters and everything outside
-- Latin-1 so that the result can be embedded in HTML text.
module HtmlEntities(decode,encode) where

import Data.Char(isDigit,isHexDigit,isSpace,chr,ord)
import Numeric(readDec,readHex)

-- | Replace every recognised entity reference in the input by its character.
--
-- A reference starts at @&@ and runs up to the next @;@ or whitespace
-- character.  A terminating @;@ is consumed; terminating whitespace is kept.
-- Anything that is not a recognised reference (unknown name, malformed or
-- out-of-range number, no terminator before the end of the input) is copied
-- to the output unchanged.
decode :: String -> String
decode [] = []
decode ('&':cs) =
  case break (\c -> c == ';' || isSpace c) cs of
    -- No terminator before the end of the input: keep the '&' literally.
    (_, "") -> '&' : decode cs
    -- Nothing between the '&' and the terminator: keep the '&' literally.
    ("", _) -> '&' : decode cs
    (ent, rest) ->
      case decodeEntity ent of
        -- Not a known entity: emit the text verbatim, terminator included.
        ""  -> '&' : ent ++ decode rest
        txt -> txt ++ decode (dropTerminator rest)
  where
    -- Only a ';' terminator belongs to the reference; whitespace is content.
    dropTerminator (';':rest') = rest'
    dropTerminator rest'       = rest'
decode (c:cs) = c : decode cs

-- | Translate the text between @&@ and the terminator into the character it
-- denotes.  Returns @""@ when the text is not a recognised reference, which
-- tells 'decode' to leave the original text untouched.
decodeEntity :: String -> String
decodeEntity ent =
  case ent of
    -- Hexadecimal reference; HTML accepts both "&#x" and "&#X".
    '#':x:ds | x == 'x' || x == 'X' -> numericEntity isHexDigit readHex ds
    -- Decimal reference.
    '#':ds                          -> numericEntity isDigit readDec ds
    _                               -> maybe "" (:[]) (lookup ent namedEntities)

-- | Decode the digits of a numeric character reference.
--
-- The digits are parsed as an 'Integer', so arbitrarily long inputs cannot
-- overflow, and the value is range-checked before calling the partial 'chr'.
-- Empty digit strings, stray characters and code points beyond the last
-- 'Char' all yield @""@ instead of raising an exception.
numericEntity :: (Char -> Bool) -> ReadS Integer -> String -> String
numericEntity isValidDigit parse ds
  | not (null ds) && all isValidDigit ds
  , [(n, "")] <- parse ds
  , n <= toInteger (ord maxBound) = [chr (fromInteger n)]
  | otherwise = ""

-- | The named entities understood by 'decode': the markup escapes, the
-- Latin-1 set, and a handful of common typographic characters.
namedEntities :: [(String, Char)]
namedEntities =
  [ ("lt",     '<')
  , ("gt",     '>')
  , ("amp",    '&')
  , ("quot",   '"')
  , ("nbsp",   '\160') -- no-break space
  , ("iexcl",  '\161') -- inverted exclamation mark
  , ("cent",   '\162') -- cent sign
  , ("pound",  '\163') -- pound sterling sign
  , ("curren", '\164') -- general currency sign
  , ("yen",    '\165') -- yen sign
  , ("brvbar", '\166') -- broken (vertical) bar
  , ("sect",   '\167') -- section sign
  , ("uml",    '\168') -- umlaut (dieresis)
  , ("copy",   '\169') -- copyright sign
  , ("ordf",   '\170') -- ordinal indicator, feminine
  , ("laquo",  '\171') -- angle quotation mark, left
  , ("not",    '\172') -- not sign
  , ("shy",    '\173') -- soft hyphen
  , ("reg",    '\174') -- registered sign
  , ("macr",   '\175') -- macron
  , ("deg",    '\176') -- degree sign
  , ("plusmn", '\177') -- plus-or-minus sign
  , ("sup2",   '\178') -- superscript two
  , ("sup3",   '\179') -- superscript three
  , ("acute",  '\180') -- acute accent
  , ("micro",  '\181') -- micro sign
  , ("para",   '\182') -- pilcrow (paragraph sign)
  , ("middot", '\183') -- middle dot
  , ("cedil",  '\184') -- cedilla
  , ("sup1",   '\185') -- superscript one
  , ("ordm",   '\186') -- ordinal indicator, masculine
  , ("raquo",  '\187') -- angle quotation mark, right
  , ("frac14", '\188') -- fraction one-quarter
  , ("frac12", '\189') -- fraction one-half
  , ("frac34", '\190') -- fraction three-quarters
  , ("iquest", '\191') -- inverted question mark
  , ("Agrave", '\192') -- capital A, grave accent
  , ("Aacute", '\193') -- capital A, acute accent
  , ("Acirc",  '\194') -- capital A, circumflex accent
  , ("Atilde", '\195') -- capital A, tilde
  , ("Auml",   '\196') -- capital A, dieresis or umlaut mark
  , ("Aring",  '\197') -- capital A, ring
  , ("AElig",  '\198') -- capital AE diphthong (ligature)
  , ("Ccedil", '\199') -- capital C, cedilla
  , ("Egrave", '\200') -- capital E, grave accent
  , ("Eacute", '\201') -- capital E, acute accent
  , ("Ecirc",  '\202') -- capital E, circumflex accent
  , ("Euml",   '\203') -- capital E, dieresis or umlaut mark
  , ("Igrave", '\204') -- capital I, grave accent
  , ("Iacute", '\205') -- capital I, acute accent
  , ("Icirc",  '\206') -- capital I, circumflex accent
  , ("Iuml",   '\207') -- capital I, dieresis or umlaut mark
  , ("ETH",    '\208') -- capital Eth, Icelandic
  , ("Ntilde", '\209') -- capital N, tilde
  , ("Ograve", '\210') -- capital O, grave accent
  , ("Oacute", '\211') -- capital O, acute accent
  , ("Ocirc",  '\212') -- capital O, circumflex accent
  , ("Otilde", '\213') -- capital O, tilde
  , ("Ouml",   '\214') -- capital O, dieresis or umlaut mark
  , ("times",  '\215') -- multiply sign
  , ("Oslash", '\216') -- capital O, slash
  , ("Ugrave", '\217') -- capital U, grave accent
  , ("Uacute", '\218') -- capital U, acute accent
  , ("Ucirc",  '\219') -- capital U, circumflex accent
  , ("Uuml",   '\220') -- capital U, dieresis or umlaut mark
  , ("Yacute", '\221') -- capital Y, acute accent
  , ("THORN",  '\222') -- capital THORN, Icelandic
  , ("szlig",  '\223') -- small sharp s, German (sz ligature)
  , ("agrave", '\224') -- small a, grave accent
  , ("aacute", '\225') -- small a, acute accent
  , ("acirc",  '\226') -- small a, circumflex accent
  , ("atilde", '\227') -- small a, tilde
  , ("auml",   '\228') -- small a, dieresis or umlaut mark
  , ("aring",  '\229') -- small a, ring
  , ("aelig",  '\230') -- small ae diphthong (ligature)
  , ("ccedil", '\231') -- small c, cedilla
  , ("egrave", '\232') -- small e, grave accent
  , ("eacute", '\233') -- small e, acute accent
  , ("ecirc",  '\234') -- small e, circumflex accent
  , ("euml",   '\235') -- small e, dieresis or umlaut mark
  , ("igrave", '\236') -- small i, grave accent
  , ("iacute", '\237') -- small i, acute accent
  , ("icirc",  '\238') -- small i, circumflex accent
  , ("iuml",   '\239') -- small i, dieresis or umlaut mark
  , ("eth",    '\240') -- small eth, Icelandic
  , ("ntilde", '\241') -- small n, tilde
  , ("ograve", '\242') -- small o, grave accent
  , ("oacute", '\243') -- small o, acute accent
  , ("ocirc",  '\244') -- small o, circumflex accent
  , ("otilde", '\245') -- small o, tilde
  , ("ouml",   '\246') -- small o, dieresis or umlaut mark
  , ("divide", '\247') -- divide sign
  , ("oslash", '\248') -- small o, slash
  , ("ugrave", '\249') -- small u, grave accent
  , ("uacute", '\250') -- small u, acute accent
  , ("ucirc",  '\251') -- small u, circumflex accent
  , ("uuml",   '\252') -- small u, dieresis or umlaut mark
  , ("yacute", '\253') -- small y, acute accent
  , ("thorn",  '\254') -- small thorn, Icelandic
  , ("yuml",   '\255') -- small y, dieresis or umlaut mark
  , ("euro",   '\8364') -- Euro sign
  , ("lsquo",  '\8216') -- left single quotation mark
  , ("rsquo",  '\8217') -- right single quotation mark
  , ("ldquo",  '\8220') -- left double quotation mark
  , ("rdquo",  '\8221') -- right double quotation mark
  , ("hellip", '\8230') -- horizontal ellipsis
  , ("ndash",  '\8211') -- en dash
  ]

-- | Escape a string for inclusion in HTML text: the markup-significant
-- characters become named references and every character above @\\255@
-- becomes a decimal numeric reference.
encode :: String -> String
encode = concatMap encodeEntity

-- | Escape a single character (see 'encode').
encodeEntity :: Char -> String
encodeEntity c =
  case c of
    '&' -> "&amp;"
    '<' -> "&lt;"
    '>' -> "&gt;"
    _ | c > '\255' -> "&#" ++ show (ord c) ++ ";" -- assume 8-bit Latin1 output!!
      | otherwise  -> [c]

{-
-- Should probably use Utils.isSpace' instead of isSpace
collapseSpace s =
  case s of
    "" -> ""
    c:cs | isSpace c -> ' ':collapseSpace (dropWhile isSpace cs)
         | otherwise -> c:collapseSpace cs
-}