-- | Substitution tables: lists of pairs of 'Text' values.
--
-- (The module name keeps its historical spelling.)

module Substituions
  ( authorSubst
  , chSubst
  , commonSubst
  , titleSubst
  ) where

import Data.Text ( Text )

------------------------------------------------------------------------------
-- Character substitutions

-- NOTE(review): the tables below are named after the "non-numeric",
-- "decimal" and "hexadecimal" notations, yet their keys are literal
-- characters. Confirm that the keys were not meant to be character
-- references written in those notations.

-- Maintenance rule: an entry added to the non-numeric table should also
-- be added to the decimal table.

-- | Characters in non-numeric notation.
chNNSubst ∷ [(Text, Text)]
chNNSubst =
  [ ("ö", "o"),  -- LATIN SMALL LETTER O WITH DIAERESIS (ö)
    ("Ä", "A"),  -- LATIN CAPITAL LETTER A WITH DIAERESIS (Ä)
    ("Ü", "U")   -- LATIN CAPITAL LETTER U WITH DIAERESIS (Ü)
  ]

-- Maintenance rule: an entry added to the decimal table should also be
-- added to the hexadecimal table.

-- | Character substitutions in decimal notation.
chDecSubst ∷ [(Text, Text)]
chDecSubst =
  [ ("á", "a"),       -- LATIN SMALL LETTER A WITH ACUTE
    ("é", "e"),       -- LATIN SMALL LETTER E WITH ACUTE
    ("í", "i"),       -- LATIN SMALL LETTER I WITH ACUTE
    ("ó", "o"),       -- LATIN SMALL LETTER O WITH ACUTE
    ("ö", "o"),       -- LATIN SMALL LETTER O WITH DIAERESIS (ö)
    ("ú", "u"),       -- LATIN SMALL LETTER U WITH ACUTE
    ("Š", "S"),       -- LATIN CAPITAL LETTER S WITH CARON
    ("š", "s"),       -- LATIN SMALL LETTER S WITH CARON
    ("λ", "lambda"),  -- GREEK SMALL LETTER LAMDA
    ("‘", ""),        -- LEFT SINGLE QUOTATION MARK
    ("’", "")         -- RIGHT SINGLE QUOTATION MARK
  ]

-- | Character substitutions in hexadecimal notation.
--
-- Letter replacements keep the case of the letter they replace.
--
-- NOTE(review): some keys occur more than once in this table (the two
-- macron I letters and HYPHEN). The repeats are kept as found; confirm
-- whether they were meant to be different notations of the same character.
--
-- NOTE(review): EM DASH maps to "." here but to "-" in 'chUnicodeSubst',
-- and DOUBLE DAGGER maps to "dagger-dagger" here but to "" there. Both are
-- left untouched; confirm which values are intended.
chHexSubst ∷ [(Text, Text)]
chHexSubst =
  [ ("Ä", "A")              -- LATIN CAPITAL LETTER A WITH DIAERESIS (Ä)
  , ("Ü", "U")              -- LATIN CAPITAL LETTER U WITH DIAERESIS (Ü)
  , ("á", "a")              -- LATIN SMALL LETTER A WITH ACUTE
  , ("é", "e")              -- LATIN SMALL LETTER E WITH ACUTE
  , ("í", "i")              -- LATIN SMALL LETTER I WITH ACUTE
  , ("ó", "o")              -- LATIN SMALL LETTER O WITH ACUTE
  , ("ú", "u")              -- LATIN SMALL LETTER U WITH ACUTE
  , ("ö", "o")              -- LATIN SMALL LETTER O WITH DIAERESIS (ö)
  , ("ü", "u")              -- LATIN SMALL LETTER U WITH DIAERESIS (ü)
  , ("Š", "S")              -- LATIN CAPITAL LETTER S WITH CARON
  , ("š", "s")              -- LATIN SMALL LETTER S WITH CARON
  , ("Ī", "I")              -- LATIN CAPITAL LETTER I WITH MACRON
    -- The small letter must map to a small "i"; being the first entry for
    -- this key, it takes precedence over the repeated entry further down.
  , ("ī", "i")              -- LATIN SMALL LETTER I WITH MACRON
  , ("Ś", "S")              -- LATIN CAPITAL LETTER S WITH ACUTE
  , ("ś", "s")              -- LATIN SMALL LETTER S WITH ACUTE
  , ("Ş", "S")              -- LATIN CAPITAL LETTER S WITH CEDILLA
  , ("ş", "s")              -- LATIN SMALL LETTER S WITH CEDILLA
  , ("Ī", "I")              -- LATIN CAPITAL LETTER I WITH MACRON
  , ("ī", "i")              -- LATIN SMALL LETTER I WITH MACRON
  , ("α", "alpha")          -- GREEK SMALL LETTER ALPHA
  , ("β", "beta")           -- GREEK SMALL LETTER BETA
  , ("γ", "gamma")          -- GREEK SMALL LETTER GAMMA
  , ("δ", "delta")          -- GREEK SMALL LETTER DELTA
  , ("ε", "epsilon")        -- GREEK SMALL LETTER EPSILON
  , ("ζ", "zeta")           -- GREEK SMALL LETTER ZETA
  , ("η", "eta")            -- GREEK SMALL LETTER ETA
  , ("θ", "theta")          -- GREEK SMALL LETTER THETA
  , ("ι", "iota")           -- GREEK SMALL LETTER IOTA
  , ("κ", "kappa")          -- GREEK SMALL LETTER KAPPA
  , ("λ", "lambda")         -- GREEK SMALL LETTER LAMDA
  , ("μ", "mu")             -- GREEK SMALL LETTER MU
  , ("ν", "nu")             -- GREEK SMALL LETTER NU
  , ("ξ", "xi")             -- GREEK SMALL LETTER XI
  , ("ο", "omicron")        -- GREEK SMALL LETTER OMICRON
  , ("π", "pi")             -- GREEK SMALL LETTER PI
  , ("ρ", "rho")            -- GREEK SMALL LETTER RHO
  , ("ς", "sigma")          -- GREEK SMALL LETTER FINAL SIGMA
  , ("σ", "sigma")          -- GREEK SMALL LETTER SIGMA
  , ("τ", "tau")            -- GREEK SMALL LETTER TAU
  , ("υ", "upsilon")        -- GREEK SMALL LETTER UPSILON
  , ("φ", "phi")            -- GREEK SMALL LETTER PHI
  , ("χ", "chi")            -- GREEK SMALL LETTER CHI
  , ("ψ", "psi")            -- GREEK SMALL LETTER PSI
  , ("ω", "omega")          -- GREEK SMALL LETTER OMEGA
  , ("‐", "-")              -- HYPHEN
  , ("–", "-")              -- EN DASH
  , ("—", ".")              -- EM DASH
  , ("‘", "")               -- LEFT SINGLE QUOTATION MARK
  , ("’", "")               -- RIGHT SINGLE QUOTATION MARK
  , ("‚", "")               -- SINGLE LOW-9 QUOTATION MARK
  , ("‛", "")               -- SINGLE HIGH-REVERSED-9 QUOTATION MARK
  , ("“", "")               -- LEFT DOUBLE QUOTATION MARK
  , ("”", "")               -- RIGHT DOUBLE QUOTATION MARK
  , ("„", "")               -- DOUBLE LOW-9 QUOTATION MARK
  , ("‟", "")               -- DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  , ("†", "dagger")         -- DAGGER
  , ("‡", "dagger-dagger")  -- DOUBLE DAGGER
  , ("•", "")               -- BULLET
  , ("…", "")               -- HORIZONTAL ELLIPSIS
  , ("⊃", "")               -- SUPERSET OF
  , ("⌝", "")               -- TOP RIGHT CORNER
  , ("‐", "-")              -- HYPHEN
  ]

-- | Character substitutions in Unicode notation.
--
-- Each entry is annotated with the code point and the Unicode name of its
-- key. Letter replacements keep the case of the letter they replace.
chUnicodeSubst ∷ [(Text, Text)]
chUnicodeSubst =
  [ ("\t", "")       -- U+0009 CHARACTER TABULATION
  , ("\n", "")       -- U+000A LINE FEED (LF)
  , ("\f", "")       -- U+000C FORM FEED (FF)
  , ("\r", "")       -- U+000D CARRIAGE RETURN (CR)
  , (" ", "-")       -- U+0020 SPACE
  , ("!", "")        -- U+0021 EXCLAMATION MARK
  , ("\"", "")       -- U+0022 QUOTATION MARK
  , ("#", "")        -- U+0023 NUMBER SIGN
  , ("$", "")        -- U+0024 DOLLAR SIGN
  , ("%", "")        -- U+0025 PERCENT SIGN
  , ("&", "")        -- U+0026 AMPERSAND
  , ("'", "")        -- U+0027 APOSTROPHE
  , ("(", "")        -- U+0028 LEFT PARENTHESIS
  , (")", "")        -- U+0029 RIGHT PARENTHESIS
  , ("*", "")        -- U+002A ASTERISK
  , ("+", "")        -- U+002B PLUS SIGN
  , (",", "")        -- U+002C COMMA
  , (".", "")        -- U+002E FULL STOP
  , ("/", "")        -- U+002F SOLIDUS
  , (":", "")        -- U+003A COLON
  , (";", "")        -- U+003B SEMICOLON
  , ("<", "")        -- U+003C LESS-THAN SIGN
  , ("=", "")        -- U+003D EQUALS SIGN
  , (">", "")        -- U+003E GREATER-THAN SIGN
  , ("?", "")        -- U+003F QUESTION MARK
  , ("@", "")        -- U+0040 COMMERCIAL AT
  , ("[", "")        -- U+005B LEFT SQUARE BRACKET
  , ("\\", "")       -- U+005C REVERSE SOLIDUS
  , ("]", "")        -- U+005D RIGHT SQUARE BRACKET
  , ("^", "")        -- U+005E CIRCUMFLEX ACCENT
  , ("_", "-")       -- U+005F LOW LINE
  , ("`", "")        -- U+0060 GRAVE ACCENT
  , ("{", "")        -- U+007B LEFT CURLY BRACKET
  , ("|", "")        -- U+007C VERTICAL LINE
  , ("}", "")        -- U+007D RIGHT CURLY BRACKET
  , ("~", "")        -- U+007E TILDE
  , ("¡", "")        -- U+00A1 INVERTED EXCLAMATION MARK
  , ("¬", "")        -- U+00AC NOT SIGN
  , ("²", "2")       -- U+00B2 SUPERSCRIPT TWO
  , ("³", "3")       -- U+00B3 SUPERSCRIPT THREE
  , ("¹", "1")       -- U+00B9 SUPERSCRIPT ONE
  , ("À", "A")       -- U+00C0 LATIN CAPITAL LETTER A WITH GRAVE
  , ("Á", "A")       -- U+00C1 LATIN CAPITAL LETTER A WITH ACUTE
  , ("Â", "A")       -- U+00C2 LATIN CAPITAL LETTER A WITH CIRCUMFLEX
  , ("Ã", "A")       -- U+00C3 LATIN CAPITAL LETTER A WITH TILDE
  , ("Ä", "A")       -- U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
  , ("Å", "A")       -- U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE
  , ("Æ", "AE")      -- U+00C6 LATIN CAPITAL LETTER AE
  , ("Ç", "C")       -- U+00C7 LATIN CAPITAL LETTER C WITH CEDILLA
  , ("È", "E")       -- U+00C8 LATIN CAPITAL LETTER E WITH GRAVE
  , ("É", "E")       -- U+00C9 LATIN CAPITAL LETTER E WITH ACUTE
  , ("Ê", "E")       -- U+00CA LATIN CAPITAL LETTER E WITH CIRCUMFLEX
  , ("Ë", "E")       -- U+00CB LATIN CAPITAL LETTER E WITH DIAERESIS
  , ("Í", "I")       -- U+00CD LATIN CAPITAL LETTER I WITH ACUTE
  , ("Î", "I")       -- U+00CE LATIN CAPITAL LETTER I WITH CIRCUMFLEX
  , ("Ï", "I")       -- U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
  , ("Ñ", "N")       -- U+00D1 LATIN CAPITAL LETTER N WITH TILDE
  , ("Ó", "O")       -- U+00D3 LATIN CAPITAL LETTER O WITH ACUTE
  , ("Ô", "O")       -- U+00D4 LATIN CAPITAL LETTER O WITH CIRCUMFLEX
  , ("Õ", "O")       -- U+00D5 LATIN CAPITAL LETTER O WITH TILDE
  , ("Ú", "U")       -- U+00DA LATIN CAPITAL LETTER U WITH ACUTE
  , ("Û", "U")       -- U+00DB LATIN CAPITAL LETTER U WITH CIRCUMFLEX
  , ("Ü", "U")       -- U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
  , ("Ö", "O")       -- U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
  , ("×", "")        -- U+00D7 MULTIPLICATION SIGN
  , ("Ø", "O")       -- U+00D8 LATIN CAPITAL LETTER O WITH STROKE
  , ("Ù", "U")       -- U+00D9 LATIN CAPITAL LETTER U WITH GRAVE
  , ("ß", "ss")      -- U+00DF LATIN SMALL LETTER SHARP S
  , ("à", "a")       -- U+00E0 LATIN SMALL LETTER A WITH GRAVE
  , ("á", "a")       -- U+00E1 LATIN SMALL LETTER A WITH ACUTE
  , ("â", "a")       -- U+00E2 LATIN SMALL LETTER A WITH CIRCUMFLEX
  , ("ã", "a")       -- U+00E3 LATIN SMALL LETTER A WITH TILDE
  , ("ä", "a")       -- U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
  , ("å", "a")       -- U+00E5 LATIN SMALL LETTER A WITH RING ABOVE
  , ("æ", "ae")      -- U+00E6 LATIN SMALL LETTER AE
  , ("ç", "c")       -- U+00E7 LATIN SMALL LETTER C WITH CEDILLA
  , ("è", "e")       -- U+00E8 LATIN SMALL LETTER E WITH GRAVE
  , ("é", "e")       -- U+00E9 LATIN SMALL LETTER E WITH ACUTE
  , ("ê", "e")       -- U+00EA LATIN SMALL LETTER E WITH CIRCUMFLEX
  , ("ë", "e")       -- U+00EB LATIN SMALL LETTER E WITH DIAERESIS
  , ("í", "i")       -- U+00ED LATIN SMALL LETTER I WITH ACUTE
  , ("î", "i")       -- U+00EE LATIN SMALL LETTER I WITH CIRCUMFLEX
  , ("ï", "i")       -- U+00EF LATIN SMALL LETTER I WITH DIAERESIS
  , ("ñ", "n")       -- U+00F1 LATIN SMALL LETTER N WITH TILDE
  , ("ò", "o")       -- U+00F2 LATIN SMALL LETTER O WITH GRAVE
  , ("ó", "o")       -- U+00F3 LATIN SMALL LETTER O WITH ACUTE
  , ("ô", "o")       -- U+00F4 LATIN SMALL LETTER O WITH CIRCUMFLEX
  , ("õ", "o")       -- U+00F5 LATIN SMALL LETTER O WITH TILDE
  , ("ö", "o")       -- U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
  , ("ø", "o")       -- U+00F8 LATIN SMALL LETTER O WITH STROKE
  , ("ù", "u")       -- U+00F9 LATIN SMALL LETTER U WITH GRAVE
  , ("ú", "u")       -- U+00FA LATIN SMALL LETTER U WITH ACUTE
  , ("û", "u")       -- U+00FB LATIN SMALL LETTER U WITH CIRCUMFLEX
  , ("ü", "u")       -- U+00FC LATIN SMALL LETTER U WITH DIAERESIS
  , ("þ", "t")       -- U+00FE LATIN SMALL LETTER THORN
  , ("ÿ", "y")       -- U+00FF LATIN SMALL LETTER Y WITH DIAERESIS
  , ("Ă", "A")       -- U+0102 LATIN CAPITAL LETTER A WITH BREVE
  , ("ă", "a")       -- U+0103 LATIN SMALL LETTER A WITH BREVE
  , ("Ą", "A")       -- U+0104 LATIN CAPITAL LETTER A WITH OGONEK
  , ("ą", "a")       -- U+0105 LATIN SMALL LETTER A WITH OGONEK
  , ("Ć", "C")       -- U+0106 LATIN CAPITAL LETTER C WITH ACUTE
  , ("ć", "c")       -- U+0107 LATIN SMALL LETTER C WITH ACUTE
  , ("č", "c")       -- U+010D LATIN SMALL LETTER C WITH CARON
  , ("Ę", "E")       -- U+0118 LATIN CAPITAL LETTER E WITH OGONEK
  , ("ę", "e")       -- U+0119 LATIN SMALL LETTER E WITH OGONEK
  , ("Ł", "L")       -- U+0141 LATIN CAPITAL LETTER L WITH STROKE
  , ("ł", "l")       -- U+0142 LATIN SMALL LETTER L WITH STROKE
  , ("Ń", "N")       -- U+0143 LATIN CAPITAL LETTER N WITH ACUTE
  , ("ń", "n")       -- U+0144 LATIN SMALL LETTER N WITH ACUTE
  , ("ņ", "n")       -- U+0146 LATIN SMALL LETTER N WITH CEDILLA
  , ("Ő", "O")       -- U+0150 LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
  , ("ő", "o")       -- U+0151 LATIN SMALL LETTER O WITH DOUBLE ACUTE
  , ("Œ", "OE")      -- U+0152 LATIN CAPITAL LIGATURE OE
  , ("œ", "oe")      -- U+0153 LATIN SMALL LIGATURE OE
  , ("ř", "r")       -- U+0159 LATIN SMALL LETTER R WITH CARON
  , ("Ś", "S")       -- U+015A LATIN CAPITAL LETTER S WITH ACUTE
  , ("ś", "s")       -- U+015B LATIN SMALL LETTER S WITH ACUTE
  , ("Š", "S")       -- U+0160 LATIN CAPITAL LETTER S WITH CARON
  , ("š", "s")       -- U+0161 LATIN SMALL LETTER S WITH CARON
  , ("ū", "u")       -- U+016B LATIN SMALL LETTER U WITH MACRON
  , ("Ű", "U")       -- U+0170 LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
  , ("ű", "u")       -- U+0171 LATIN SMALL LETTER U WITH DOUBLE ACUTE
  , ("Ÿ", "Y")       -- U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
  , ("Ź", "Z")       -- U+0179 LATIN CAPITAL LETTER Z WITH ACUTE
  , ("ź", "z")       -- U+017A LATIN SMALL LETTER Z WITH ACUTE
  , ("Ż", "Z")       -- U+017B LATIN CAPITAL LETTER Z WITH DOT ABOVE
  , ("ż", "z")       -- U+017C LATIN SMALL LETTER Z WITH DOT ABOVE
  , ("Ž", "Z")       -- U+017D LATIN CAPITAL LETTER Z WITH CARON
  , ("ž", "z")       -- U+017E LATIN SMALL LETTER Z WITH CARON
  , ("Ș", "S")       -- U+0218 LATIN CAPITAL LETTER S WITH COMMA BELOW
  , ("ș", "s")       -- U+0219 LATIN SMALL LETTER S WITH COMMA BELOW
  , ("Ț", "T")       -- U+021A LATIN CAPITAL LETTER T WITH COMMA BELOW
  , ("ț", "t")       -- U+021B LATIN SMALL LETTER T WITH COMMA BELOW
  , ("Ω", "Omega")   -- U+03A9 GREEK CAPITAL LETTER OMEGA
  , ("α", "alpha")   -- U+03B1 GREEK SMALL LETTER ALPHA
  , ("β", "beta")    -- U+03B2 GREEK SMALL LETTER BETA
  , ("γ", "gamma")   -- U+03B3 GREEK SMALL LETTER GAMMA
  , ("δ", "delta")   -- U+03B4 GREEK SMALL LETTER DELTA
  , ("ε", "epsilon") -- U+03B5 GREEK SMALL LETTER EPSILON
  , ("ζ", "zeta")    -- U+03B6 GREEK SMALL LETTER ZETA
  , ("η", "eta")     -- U+03B7 GREEK SMALL LETTER ETA
  , ("θ", "theta")   -- U+03B8 GREEK SMALL LETTER THETA
  , ("ι", "iota")    -- U+03B9 GREEK SMALL LETTER IOTA
  , ("κ", "kappa")   -- U+03BA GREEK SMALL LETTER KAPPA
  , ("λ", "lambda")  -- U+03BB GREEK SMALL LETTER LAMDA
  , ("μ", "mu")      -- U+03BC GREEK SMALL LETTER MU
  , ("ν", "nu")      -- U+03BD GREEK SMALL LETTER NU
  , ("ξ", "xi")      -- U+03BE GREEK SMALL LETTER XI
  , ("ο", "omicron") -- U+03BF GREEK SMALL LETTER OMICRON
  , ("π", "pi")      -- U+03C0 GREEK SMALL LETTER PI
  , ("ρ", "rho")     -- U+03C1 GREEK SMALL LETTER RHO
  , ("σ", "sigma")   -- U+03C3 GREEK SMALL LETTER SIGMA
  , ("ς", "sigma")   -- U+03C2 GREEK SMALL LETTER FINAL SIGMA
  , ("τ", "tau")     -- U+03C4 GREEK SMALL LETTER TAU
  , ("υ", "upsilon") -- U+03C5 GREEK SMALL LETTER UPSILON
  , ("φ", "phi")     -- U+03C6 GREEK SMALL LETTER PHI
  , ("χ", "chi")     -- U+03C7 GREEK SMALL LETTER CHI
  , ("ψ", "psi")     -- U+03C8 GREEK SMALL LETTER PSI
  , ("ω", "omega")   -- U+03C9 GREEK SMALL LETTER OMEGA
  , ("ẞ", "SS")      -- U+1E9E LATIN CAPITAL LETTER SHARP S
  , ("–", "-")       -- U+2013 EN DASH
  , ("—", "-")       -- U+2014 EM DASH
  , ("‘", "")        -- U+2018 LEFT SINGLE QUOTATION MARK
  , ("’", "")        -- U+2019 RIGHT SINGLE QUOTATION MARK
  , ("‚", "")        -- U+201A SINGLE LOW-9 QUOTATION MARK
  , ("‛", "")        -- U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
  , ("“", "")        -- U+201C LEFT DOUBLE QUOTATION MARK
  , ("”", "")        -- U+201D RIGHT DOUBLE QUOTATION MARK
  , ("„", "")        -- U+201E DOUBLE LOW-9 QUOTATION MARK
  , ("‟", "")        -- U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  , ("‡", "")        -- U+2021 DOUBLE DAGGER
  , ("™", "")        -- U+2122 TRADE MARK SIGN
  , ("ℵ", "aleph")   -- U+2135 ALEF SYMBOL
  , ("ℶ", "beth")    -- U+2136 BET SYMBOL
  , ("�", "")        -- U+FFFD REPLACEMENT CHARACTER
  ]

-- | All the character substitutions.
-- NB that the substitutions are not commutative.
--
-- The list is the concatenation, in this order, of the hexadecimal,
-- non-numeric, decimal and Unicode tables.
chSubst ∷ [(Text, Text)]
chSubst = chHexSubst ++ chNNSubst ++ chDecSubst ++ chUnicodeSubst

------------------------------------------------------------------------------
-- Author and title substitutions

-- | Substitutions shared by authors and titles.
--
-- Every key is a sequence of two code points, spelled out in the comment
-- of its entry. The keys are written with escapes so that the two code
-- points cannot be silently merged into the single accented letter by an
-- editor or a re-encoding tool (which would, for instance, turn the fourth
-- entry into one erasing every @ö@).
--
-- NOTE(review): the sequences look like UTF-8 bytes of the accented letter
-- decoded with a single-byte encoding; confirm against the inputs that
-- motivated them.
commonSubst ∷ [(Text, Text)]
commonSubst =
  [ ("\x00C3\x00A1", "a")  -- U+00C3 and U+00A1 (LATIN SMALL LETTER A WITH ACUTE)
  , ("\x00C3\x00A9", "e")  -- U+00C3 and U+00A9 (LATIN SMALL LETTER E WITH ACUTE)
  , ("\x00C3\x00AD", "i")  -- U+00C3 and U+00AD (LATIN SMALL LETTER I WITH ACUTE)
    -- We erase this sequence because it follows an `o` in the examples we
    -- know.
  , ("\x00C3\x00B6", "")   -- U+00C3 and U+00B6 (LATIN SMALL LETTER O WITH DIAERESIS)
  , ("\x00C5\x203A", "s")  -- U+00C5 and U+203A (LATIN SMALL LETTER S WITH ACUTE)
  ]

------------------------------------------------------------------------------
-- Author substitutions

-- | Substitutions for authors: the space after a comma is dropped and
-- @" and"@ becomes a comma.
authorSubst ∷ [(Text, Text)]
authorSubst =
  [ (", ", ",")
  , (" and", ",")
  ]

------------------------------------------------------------------------------
-- Title substitutions

-- | Substitutions for titles.
--
-- These substitutions should be done before converting to lower case.
--
-- NOTE(review): several pairs have identical components (e.g. @"C"@,
-- @"CC"@, @"J"@, @"Modus ponens"@, @"S-P"@, @"3"@) and @("P ", "P")@ occurs
-- twice. This suggests that markup originally present in the keys has been
-- lost; the pairs are kept exactly as found. Verify against the inputs
-- they were written for.
titleSubst ∷ [(Text, Text)]
titleSubst =
  [ ("P ", "P")
  , ("0 ", "0")
  , ("C", "C")
  , ("CC", "CC")
  , ("I ", "I")
  , ("J", "J")
  , ("Modus ponens", "Modus ponens")
  , ("P ", "P")
  , ("really ", "really")
  , ("S-P", "S-P")
  , ("3", "3")
    -- The whitespace around `+` is not the standard one.
    -- NOTE(review): check that the literal below still contains that
    -- non-standard whitespace.
    -- TODO (2017-07-04): Add a test case.
  , (" + ", "plus")
  , ("ω", "omega")
  , ("$\\alpha$", "alpha")
  , ("$\\beta$", "beta")
  , ("$\\gamma$", "gamma")
  , ("$\\epsilon$", "epsilon")
  , ("$\\eta$", "eta")
  , ("$\\lambda$", "lambda")
  , ("$\\pi$", "pi")
  , ("$\\omega$", "omega")
  , ("{\\sc Coq}", "Coq")
  , ("{\\sf Haskell}:", "Haskell")
  , ("{\\sc QuickSpec}:", "QuickSpec")
  , ("{\\sc QuodLibet}!", "QuodLibet")
  , ("{\\sc Vampire}", "Vampire")
  ]