-- | Substitution tables for characters, authors and titles.
module Substituions
( authorSubst
, chSubst
, commonSubst
, titleSubst
) where
import Data.Text ( Text )
------------------------------------------------------------------------------
-- Character substitutions
-- If a new entry is added here, please also add it to the
-- character substitutions in decimal notation.
-- | Substitutions for characters written in non-numeric notation.
--
-- Each pair reads as (original, replacement).
chNNSubst ∷ [(Text, Text)]
chNNSubst =
  [ ("ö", "o")  -- U+00F6, small o with diaeresis
  , ("Ä", "A")  -- U+00C4, capital A with diaeresis
  , ("Ü", "U")  -- U+00DC, capital U with diaeresis
  ]
-- If a new entry is added here, please also add it to the
-- character substitutions in hexadecimal notation.
-- | Substitutions for characters written in decimal notation.
--
-- Each pair reads as (original, replacement); the two single quotation
-- marks are replaced by the empty text.
chDecSubst ∷ [(Text, Text)]
chDecSubst =
  [ ("á", "a")       -- U+00E1, small a with acute
  , ("é", "e")       -- U+00E9, small e with acute
  , ("í", "i")       -- U+00ED, small i with acute
  , ("ó", "o")       -- U+00F3, small o with acute
  , ("ö", "o")       -- U+00F6, small o with diaeresis
  , ("ú", "u")       -- U+00FA, small u with acute
  , ("Š", "S")       -- U+0160, capital S with caron
  , ("š", "s")       -- U+0161, small s with caron
  , ("λ", "lambda")  -- U+03BB, Greek small lamda
  , ("‘", "")        -- U+2018, left single quotation mark
  , ("’", "")        -- U+2019, right single quotation mark
  ]
-- | Substitutions for characters written in hexadecimal notation.
--
-- Each pair reads as (original, replacement).  Every key is listed exactly
-- once, so no entry can be hidden by an earlier pair with the same key.
-- Capital letters map to capitals and small letters to small letters; all
-- three dash-like characters map to @-@.
chHexSubst ∷ [(Text, Text)]
chHexSubst =
  [ ("Ä", "A")              -- LATIN CAPITAL LETTER A WITH DIAERESIS
  , ("Ü", "U")              -- LATIN CAPITAL LETTER U WITH DIAERESIS
  , ("á", "a")              -- LATIN SMALL LETTER A WITH ACUTE
  , ("é", "e")              -- LATIN SMALL LETTER E WITH ACUTE
  , ("í", "i")              -- LATIN SMALL LETTER I WITH ACUTE
  , ("ó", "o")              -- LATIN SMALL LETTER O WITH ACUTE
  , ("ú", "u")              -- LATIN SMALL LETTER U WITH ACUTE
  , ("ö", "o")              -- LATIN SMALL LETTER O WITH DIAERESIS
  , ("ü", "u")              -- LATIN SMALL LETTER U WITH DIAERESIS
  , ("Š", "S")              -- LATIN CAPITAL LETTER S WITH CARON
  , ("š", "s")              -- LATIN SMALL LETTER S WITH CARON
  , ("Ī", "I")              -- LATIN CAPITAL LETTER I WITH MACRON
  , ("ī", "i")              -- LATIN SMALL LETTER I WITH MACRON
  , ("Ś", "S")              -- LATIN CAPITAL LETTER S WITH ACUTE
  , ("ś", "s")              -- LATIN SMALL LETTER S WITH ACUTE
  , ("Ş", "S")              -- LATIN CAPITAL LETTER S WITH CEDILLA
  , ("ş", "s")              -- LATIN SMALL LETTER S WITH CEDILLA
  , ("α", "alpha")          -- GREEK SMALL LETTER ALPHA
  , ("β", "beta")           -- GREEK SMALL LETTER BETA
  , ("γ", "gamma")          -- GREEK SMALL LETTER GAMMA
  , ("δ", "delta")          -- GREEK SMALL LETTER DELTA
  , ("ε", "epsilon")        -- GREEK SMALL LETTER EPSILON
  , ("ζ", "zeta")           -- GREEK SMALL LETTER ZETA
  , ("η", "eta")            -- GREEK SMALL LETTER ETA
  , ("θ", "theta")          -- GREEK SMALL LETTER THETA
  , ("ι", "iota")           -- GREEK SMALL LETTER IOTA
  , ("κ", "kappa")          -- GREEK SMALL LETTER KAPPA
  , ("λ", "lambda")         -- GREEK SMALL LETTER LAMDA
  , ("μ", "mu")             -- GREEK SMALL LETTER MU
  , ("ν", "nu")             -- GREEK SMALL LETTER NU
  , ("ξ", "xi")             -- GREEK SMALL LETTER XI
  , ("ο", "omicron")        -- GREEK SMALL LETTER OMICRON
  , ("π", "pi")             -- GREEK SMALL LETTER PI
  , ("ρ", "rho")            -- GREEK SMALL LETTER RHO
  , ("ς", "sigma")          -- GREEK SMALL LETTER FINAL SIGMA
  , ("σ", "sigma")          -- GREEK SMALL LETTER SIGMA
  , ("τ", "tau")            -- GREEK SMALL LETTER TAU
  , ("υ", "upsilon")        -- GREEK SMALL LETTER UPSILON
  , ("φ", "phi")            -- GREEK SMALL LETTER PHI
  , ("χ", "chi")            -- GREEK SMALL LETTER CHI
  , ("ψ", "psi")            -- GREEK SMALL LETTER PSI
  , ("ω", "omega")          -- GREEK SMALL LETTER OMEGA
  , ("‐", "-")              -- HYPHEN
  , ("–", "-")              -- EN DASH
  , ("—", "-")              -- EM DASH
  , ("‘", "")               -- LEFT SINGLE QUOTATION MARK
  , ("’", "")               -- RIGHT SINGLE QUOTATION MARK
  , ("‚", "")               -- SINGLE LOW-9 QUOTATION MARK
  , ("‛", "")               -- SINGLE HIGH-REVERSED-9 QUOTATION MARK
  , ("“", "")               -- LEFT DOUBLE QUOTATION MARK
  , ("”", "")               -- RIGHT DOUBLE QUOTATION MARK
  , ("„", "")               -- DOUBLE LOW-9 QUOTATION MARK
  , ("‟", "")               -- DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  , ("†", "dagger")         -- DAGGER
  , ("‡", "dagger-dagger")  -- DOUBLE DAGGER
  , ("•", "")               -- BULLET
  , ("…", "")               -- HORIZONTAL ELLIPSIS
  , ("⊃", "")               -- SUPERSET OF
  , ("⌝", "")               -- TOP RIGHT CORNER
  ]
-- | Substitutions for characters written literally, listed in code-point
-- order.
--
-- Each pair reads as (original, replacement).  In this table the listed
-- control characters and ASCII punctuation are replaced by the empty text,
-- except that space and low line become @-@; accented Latin letters are
-- replaced by unaccented letters of the same case; Greek letters are
-- spelled out.
chUnicodeSubst ∷ [(Text, Text)]
chUnicodeSubst =
  [ ("\t", "")       -- U+0009 CHARACTER TABULATION
  , ("\n", "")       -- U+000A LINE FEED (LF)
  , ("\f", "")       -- U+000C FORM FEED (FF)
  , ("\r", "")       -- U+000D CARRIAGE RETURN (CR)
  , (" ", "-")       -- U+0020 SPACE
  , ("!", "")        -- U+0021 EXCLAMATION MARK
  , ("\"", "")       -- U+0022 QUOTATION MARK
  , ("#", "")        -- U+0023 NUMBER SIGN
  , ("$", "")        -- U+0024 DOLLAR SIGN
  , ("%", "")        -- U+0025 PERCENT SIGN
  , ("&", "")        -- U+0026 AMPERSAND
  , ("'", "")        -- U+0027 APOSTROPHE
  , ("(", "")        -- U+0028 LEFT PARENTHESIS
  , (")", "")        -- U+0029 RIGHT PARENTHESIS
  , ("*", "")        -- U+002A ASTERISK
  , ("+", "")        -- U+002B PLUS SIGN
  , (",", "")        -- U+002C COMMA
  , (".", "")        -- U+002E FULL STOP
  , ("/", "")        -- U+002F SOLIDUS
  , (":", "")        -- U+003A COLON
  , (";", "")        -- U+003B SEMICOLON
  , ("<", "")        -- U+003C LESS-THAN SIGN
  , ("=", "")        -- U+003D EQUALS SIGN
  , (">", "")        -- U+003E GREATER-THAN SIGN
  , ("?", "")        -- U+003F QUESTION MARK
  , ("@", "")        -- U+0040 COMMERCIAL AT
  , ("[", "")        -- U+005B LEFT SQUARE BRACKET
  , ("\\", "")       -- U+005C REVERSE SOLIDUS
  , ("]", "")        -- U+005D RIGHT SQUARE BRACKET
  , ("^", "")        -- U+005E CIRCUMFLEX ACCENT
  , ("_", "-")       -- U+005F LOW LINE
  , ("`", "")        -- U+0060 GRAVE ACCENT
  , ("{", "")        -- U+007B LEFT CURLY BRACKET
  , ("|", "")        -- U+007C VERTICAL LINE
  , ("}", "")        -- U+007D RIGHT CURLY BRACKET
  , ("~", "")        -- U+007E TILDE
  , ("¡", "")        -- U+00A1 INVERTED EXCLAMATION MARK
  , ("¬", "")        -- U+00AC NOT SIGN
  , ("²", "2")       -- U+00B2 SUPERSCRIPT TWO
  , ("³", "3")       -- U+00B3 SUPERSCRIPT THREE
  , ("¹", "1")       -- U+00B9 SUPERSCRIPT ONE
  , ("À", "A")       -- U+00C0 LATIN CAPITAL LETTER A WITH GRAVE
  , ("Á", "A")       -- U+00C1 LATIN CAPITAL LETTER A WITH ACUTE
  , ("Â", "A")       -- U+00C2 LATIN CAPITAL LETTER A WITH CIRCUMFLEX
  , ("Ã", "A")       -- U+00C3 LATIN CAPITAL LETTER A WITH TILDE
  , ("Ä", "A")       -- U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
  , ("Å", "A")       -- U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE
  , ("Æ", "AE")      -- U+00C6 LATIN CAPITAL LETTER AE
  , ("Ç", "C")       -- U+00C7 LATIN CAPITAL LETTER C WITH CEDILLA
  , ("È", "E")       -- U+00C8 LATIN CAPITAL LETTER E WITH GRAVE
  , ("É", "E")       -- U+00C9 LATIN CAPITAL LETTER E WITH ACUTE
  , ("Ê", "E")       -- U+00CA LATIN CAPITAL LETTER E WITH CIRCUMFLEX
  , ("Ë", "E")       -- U+00CB LATIN CAPITAL LETTER E WITH DIAERESIS
  , ("Í", "I")       -- U+00CD LATIN CAPITAL LETTER I WITH ACUTE
  , ("Î", "I")       -- U+00CE LATIN CAPITAL LETTER I WITH CIRCUMFLEX
  , ("Ï", "I")       -- U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
  , ("Ñ", "N")       -- U+00D1 LATIN CAPITAL LETTER N WITH TILDE
  , ("Ó", "O")       -- U+00D3 LATIN CAPITAL LETTER O WITH ACUTE
  , ("Ô", "O")       -- U+00D4 LATIN CAPITAL LETTER O WITH CIRCUMFLEX
  , ("Õ", "O")       -- U+00D5 LATIN CAPITAL LETTER O WITH TILDE
  , ("Ö", "O")       -- U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
  , ("×", "")        -- U+00D7 MULTIPLICATION SIGN
  , ("Ø", "O")       -- U+00D8 LATIN CAPITAL LETTER O WITH STROKE
  , ("Ù", "U")       -- U+00D9 LATIN CAPITAL LETTER U WITH GRAVE
  , ("Ú", "U")       -- U+00DA LATIN CAPITAL LETTER U WITH ACUTE
  , ("Û", "U")       -- U+00DB LATIN CAPITAL LETTER U WITH CIRCUMFLEX
  , ("Ü", "U")       -- U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
  , ("ß", "ss")      -- U+00DF LATIN SMALL LETTER SHARP S
  , ("à", "a")       -- U+00E0 LATIN SMALL LETTER A WITH GRAVE
  , ("á", "a")       -- U+00E1 LATIN SMALL LETTER A WITH ACUTE
  , ("â", "a")       -- U+00E2 LATIN SMALL LETTER A WITH CIRCUMFLEX
  , ("ã", "a")       -- U+00E3 LATIN SMALL LETTER A WITH TILDE
  , ("ä", "a")       -- U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
  , ("å", "a")       -- U+00E5 LATIN SMALL LETTER A WITH RING ABOVE
  , ("æ", "ae")      -- U+00E6 LATIN SMALL LETTER AE
  , ("ç", "c")       -- U+00E7 LATIN SMALL LETTER C WITH CEDILLA
  , ("è", "e")       -- U+00E8 LATIN SMALL LETTER E WITH GRAVE
  , ("é", "e")       -- U+00E9 LATIN SMALL LETTER E WITH ACUTE
  , ("ê", "e")       -- U+00EA LATIN SMALL LETTER E WITH CIRCUMFLEX
  , ("ë", "e")       -- U+00EB LATIN SMALL LETTER E WITH DIAERESIS
  , ("í", "i")       -- U+00ED LATIN SMALL LETTER I WITH ACUTE
  , ("î", "i")       -- U+00EE LATIN SMALL LETTER I WITH CIRCUMFLEX
  , ("ï", "i")       -- U+00EF LATIN SMALL LETTER I WITH DIAERESIS
  , ("ñ", "n")       -- U+00F1 LATIN SMALL LETTER N WITH TILDE
  , ("ò", "o")       -- U+00F2 LATIN SMALL LETTER O WITH GRAVE
  , ("ó", "o")       -- U+00F3 LATIN SMALL LETTER O WITH ACUTE
  , ("ô", "o")       -- U+00F4 LATIN SMALL LETTER O WITH CIRCUMFLEX
  , ("õ", "o")       -- U+00F5 LATIN SMALL LETTER O WITH TILDE
  , ("ö", "o")       -- U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
  , ("ø", "o")       -- U+00F8 LATIN SMALL LETTER O WITH STROKE
  , ("ù", "u")       -- U+00F9 LATIN SMALL LETTER U WITH GRAVE
  , ("ú", "u")       -- U+00FA LATIN SMALL LETTER U WITH ACUTE
  , ("û", "u")       -- U+00FB LATIN SMALL LETTER U WITH CIRCUMFLEX
  , ("ü", "u")       -- U+00FC LATIN SMALL LETTER U WITH DIAERESIS
  , ("þ", "t")       -- U+00FE LATIN SMALL LETTER THORN
  , ("ÿ", "y")       -- U+00FF LATIN SMALL LETTER Y WITH DIAERESIS
  , ("Ă", "A")       -- U+0102 LATIN CAPITAL LETTER A WITH BREVE
  , ("ă", "a")       -- U+0103 LATIN SMALL LETTER A WITH BREVE
  , ("Ą", "A")       -- U+0104 LATIN CAPITAL LETTER A WITH OGONEK
  , ("ą", "a")       -- U+0105 LATIN SMALL LETTER A WITH OGONEK
  , ("Ć", "C")       -- U+0106 LATIN CAPITAL LETTER C WITH ACUTE
  , ("ć", "c")       -- U+0107 LATIN SMALL LETTER C WITH ACUTE
  , ("č", "c")       -- U+010D LATIN SMALL LETTER C WITH CARON
  , ("Ę", "E")       -- U+0118 LATIN CAPITAL LETTER E WITH OGONEK
  , ("ę", "e")       -- U+0119 LATIN SMALL LETTER E WITH OGONEK
  , ("Ł", "L")       -- U+0141 LATIN CAPITAL LETTER L WITH STROKE
  , ("ł", "l")       -- U+0142 LATIN SMALL LETTER L WITH STROKE
  , ("Ń", "N")       -- U+0143 LATIN CAPITAL LETTER N WITH ACUTE
  , ("ń", "n")       -- U+0144 LATIN SMALL LETTER N WITH ACUTE
  , ("ņ", "n")       -- U+0146 LATIN SMALL LETTER N WITH CEDILLA
  , ("Ő", "O")       -- U+0150 LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
  , ("ő", "o")       -- U+0151 LATIN SMALL LETTER O WITH DOUBLE ACUTE
  , ("Œ", "OE")      -- U+0152 LATIN CAPITAL LIGATURE OE
  , ("œ", "oe")      -- U+0153 LATIN SMALL LIGATURE OE
  , ("ř", "r")       -- U+0159 LATIN SMALL LETTER R WITH CARON
  , ("Ś", "S")       -- U+015A LATIN CAPITAL LETTER S WITH ACUTE
  , ("ś", "s")       -- U+015B LATIN SMALL LETTER S WITH ACUTE
  , ("Š", "S")       -- U+0160 LATIN CAPITAL LETTER S WITH CARON
  , ("š", "s")       -- U+0161 LATIN SMALL LETTER S WITH CARON
  , ("ū", "u")       -- U+016B LATIN SMALL LETTER U WITH MACRON
  , ("Ű", "U")       -- U+0170 LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
  , ("ű", "u")       -- U+0171 LATIN SMALL LETTER U WITH DOUBLE ACUTE
  , ("Ÿ", "Y")       -- U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
  , ("Ź", "Z")       -- U+0179 LATIN CAPITAL LETTER Z WITH ACUTE
  , ("ź", "z")       -- U+017A LATIN SMALL LETTER Z WITH ACUTE
  , ("Ż", "Z")       -- U+017B LATIN CAPITAL LETTER Z WITH DOT ABOVE
  , ("ż", "z")       -- U+017C LATIN SMALL LETTER Z WITH DOT ABOVE
  , ("Ž", "Z")       -- U+017D LATIN CAPITAL LETTER Z WITH CARON
  , ("ž", "z")       -- U+017E LATIN SMALL LETTER Z WITH CARON
  , ("Ș", "S")       -- U+0218 LATIN CAPITAL LETTER S WITH COMMA BELOW
  , ("ș", "s")       -- U+0219 LATIN SMALL LETTER S WITH COMMA BELOW
  , ("Ț", "T")       -- U+021A LATIN CAPITAL LETTER T WITH COMMA BELOW
  , ("ț", "t")       -- U+021B LATIN SMALL LETTER T WITH COMMA BELOW
  , ("Ω", "Omega")   -- U+03A9 GREEK CAPITAL LETTER OMEGA
  , ("α", "alpha")   -- U+03B1 GREEK SMALL LETTER ALPHA
  , ("β", "beta")    -- U+03B2 GREEK SMALL LETTER BETA
  , ("γ", "gamma")   -- U+03B3 GREEK SMALL LETTER GAMMA
  , ("δ", "delta")   -- U+03B4 GREEK SMALL LETTER DELTA
  , ("ε", "epsilon") -- U+03B5 GREEK SMALL LETTER EPSILON
  , ("ζ", "zeta")    -- U+03B6 GREEK SMALL LETTER ZETA
  , ("η", "eta")     -- U+03B7 GREEK SMALL LETTER ETA
  , ("θ", "theta")   -- U+03B8 GREEK SMALL LETTER THETA
  , ("ι", "iota")    -- U+03B9 GREEK SMALL LETTER IOTA
  , ("κ", "kappa")   -- U+03BA GREEK SMALL LETTER KAPPA
  , ("λ", "lambda")  -- U+03BB GREEK SMALL LETTER LAMDA
  , ("μ", "mu")      -- U+03BC GREEK SMALL LETTER MU
  , ("ν", "nu")      -- U+03BD GREEK SMALL LETTER NU
  , ("ξ", "xi")      -- U+03BE GREEK SMALL LETTER XI
  , ("ο", "omicron") -- U+03BF GREEK SMALL LETTER OMICRON
  , ("π", "pi")      -- U+03C0 GREEK SMALL LETTER PI
  , ("ρ", "rho")     -- U+03C1 GREEK SMALL LETTER RHO
  , ("ς", "sigma")   -- U+03C2 GREEK SMALL LETTER FINAL SIGMA
  , ("σ", "sigma")   -- U+03C3 GREEK SMALL LETTER SIGMA
  , ("τ", "tau")     -- U+03C4 GREEK SMALL LETTER TAU
  , ("υ", "upsilon") -- U+03C5 GREEK SMALL LETTER UPSILON
  , ("φ", "phi")     -- U+03C6 GREEK SMALL LETTER PHI
  , ("χ", "chi")     -- U+03C7 GREEK SMALL LETTER CHI
  , ("ψ", "psi")     -- U+03C8 GREEK SMALL LETTER PSI
  , ("ω", "omega")   -- U+03C9 GREEK SMALL LETTER OMEGA
  , ("ẞ", "SS")      -- U+1E9E LATIN CAPITAL LETTER SHARP S
  , ("–", "-")       -- U+2013 EN DASH
  , ("—", "-")       -- U+2014 EM DASH
  , ("‘", "")        -- U+2018 LEFT SINGLE QUOTATION MARK
  , ("’", "")        -- U+2019 RIGHT SINGLE QUOTATION MARK
  , ("‚", "")        -- U+201A SINGLE LOW-9 QUOTATION MARK
  , ("‛", "")        -- U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
  , ("“", "")        -- U+201C LEFT DOUBLE QUOTATION MARK
  , ("”", "")        -- U+201D RIGHT DOUBLE QUOTATION MARK
  , ("„", "")        -- U+201E DOUBLE LOW-9 QUOTATION MARK
  , ("‟", "")        -- U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  , ("‡", "")        -- U+2021 DOUBLE DAGGER
  , ("™", "")        -- U+2122 TRADE MARK SIGN
  , ("ℵ", "aleph")   -- U+2135 ALEF SYMBOL
  , ("ℶ", "beth")    -- U+2136 BET SYMBOL
  , ("�", "")        -- U+FFFD REPLACEMENT CHARACTER
  ]
-- | All the character substitutions, gathered into a single table.
--
-- NB: the substitutions are not commutative, so the sub-tables are
-- concatenated in a fixed order: hexadecimal, non-numeric, decimal and
-- finally Unicode notation.
chSubst ∷ [(Text, Text)]
chSubst = concat
  [ chHexSubst
  , chNNSubst
  , chDecSubst
  , chUnicodeSubst
  ]
------------------------------------------------------------------------------
-- Author and title substitutions
-- | Substitutions shared by authors and titles.
--
-- Every key is a pair of characters; the trailing comment gives the two
-- code points of the key.
-- NOTE(review): the keys look like UTF-8 encoded letters that were decoded
-- with a single-byte encoding (e.g. U+00C3 U+00A1 for @á@) — confirm
-- against the inputs that motivated them.
commonSubst ∷ [(Text, Text)]
commonSubst =
  [ ("á", "a")        -- U+00C3, U+00A1: presumably a garbled a with acute
  , ("é", "e")        -- U+00C3, U+00A9: presumably a garbled e with acute
  , ("Ã\x00AD", "i")   -- U+00C3, U+00AD: presumably a garbled i with acute
    -- This key is erased rather than replaced by `o` because it follows
    -- an `o` in the examples we know.
  , ("ö", "")         -- U+00C3, U+00B6: presumably a garbled o with diaeresis
  , ("Å›", "s")        -- U+00C5, U+203A: presumably a garbled small s with acute
  ]
------------------------------------------------------------------------------
-- Author substitutions
-- | Substitutions for author strings.
--
-- Each pair reads as (original, replacement).
authorSubst ∷ [(Text, Text)]
authorSubst =
  [ (", ", ",")    -- a comma followed by a space becomes a bare comma
  , (" and", ",")  -- " and", including its leading space, becomes a comma
  ]
------------------------------------------------------------------------------
-- Title substitutions
-- These substitutions should be done before converting to lower case.
-- | Substitutions for title strings.
--
-- Each pair reads as (original, replacement).  The pairs are kept in their
-- original order.
-- NOTE(review): several of the leading pairs map a text to itself, and the
-- key @"P "@ is listed twice; they are reproduced verbatim here — confirm
-- whether some markup was lost from these keys.
titleSubst ∷ [(Text, Text)]
titleSubst =
  -- Specific words and symbols occurring in titles.
  [ ("P ", "P")
  , ("0 ", "0")
  , ("C", "C")
  , ("CC", "CC")
  , ("I ", "I")
  , ("J", "J")
  , ("Modus ponens", "Modus ponens")
  , ("P ", "P")
  , ("really ", "really")
  , ("S-P", "S-P")
  , ("3", "3")
    -- The whitespace around `+` is not the standard one.
    -- TODO (2017-07-04): Add a test case.
  , (" + ", "plus")
  , ("ω", "omega")
    -- Greek letters written as LaTeX inline math.
  , ("$\\alpha$", "alpha")
  , ("$\\beta$", "beta")
  , ("$\\gamma$", "gamma")
  , ("$\\epsilon$", "epsilon")
  , ("$\\eta$", "eta")
  , ("$\\lambda$", "lambda")
  , ("$\\pi$", "pi")
  , ("$\\omega$", "omega")
    -- Names wrapped in LaTeX font commands, some with trailing punctuation.
  , ("{\\sc Coq}", "Coq")
  , ("{\\sf Haskell}:", "Haskell")
  , ("{\\sc QuickSpec}:", "QuickSpec")
  , ("{\\sc QuodLibet}!", "QuodLibet")
  , ("{\\sc Vampire}", "Vampire")
  ]