{-# LANGUAGE ViewPatterns #-}
-- TODO : learn how to use Functional Morphology instead
-- |Simple default rules for English morphology
module NLP.Minimorph.English where

#if !(MIN_VERSION_base(4,11,0))
  -- this is redundant starting with base-4.11 / GHC 8.4
import Data.Semigroup
#endif

import           Data.Char (isSpace, isUpper, toLower)
import           Data.Text (Text)
import qualified Data.Text as T

import NLP.Minimorph.Util

-- ---------------------------------------------------------------------
-- ** Punctuation
-- ---------------------------------------------------------------------

-- | No Oxford commas, alas.
--
-- > commas "and" "foo bar"       == "foo and bar"
-- > commas "and" "foo, bar, baz" == "foo, bar and baz"
commas :: Text -> [Text] -> Text
commas :: Text -> [Text] -> Text
commas Text
_ []  = Text
""
commas Text
_ [Text
x] = Text
x
commas Text
et [Text]
xs = Text -> [Text] -> Text
T.intercalate Text
", " ([Text] -> [Text]
forall a. [a] -> [a]
init [Text]
xs) Text -> Text -> Text
<+> Text
et Text -> Text -> Text
<+> [Text] -> Text
forall a. [a] -> a
last [Text]
xs

-- ---------------------------------------------------------------------
-- ** Numbers
-- ---------------------------------------------------------------------

-- | > cardinal 0 == "zero"
--   > cardinal 1 == "one"
--   > cardinal 2 == "two"
--   > cardinal 10 == "ten"
--   > cardinal 11 == "11"
cardinal :: Int -> Text
cardinal :: Int -> Text
cardinal Int
n = case Int
n of
    Int
0  -> Text
"zero"
    Int
1  -> Text
"one"
    Int
2  -> Text
"two"
    Int
3  -> Text
"three"
    Int
4  -> Text
"four"
    Int
5  -> Text
"five"
    Int
6  -> Text
"six"
    Int
7  -> Text
"seven"
    Int
8  -> Text
"eight"
    Int
9  -> Text
"nine"
    Int
10 -> Text
"ten"
    Int
_ -> Int -> Text
forall a. Show a => a -> Text
tshow Int
n

-- | > ordinalNotSpelled 1 == "1st"
--   > ordinalNotSpelled 2 == "2nd"
--   > ordinalNotSpelled 11 == "11th"
ordinalNotSpelled :: Int -> Text
ordinalNotSpelled :: Int -> Text
ordinalNotSpelled Int
k = case Int -> Int
forall a. Num a => a -> a
abs Int
k Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` Int
100 of
  Int
n | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
3 Bool -> Bool -> Bool
&& Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
21 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` Text
"th"
    | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` Int
10 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` Text
"st"
    | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` Int
10 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
2 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` Text
"nd"
    | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` Int
10 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
3 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` Text
"rd"
    | Bool
otherwise       -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` Text
"th"
 where
  a
num suf :: a -> Text -> Text
`suf` Text
s = a -> Text
forall a. Show a => a -> Text
tshow a
num Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
s

-- | > ordinal 1 == "first"
--   > ordinal 2 == "second"
--   > ordinal 3 == "third"
--   > ordinal 11 == "11th"
--   > ordinal 42 == "42nd"
ordinal :: Int -> Text
ordinal :: Int -> Text
ordinal Int
n = case Int
n of
    Int
0  -> Text
"zeroth"
    Int
1  -> Text
"first"
    Int
2  -> Text
"second"
    Int
3  -> Text
"third"
    Int
4  -> Text
"fourth"
    Int
5  -> Text
"fifth"
    Int
6  -> Text
"sixth"
    Int
7  -> Text
"seventh"
    Int
8  -> Text
"eighth"
    Int
9  -> Text
"ninth"
    Int
10 -> Text
"tenth"
    Int
k  -> Int -> Text
ordinalNotSpelled Int
k

-- ---------------------------------------------------------------------
-- ** Nouns and verbs
-- ---------------------------------------------------------------------

-- | Heuristics for English plural for an unknown noun.
--
-- > defaultNounPlural "egg"    == "eggs"
-- > defaultNounPlural "patch"  == "patches"
-- > defaultNounPlural "boy"    == "boys"
-- > defaultNounPlural "spy"    == "spies"
-- > defaultNounPlural "thesis" == "theses"
--
-- http://www.paulnoll.com/Books/Clear-English/English-plurals-1.html
--
-- http://en.wikipedia.org/wiki/English_plural
defaultNounPlural :: Text -> Text
defaultNounPlural :: Text -> Text
defaultNounPlural Text
x
    | Text
"is" Text -> Text -> Bool
`T.isSuffixOf` Text
x = Text
thesis
    | Text -> Bool
hasSibilantSuffix Text
x   = Text
sibilant_o
    | Text -> Bool
hasCoSuffix Text
x         = Text
sibilant_o
    | Text -> Bool
hasCySuffix Text
x         = Text
y_final
    | Text
"ff" Text -> Text -> Bool
`T.isSuffixOf` Text
x = Text
ff_final  -- quite often not the case
    | Text
"f" Text -> Text -> Bool
`T.isSuffixOf` Text
x  = Text
f_final   -- but this one as well, so both needed
    | Bool
otherwise             = Text
plain
  where
    plain :: Text
plain      = Text
x            Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"s"
    sibilant_o :: Text
sibilant_o = Text
x            Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"es"
    y_final :: Text
y_final    = Text -> Text
T.init Text
x     Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ies"
    f_final :: Text
f_final    = Text -> Text
T.init Text
x     Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ves"
    ff_final :: Text
ff_final   = Int -> Text -> Text
T.dropEnd Int
2 Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ves"
    thesis :: Text
thesis     = Int -> Text -> Text
T.dropEnd Int
2 Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"es"

-- | Heuristics for 3rd person singular and past participle
--   for an unknown regular verb. Doubling of final consonants
--   can be handled via a table of (partially) irregular verbs.
--
-- > defaultVerbStuff "walk"  == ("walks",  "walked")
-- > defaultVerbStuff "push"  == ("pushes", "pushed")
-- > defaultVerbStuff "play"  == ("plays",  "played")
-- > defaultVerbStuff "cry"   == ("cries",  "cried")
defaultVerbStuff :: Text -> (Text, Text)
defaultVerbStuff :: Text -> (Text, Text)
defaultVerbStuff Text
x
    | Text -> Bool
hasSibilantSuffix Text
x   = (Text, Text)
sibilant_o
    | Text -> Bool
hasCoSuffix Text
x         = (Text, Text)
sibilant_o
    | Text -> Bool
hasCySuffix Text
x         = (Text, Text)
y_final
    | Text
"e" Text -> Text -> Bool
`T.isSuffixOf` Text
x  = (Text, Text)
e_final
    | Bool
otherwise             = (Text, Text)
plain
  where
    plain :: (Text, Text)
plain      = (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"s"         , Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ed")
    sibilant_o :: (Text, Text)
sibilant_o = (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"es"        , Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ed")
    e_final :: (Text, Text)
e_final    = (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"s"         , Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"d")
    y_final :: (Text, Text)
y_final    = (Text -> Text
T.init Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ies", Text -> Text
T.init Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"ied")

-- | Heuristics for a possesive form for an unknown noun.
--
-- > defaultPossesive "pass"        == "pass'"
-- > defaultPossesive "SOS"         == "SOS'"
-- > defaultPossesive "Mr Blinkin'" == "Mr Blinkin's"
-- > defaultPossesive "cry"         == "cry's"
defaultPossesive :: Text -> Text
defaultPossesive :: Text -> Text
defaultPossesive Text
t =
  case Text -> Char
T.last Text
t of
    Char
's'  -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"'"
    Char
'S'  -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"'"
    Char
'\'' -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"s"
    Char
_    -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"'s"

-- ---------------------------------------------------------------------
-- ** Determiners
-- ---------------------------------------------------------------------

anNumerals :: [Text]
anNumerals :: [Text]
anNumerals = [ Text
"11", Text
"11th", Text
"18", Text
"18th" ]

-- | > indefiniteDet "dog"  == "a"
--   > indefiniteDet "egg"  == "an"
--   > indefiniteDet "ewe"  == "a"
--   > indefiniteDet "ewok" == "an"
--   > indefiniteDet "8th"  == "an"
indefiniteDet :: Text -> Text
indefiniteDet :: Text -> Text
indefiniteDet Text
t = if Text -> Bool
wantsAn Text
t then Text
"an" else Text
"a"

-- | True if the indefinite determiner for a word would normally be
--   \'an\' as opposed to \'a\'.
wantsAn :: Text -> Bool
wantsAn :: Text -> Bool
wantsAn Text
t_ =
    if Text -> Bool
startsWithAcronym Text
t_
       then Text -> Bool
acronymWantsAn Text
t_
       else Bool
useAn0 Bool -> Bool -> Bool
|| Bool
useAn1
  where
    t :: Text
t      = (Text, Text) -> Text
forall a b. (a, b) -> a
fst ((Text, Text) -> Text) -> (Text, Text) -> Text
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Text -> (Text, Text)
T.break Char -> Bool
isSep (Text -> (Text, Text)) -> Text -> (Text, Text)
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.toLower Text
t_
    useAn0 :: Bool
useAn0 = Text
t Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
anNumerals
    useAn1 :: Bool
useAn1 = case Text -> Maybe (Char, Text)
T.uncons Text
t of
                Just (Char
h, Text
"") -> Char -> Bool
isLetterWithInitialVowelSound Char
h
                Just (Char
'8',Text
_) -> Bool
True
                Just (Char
'u',Text
_) -> Text -> Bool
hasVowel_U_Prefix Text
t
                Just (Char
h, Text
_)  -> Char -> Bool
isVowel Char
h Bool -> Bool -> Bool
`butNot` Text -> Bool
hasSemivowelPrefix Text
t
                Maybe (Char, Text)
Nothing      -> Bool
False
    Bool
x butNot :: Bool -> Bool -> Bool
`butNot` Bool
y = Bool
x Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
y
    isSep :: Char -> Bool
isSep Char
c = Char -> Bool
isSpace Char
c Bool -> Bool -> Bool
|| Char
c Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ([Char]
"-" :: String)

-- | Variant of 'wantsAn' that assumes the input string is pronounced
--   one letter at a time.
--
--   > wantsAn        "x-ray" == False
--   > acronymWantsAn "x-ray" == True
--
--   Note that this won't do the right thing for words like \"SCUBA\".
--   You really have to reserve it for those separate-letter acronyms.
acronymWantsAn :: Text -> Bool
acronymWantsAn :: Text -> Bool
acronymWantsAn (Text -> Text
T.toLower -> Text
t) =
    Bool
useAn0 Bool -> Bool -> Bool
|| Bool
useAn1
  where
    useAn0 :: Bool
useAn0 = Text
t Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
anNumerals
    useAn1 :: Bool
useAn1 = case Text -> Maybe (Char, Text)
T.uncons Text
t of
                Just (Char
'8',Text
_) -> Bool
True
                Just (Char
h,Text
_)   -> Char -> Bool
isLetterWithInitialVowelSound Char
h
                Maybe (Char, Text)
Nothing      -> Bool
False

-- ---------------------------------------------------------------------
-- ** Acronyms
-- ---------------------------------------------------------------------

-- | True if all upper case from second letter and up.
--
--   > looksLikeAcronym "DNA"  == True
--   > looksLikeAcronym "tRNA" == True
--   > looksLikeAcronym "x"    == False
--   > looksLikeAcronym "DnA"  == False
looksLikeAcronym :: Text -> Bool
looksLikeAcronym :: Text -> Bool
looksLikeAcronym Text
"" = Bool
False
looksLikeAcronym Text
x = (Char -> Bool) -> Text -> Bool
T.all Char -> Bool
isUpper (if Text -> Int
T.length Text
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1 then Int -> Text -> Text
T.drop Int
1 Text
x else Text
x)

-- | True if the first word (separating on either hyphen or space)
--   looks like an acronym.
startsWithAcronym :: Text -> Bool
startsWithAcronym :: Text -> Bool
startsWithAcronym =
    Text -> Bool
looksLikeAcronym (Text -> Bool) -> (Text -> Text) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
firstWord
  where
    firstWord :: Text -> Text
firstWord = (Text, Text) -> Text
forall a b. (a, b) -> a
fst ((Text, Text) -> Text) -> (Text -> (Text, Text)) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Text -> (Text, Text)
T.break Char -> Bool
isSep
    isSep :: Char -> Bool
isSep Char
c   = Char -> Bool
isSpace Char
c Bool -> Bool -> Bool
|| Char
c Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ([Char]
"-" :: String)

-- ---------------------------------------------------------------------
-- ** Sounds
-- ---------------------------------------------------------------------

-- | Ends with a \'sh\' sound.
hasSibilantSuffix :: Text -> Bool
hasSibilantSuffix :: Text -> Bool
hasSibilantSuffix Text
x = (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> Text -> Bool
`T.isSuffixOf` Text
x) [Text
"x",Text
"s",Text
"ch",Text
"sh",Text
"z",Text
"j"]

-- | Starts with a semivowel.
hasSemivowelPrefix :: Text -> Bool
hasSemivowelPrefix :: Text -> Bool
hasSemivowelPrefix Text
ls = (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> Text -> Bool
`T.isPrefixOf` Text
ls) [Text
"y",Text
"w",Text
"eu",Text
"ewe"]

-- | Starts with a vowel-y \'U\' sound
hasVowel_U_Prefix :: Text -> Bool
hasVowel_U_Prefix :: Text -> Bool
hasVowel_U_Prefix Text
t =
    case Text -> [Char]
T.unpack Text
t of
        [Char
'u']       -> Bool
False
        [Char
'u',Char
_]     -> Bool
True
        (Char
'u':Char
c:Char
v:[Char]
_) -> Bool -> Bool
not (Char -> Bool
isConsonant Char
c Bool -> Bool -> Bool
&& Char -> Bool
isVowel Char
v)
        [Char]
_           -> Bool
False

-- | Last two letters are a consonant and \'y\'.
hasCySuffix :: Text -> Bool
hasCySuffix :: Text -> Bool
hasCySuffix (Text -> [Char]
T.unpack (Text -> [Char]) -> (Text -> Text) -> Text -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.takeEnd Int
2 -> [Char
x, Char
'y']) = Char -> Bool
isConsonant Char
x
hasCySuffix Text
_ = Bool
False

-- | Last two letters are a consonant and \'o\'.
hasCoSuffix :: Text -> Bool
hasCoSuffix :: Text -> Bool
hasCoSuffix (Text -> [Char]
T.unpack (Text -> [Char]) -> (Text -> Text) -> Text -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.takeEnd Int
2 -> [Char
x, Char
'o']) = Char -> Bool
isConsonant Char
x
hasCoSuffix Text
_ = Bool
False

-- | Is a vowel.
isVowel :: Char -> Bool
isVowel :: Char -> Bool
isVowel = (Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ([Char]
"aeiou" :: String)) (Char -> Bool) -> (Char -> Char) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
toLower

-- | Letters that when pronounced independently in English sound like they
--   begin with vowels.
--
--   > isLetterWithInitialVowelSound 'r' == True
--   > isLetterWithInitialVowelSound 'k' == False
--
--   (In the above, \'r\' is pronounced \"are\", but \'k\' is pronounced
--   \"kay\".)
isLetterWithInitialVowelSound :: Char -> Bool
isLetterWithInitialVowelSound :: Char -> Bool
isLetterWithInitialVowelSound = (Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ([Char]
"aeiofhlmnrsx" :: String)) (Char -> Bool) -> (Char -> Char) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
toLower

-- | Is a consonant.
--
--   Note that not every `Char` is either a vowel or a consonant.
--   We consider numbers, spaces and symbols to be neither vowel or consonants
isConsonant :: Char -> Bool
isConsonant :: Char -> Bool
isConsonant = (Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ([Char]
"bcdfghjklmnpqrstvwxyz" :: String)) (Char -> Bool) -> (Char -> Char) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
toLower