module NLP.Stemmer.Cistem (stem,stemCaseInsensitive) where
import Data.Char
import Data.Text as T
stem :: Text -> Text
stem t =
let firstUpper = isUpper (T.head t)
in postpare $ loop firstUpper $ prepare t
stemCaseInsensitive :: Text -> Text
stemCaseInsensitive t = postpare $ loop False $ prepare t
loop u t | T.length t <= 3 = t
| (T.length t > 5) && (["em","er","nd"] `isSuffixOf'` t) = loop u (stripSuffix' ["em","er","nd"] t)
| not u && ("t" `isSuffixOf` t) = loop u (stripSuffix' ["t"] t)
| ["e","s","n"] `isSuffixOf'` t = loop u (stripSuffix' ["e","s","n"] t)
| otherwise = t
prepare :: Text -> Text
prepare =
replace "ü" "u" .
replace "ö" "o" .
replace "ä" "a" .
replace "ß" "ss" .
replace "sch" "$" .
replace "ie" "&" .
replace "ei" "%" .
replxx .
stripge .
T.toLower
postpare :: Text -> Text
postpare =
replace "%" "ei" .
replace "&" "ie" .
replace "$" "sch" .
replxxback
replxx :: Text -> Text
replxx = snd . mapAccumL f '\0'
where f prev curr | prev == curr = (curr,'*')
| otherwise = (curr,curr)
replxxback :: Text -> Text
replxxback = snd . mapAccumL f '\0'
where f prev '*' = (prev,prev)
f prev curr = (curr,curr)
stripge :: Text -> Text
stripge t | T.length t >= 6 =
case stripPrefix "ge" t of
Nothing -> t
Just t -> t
| otherwise = t
isSuffixOf' [] _ = False
isSuffixOf' (s:ss) t = (s `isSuffixOf` t) || (ss `isSuffixOf'` t)
stripSuffix' :: [Text] -> Text -> Text
stripSuffix' [] hay = hay
stripSuffix' (suff:ss) hay =
case stripSuffix suff hay of
Just t -> t
Nothing -> stripSuffix' ss hay