src/NLP/Stemmer/Cistem.hs

{-# LANGUAGE OverloadedStrings #-}

module NLP.Stemmer.Cistem (stem,stemCaseInsensitive) where

import Data.Char
import Data.Text as T

-- | Guess the word stem. This module uses the CISTEM algorithm, published by L. Weißweiler and A. Fraser in "Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers" (2017).
stem :: Text -> Text
stem t =
    let firstUpper = isUpper (T.head t)
    in postpare $ loop firstUpper $ prepare t

-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
stemCaseInsensitive :: Text -> Text
stemCaseInsensitive t = postpare $ loop False $ prepare t

loop u t | T.length t <= 3 = t
         | (T.length t > 5) && (["em","er","nd"] `isSuffixOf'` t) = loop u (stripSuffix' ["em","er","nd"] t)
         | not u && ("t" `isSuffixOf` t) = loop u (stripSuffix' ["t"] t)
         | ["e","s","n"] `isSuffixOf'` t = loop u (stripSuffix' ["e","s","n"] t)
         | otherwise = t
                                           
       
prepare :: Text -> Text
prepare =
    replace "ü" "u" .
    replace "ö" "o" .
    replace "ä" "a" .
    replace "ß" "ss" .
    replace "sch" "$" .
    replace "ie" "&" .
    replace "ei" "%" .
    replxx .
    stripge .
    T.toLower

postpare :: Text -> Text
postpare =
    replace "%" "ei" .
    replace "&" "ie" .
    replace "$" "sch" .
    replxxback
     
replxx :: Text -> Text
replxx = snd . mapAccumL f '\0'
    where f prev curr | prev == curr = (curr,'*')
                      | otherwise = (curr,curr)

replxxback :: Text -> Text
replxxback = snd . mapAccumL f '\0'
    where f prev '*' = (prev,prev)
          f prev curr = (curr,curr)

stripge :: Text -> Text
stripge t | T.length t >= 6 =
              case stripPrefix "ge" t of
                Nothing -> t
                Just t -> t
          | otherwise = t

isSuffixOf' [] _ = False
isSuffixOf' (s:ss) t = (s `isSuffixOf` t) || (ss `isSuffixOf'` t)

stripSuffix' :: [Text] -> Text -> Text
stripSuffix' [] hay = hay
stripSuffix' (suff:ss) hay =
    case stripSuffix suff hay of
      Just t -> t
      Nothing -> stripSuffix' ss hay