module NLP.Stemmer.Cistem (stem,stemCaseInsensitive,Segmentation(..),segment',segment,segment'CaseInsensitive,segmentCaseInsensitive) where
import Data.Char
import Data.Monoid
import Data.Text as T
stem :: Text -> Text
stem t =
let firstUpper = isUpper (T.head t)
in postpare $ loop firstUpper $ prepare t
stemCaseInsensitive :: Text -> Text
stemCaseInsensitive t = postpare $ loop False $ prepare t
data Segmentation = Segmentation { segPrefix :: Text, segStem :: Text, segSuffix :: Text } deriving (Show,Eq)
segment' :: Text -> Segmentation
segment' t =
let firstUpper = isUpper (T.head t)
lower = T.toLower t
prepared = segmentPrepare t
theStem = postpare $ loop firstUpper prepared
thePrefix | theStem `isPrefixOf` lower = ""
| "ge" `isPrefixOf` lower = "ge"
| otherwise = error ("segment' should be debugged; extracted stem: "++ unpack theStem)
theSuffix = T.drop (T.length thePrefix + T.length theStem) t
in Segmentation thePrefix theStem theSuffix
segment :: Text -> (Text,Text)
segment t =
let Segmentation{..} = segment' t
in (segPrefix<>segStem, segSuffix)
segmentCaseInsensitive :: Text -> (Text,Text)
segmentCaseInsensitive = segment . T.toLower
segment'CaseInsensitive :: Text -> Segmentation
segment'CaseInsensitive = segment' . T.toLower
loop u t | T.length t <= 3 = t
| (T.length t > 5) && (["em","er","nd"] `isSuffixOf'` t) = loop u (stripSuffix' ["em","er","nd"] t)
| not u && ("t" `isSuffixOf` t) = loop u (stripSuffix' ["t"] t)
| ["e","s","n"] `isSuffixOf'` t = loop u (stripSuffix' ["e","s","n"] t)
| otherwise = t
prepare :: Text -> Text
prepare =
replace "ü" "u" .
replace "ö" "o" .
replace "ä" "a" .
replace "ß" "ss" .
segmentPrepare
segmentPrepare :: Text -> Text
segmentPrepare =
replace "sch" "$" .
replace "ie" "&" .
replace "ei" "%" .
replxx .
T.toLower .
stripge
postpare :: Text -> Text
postpare =
replace "%" "ei" .
replace "&" "ie" .
replace "$" "sch" .
replxxback
replxx :: Text -> Text
replxx = snd . mapAccumL f '\0'
where f prev curr | prev == curr = (curr,'*')
| otherwise = (curr,curr)
replxxback :: Text -> Text
replxxback = snd . mapAccumL f '\0'
where f prev '*' = (prev,prev)
f prev curr = (curr,curr)
stripge :: Text -> Text
stripge t | T.length t >= 6 =
case stripPrefix "ge" t of
Nothing -> t
Just t -> t
| otherwise = t
isSuffixOf' [] _ = False
isSuffixOf' (s:ss) t = (s `isSuffixOf` t) || (ss `isSuffixOf'` t)
stripSuffix' :: [Text] -> Text -> Text
stripSuffix' [] hay = hay
stripSuffix' (suff:ss) hay =
case stripSuffix suff hay of
Just t -> t
Nothing -> stripSuffix' ss hay