{-# LANGUAGE OverloadedStrings #-}
module Kurita.Prompt.Filter where
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as Char8
import Data.Set (Set)
import qualified Data.Set as Set
import Language.Phonetic.DoubleMetaphone
data BadEntry = BadEntry {
badEntryWord :: Text
, badEntryMetaphone :: Text
} deriving (Eq, Ord, Show)
unreasonableMetaphone :: Set BadEntry -> Text -> Set BadEntry
unreasonableMetaphone =
unreasonableMatches (\entry input -> badEntryMetaphone entry `Text.isInfixOf` input)
unreasonableWord :: Set BadEntry -> Text -> Set BadEntry
unreasonableWord =
unreasonableMatches (\entry input -> badEntryWord entry `Text.isInfixOf` input)
unreasonableMatches :: (BadEntry -> Text -> Bool) -> Set BadEntry -> Text -> Set BadEntry
unreasonableMatches p badTokens input =
Set.foldr go Set.empty badTokens
where
go entry matches =
if p entry input
then Set.insert entry matches
else matches
readWordSet :: String -> IO (Set BadEntry)
readWordSet wordSetFile = do
Set.fromList . filterEmptyEntries . fmap makeBadEntry <$> readWordFile
where
readWordFile = Char8.split '\n' <$> Char8.readFile wordSetFile
filterEmptyEntries = filter (not . isEmptyEntry)
makeBadEntry :: ByteString -> BadEntry
makeBadEntry word =
BadEntry
(Encoding.decodeUtf8 word)
(Encoding.decodeUtf8 . fst $ doubleMetaphone word)
isEmptyEntry :: BadEntry -> Bool
isEmptyEntry entry =
Text.null (badEntryWord entry) || Text.null (badEntryMetaphone entry)