{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE TupleSections #-} {-| __Warning:__ This module is __internal__, and does __not__ follow the Package Versioning Policy. It may be useful for extending Brassica, but be prepared to track development closely if you import this module. -} module Brassica.SoundChange.Frontend.Internal where import Data.Bifunctor (second) import Data.Maybe (fromMaybe, mapMaybe) import Data.Void (Void) import GHC.Generics (Generic) import Control.DeepSeq (NFData) import Text.Megaparsec (ParseErrorBundle) import Brassica.MDF (MDF, parseMDFWithTokenisation, componentiseMDF, componentiseMDFWordsOnly, duplicateEtymologies) import Brassica.SoundChange.Apply import Brassica.SoundChange.Apply.Internal (applyChangesWithLog, toPWordLog) import Brassica.SoundChange.Tokenise import Brassica.SoundChange.Types -- | Rule application mode of the SCA. data ApplicationMode = ApplyRules HighlightMode OutputMode | ReportRulesApplied deriving (Show, Eq) data HighlightMode = NoHighlight | DifferentToLastRun | DifferentToInput deriving (Show, Eq) instance Enum HighlightMode where -- used for conversion to and from C, so want control over values fromEnum NoHighlight = 0 fromEnum DifferentToLastRun = 1 fromEnum DifferentToInput = 2 toEnum 0 = NoHighlight toEnum 1 = DifferentToLastRun toEnum 2 = DifferentToInput toEnum _ = undefined data OutputMode = MDFOutput | WordsOnlyOutput | MDFOutputWithEtymons | WordsWithProtoOutput deriving (Show, Eq) instance Enum OutputMode where -- used for conversion to and from C, so want control over values fromEnum MDFOutput = 0 fromEnum WordsOnlyOutput = 1 fromEnum MDFOutputWithEtymons = 2 fromEnum WordsWithProtoOutput = 3 toEnum 0 = MDFOutput toEnum 1 = WordsOnlyOutput toEnum 2 = MDFOutputWithEtymons toEnum 3 = WordsWithProtoOutput toEnum _ = undefined data TokenisationMode = Normal | AddEtymons deriving (Show, Eq) instance Enum TokenisationMode where -- used for conversion to and from C, so want control over values fromEnum Normal = 0 fromEnum AddEtymons = 1 toEnum 0 = Normal toEnum 1 = AddEtymons toEnum _ = undefined tokenisationModeFor :: ApplicationMode -> TokenisationMode tokenisationModeFor (ApplyRules _ MDFOutputWithEtymons) = AddEtymons tokenisationModeFor _ = Normal -- | Output of a single application of rules to a wordlist: either a -- list of possibly highlighted words, an applied rules table, or a -- parse error. data ApplicationOutput a r = HighlightedWords [Component (a, Bool)] | AppliedRulesTable [PWordLog r] | ParseError (ParseErrorBundle String Void) deriving (Show, Generic, NFData) -- | Kind of input: either a raw wordlist, or an MDF file. data InputLexiconFormat = Raw | MDF deriving (Show, Eq) instance Enum InputLexiconFormat where -- used for conversion to and from C, so want control over values fromEnum Raw = 0 fromEnum MDF = 1 toEnum 0 = Raw toEnum 1 = MDF toEnum _ = undefined data ParseOutput a = ParsedRaw [Component a] | ParsedMDF (MDF [Component a]) deriving (Show, Functor) componentise :: OutputMode -> [a] -> ParseOutput a -> [Component a] componentise WordsWithProtoOutput ws (ParsedRaw cs) = intersperseWords ws cs componentise _ _ (ParsedRaw cs) = cs componentise MDFOutput _ (ParsedMDF mdf) = componentiseMDF mdf componentise MDFOutputWithEtymons _ (ParsedMDF mdf) = componentiseMDF mdf componentise WordsOnlyOutput _ (ParsedMDF mdf) = componentiseMDFWordsOnly mdf componentise WordsWithProtoOutput ws (ParsedMDF mdf) = intersperseWords ws $ componentiseMDFWordsOnly mdf intersperseWords :: [a] -> [Component a] -> [Component a] intersperseWords (w:ws) (Word c:cs) = Word w : Separator " → " : Word c : Separator "\n" : intersperseWords ws cs intersperseWords ws (_:cs) = intersperseWords ws cs intersperseWords [] cs = cs intersperseWords _ [] = [] tokeniseAccordingToInputFormat :: InputLexiconFormat -> TokenisationMode -> SoundChanges -> String -> Either (ParseErrorBundle String Void) (ParseOutput PWord) tokeniseAccordingToInputFormat Raw _ cs = fmap ParsedRaw . withFirstCategoriesDecl tokeniseWords cs tokeniseAccordingToInputFormat MDF Normal cs = fmap ParsedMDF . withFirstCategoriesDecl parseMDFWithTokenisation cs tokeniseAccordingToInputFormat MDF AddEtymons cs = fmap ParsedMDF . second (duplicateEtymologies $ ('*':) . detokeniseWords) . withFirstCategoriesDecl parseMDFWithTokenisation cs getParsedWords :: ParseOutput a -> [a] getParsedWords (ParsedRaw cs) = getWords cs getParsedWords (ParsedMDF mdf) = getWords $ componentiseMDF mdf -- | Top-level dispatcher for an interactive frontend: given a textual -- wordlist and a list of sound changes, returns the result of running -- the changes in the specified mode. parseTokeniseAndApplyRules :: SoundChanges -- ^ changes -> String -- ^ words -> InputLexiconFormat -> ApplicationMode -> Maybe [Component PWord] -- ^ previous results -> ApplicationOutput PWord Statement parseTokeniseAndApplyRules statements ws intype mode prev = let tmode = tokenisationModeFor mode in case tokeniseAccordingToInputFormat intype tmode statements ws of Left e -> ParseError e Right toks | ws' <- getParsedWords toks -> case mode of ReportRulesApplied -> AppliedRulesTable $ mapMaybe toPWordLog $ concat $ getWords $ componentise WordsOnlyOutput [] $ applyChangesWithLog statements <$> toks ApplyRules DifferentToLastRun mdfout -> let result = concatMap (splitMultipleResults " ") $ componentise mdfout (fmap pure ws') $ applyChanges statements <$> toks in HighlightedWords $ zipWithComponents result (fromMaybe [] prev) [] $ \thisWord prevWord -> (thisWord, thisWord /= prevWord) ApplyRules DifferentToInput mdfout -> HighlightedWords $ concatMap (splitMultipleResults " ") $ componentise mdfout (fmap (pure . (,False)) ws') $ applyChangesWithChanges statements <$> toks ApplyRules NoHighlight mdfout -> HighlightedWords $ (fmap.fmap) (,False) $ concatMap (splitMultipleResults " ") $ componentise mdfout (fmap pure ws') $ applyChanges statements <$> toks where -- Zips two tokenised input strings. Compared to normal 'zipWith' -- this has two special properties: -- -- * It only zips v'Word's. Any non-v'Word's in the first argument -- will be passed unaltered to the output; any in the second -- argument will be ignored. -- -- * The returned list will have the same number of elements as does -- the first argument. If a v'Word' in the first argument has no -- corresponding v'Word' in the second, the zipping function is -- called using the default @b@ value given as the third argument. -- Such a v'Word' in the second argument will simply be ignored. -- -- Note the persistent assymetry in the definition: each 'Component' -- in the first argument will be reflected in the output, but each in -- the second argument may be ignored. zipWithComponents :: [Component a] -> [Component b] -> b -> (a -> b -> c) -> [Component c] zipWithComponents [] _ _ _ = [] zipWithComponents as [] bd f = (fmap.fmap) (`f` bd) as zipWithComponents (Word a:as) (Word b:bs) bd f = Word (f a b) : zipWithComponents as bs bd f zipWithComponents as@(Word _:_) (_:bs) bd f = zipWithComponents as bs bd f zipWithComponents (a:as) bs@(Word _:_) bd f = unsafeCastComponent a : zipWithComponents as bs bd f zipWithComponents (a:as) (_:bs) bd f = unsafeCastComponent a : zipWithComponents as bs bd f unsafeCastComponent :: Component a -> Component b unsafeCastComponent (Word _) = error "unsafeCastComponent: attempted to cast a word!" unsafeCastComponent (Separator s) = Separator s unsafeCastComponent (Gloss s) = Gloss s