Safe Haskell	Safe-Inferred
Language	Haskell2010

Brassica.SoundChange.Tokenise

Contents

Components
High-level interface
Lower-level functions

Synopsis

data Component a
- = Word a
- | Separator String
- | Gloss String
getWords :: [Component a] -> [a]
splitMultipleResults :: String -> Component [a] -> [Component a]
tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWords :: [String] -> String -> Either (ParseErrorBundle String Void) [Component PWord]
detokeniseWords' :: (a -> String) -> [Component a] -> String
detokeniseWords :: [Component PWord] -> String
concatWithBoundary :: PWord -> String
findFirstCategoriesDecl :: SoundChanges c [Grapheme] -> [String]
withFirstCategoriesDecl :: ([String] -> t) -> SoundChanges c [Grapheme] -> t
wordParser :: [Char] -> [String] -> ParsecT Void String Identity PWord
componentsParser :: ParsecT Void String Identity a -> ParsecT Void String Identity [Component a]
sortByDescendingLength :: [[a]] -> [[a]]

Components

data Component a Source #

Represents a component of a tokenised input string. Words in the input are represented as the type parameter a — which for this reason will usually, though not always, be PWord.

Constructors

Word a
Separator String
Gloss String

Instances

Instances details

Foldable Component Source #
Instance details Defined in Brassica.SoundChange.Tokenise Methods fold :: Monoid m => Component m -> m # foldMap :: Monoid m => (a -> m) -> Component a -> m # foldMap' :: Monoid m => (a -> m) -> Component a -> m # foldr :: (a -> b -> b) -> b -> Component a -> b # foldr' :: (a -> b -> b) -> b -> Component a -> b # foldl :: (b -> a -> b) -> b -> Component a -> b # foldl' :: (b -> a -> b) -> b -> Component a -> b # foldr1 :: (a -> a -> a) -> Component a -> a # foldl1 :: (a -> a -> a) -> Component a -> a # toList :: Component a -> [a] # null :: Component a -> Bool # length :: Component a -> Int # elem :: Eq a => a -> Component a -> Bool # maximum :: Ord a => Component a -> a # minimum :: Ord a => Component a -> a # sum :: Num a => Component a -> a # product :: Num a => Component a -> a #
Traversable Component Source #
Instance details Defined in Brassica.SoundChange.Tokenise Methods traverse :: Applicative f => (a -> f b) -> Component a -> f (Component b) # sequenceA :: Applicative f => Component (f a) -> f (Component a) # mapM :: Monad m => (a -> m b) -> Component a -> m (Component b) # sequence :: Monad m => Component (m a) -> m (Component a) #
Functor Component Source #
Instance details Defined in Brassica.SoundChange.Tokenise Methods fmap :: (a -> b) -> Component a -> Component b # (<$) :: a -> Component b -> Component a #
Generic (Component a) Source #
Instance details Defined in Brassica.SoundChange.Tokenise Associated Types type Rep (Component a) :: Type -> Type # Methods from :: Component a -> Rep (Component a) x # to :: Rep (Component a) x -> Component a #
Show a => Show (Component a) Source #
Instance details Defined in Brassica.SoundChange.Tokenise Methods showsPrec :: Int -> Component a -> ShowS # show :: Component a -> String # showList :: [Component a] -> ShowS #
NFData a => NFData (Component a) Source #
Instance details Defined in Brassica.SoundChange.Tokenise Methods rnf :: Component a -> () #
Eq a => Eq (Component a) Source #
Instance details Defined in Brassica.SoundChange.Tokenise Methods (==) :: Component a -> Component a -> Bool # (/=) :: Component a -> Component a -> Bool #
type Rep (Component a) Source #
Instance details Defined in Brassica.SoundChange.Tokenise type Rep (Component a) = D1 ('MetaData "Component" "Brassica.SoundChange.Tokenise" "brassica-0.2.0-6DYGqWgsRcQ5pGt1m6P3TU" 'False) (C1 ('MetaCons "Word" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 a)) :+: (C1 ('MetaCons "Separator" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 String)) :+: C1 ('MetaCons "Gloss" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 String))))

getWords :: [Component a] -> [a] Source #

Given a tokenised input string, return only the Words within it.

splitMultipleResults :: String -> Component [a] -> [Component a] Source #

Given a Component containing multiple values in a Word, split it apart into a list of Components in which the given String is used as a Separator between multiple results.

For instance:

>>> splitMultipleResults " " (Word ["abc", "def", "ghi"])
[Word "abc", Separator " ", Word "def", Separator " ", Word "ghi"]

>>> splitMultipleResults " " (Word ["abc"])
[Word "abc"]

High-level interface

tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord Source #

Tokenise a String input word into a PWord by splitting it up into Graphemes. A list of available multigraphs is supplied as the first argument.

Note that this tokeniser is greedy: if one of the given multigraphs is a prefix of another, the tokeniser will prefer the longest if possible. If there are no matching multigraphs starting at a particular character in the String, tokeniseWord will treat that character as its own Grapheme. For instance:

>>> tokeniseWord [] "cherish"
Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "s",GMulti "h"]

>>> tokeniseWord ["e","h","i","r","s","sh"] "cherish"
Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]

>>> tokeniseWord ["c","ch","e","h","i","r","s","sh"] "cherish"
Right [GMulti "ch",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]

tokeniseWords :: [String] -> String -> Either (ParseErrorBundle String Void) [Component PWord] Source #

Given a list of available multigraphs, tokenise an input string into a list of words and other Components. This uses the same tokenisation strategy as tokeniseWords, but also recognises Glosses (in square brackets) and Separators (in the form of whitespace).

detokeniseWords' :: (a -> String) -> [Component a] -> String Source #

Given a function to convert Components to strings, converts a list of Components to strings.

detokeniseWords :: [Component PWord] -> String Source #

Specialisation of detokeniseWords' for PWords, converting words to strings using concatWithBoundary.

concatWithBoundary :: PWord -> String Source #

Render a PWord as a String. Very much like concat, but treating GBoundarys specially. Word-external boundaries are deleted, while word-internal boundaries are converted to "#".

findFirstCategoriesDecl :: SoundChanges c [Grapheme] -> [String] Source #

Given a list of sound changes, extract the list of multigraphs defined in the first categories declaration of the SoundChanges.

withFirstCategoriesDecl :: ([String] -> t) -> SoundChanges c [Grapheme] -> t Source #

CPS'd form of findFirstCategoriesDecl. Nice for doing things like withFirstCategoriesDecl tokeniseWords changes words (to tokenise using the graphemes from the first categories declaration) and so on.

Lower-level functions

wordParser :: [Char] -> [String] -> ParsecT Void String Identity PWord Source #

Megaparsec parser for PWords — see tokeniseWord documentation for details on the parsing strategy and the meaning of the second parameter. For most usecases tokeniseWord should suffice; wordParser itself is only really useful in unusual situations (e.g. as part of a larger parser). The first parameter gives a list of characters (aside from whitespace) which should be excluded from words, i.e. the parser will stop if any of them are found. The second gives a list of multigraphs which might be expected.

Note: the second parameter must be sortByDescendingLength-ed; otherwise multigraphs will not be parsed correctly.

componentsParser :: ParsecT Void String Identity a -> ParsecT Void String Identity [Component a] Source #

Megaparsec parser for Components. Similarly to wordParser, usually it’s easier to use tokeniseWords instead.

sortByDescendingLength :: [[a]] -> [[a]] Source #