{-# LANGUAGE BangPatterns, RecordWildCards, CPP #-}
-- |
-- Module      : Data.Text.ICU.Break.Pure
-- Copyright   : (c) 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- String breaking functions for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The text boundary positions are found according to the rules described in
-- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
-- #14, Line Breaking Properties.  These are available at
-- <http://www.unicode.org/reports/tr29/> and
-- <http://www.unicode.org/reports/tr14/>.

module Data.Text.ICU.Break.Pure
    (
    -- * Types
      Breaker
    , Break
    , brkPrefix
    , brkBreak
    , brkSuffix
    , brkStatus
    , Line(..)
    , Data.Text.ICU.Break.Word(..)
    -- * Breaking functions
    , breakCharacter
    , breakLine
    , breakSentence
    , breakWord
    -- * Iteration
    , breaks
    , breaksRight
    ) where

import Control.DeepSeq (NFData(..))
import Data.Text (Text, empty)
import Data.Text.ICU.Break (Line, Word)
import Data.Text.ICU.Break.Types (BreakIterator(..))
import Data.Text.ICU.Internal (LocaleName, takeWord, dropWord)
import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
import qualified Data.Text.ICU.Break as IO

-- | A boundary analyser: a pure wrapper around a 'BreakIterator'.  The
-- type parameter is the type of the status attached to each 'Break'
-- produced from it.
newtype Breaker a = B (BreakIterator a)

-- | Build a 'Breaker' from one of the @IO@ iterator constructors.
--
-- The constructor is run with the requested locale and the 'empty' text,
-- and the resulting iterator is wrapped in 'B'.
--
-- NOTE(review): the use of 'unsafePerformIO' is presumably justified by the
-- fact that 'breaks' and 'breaksRight' clone the iterator before setting
-- any text on it -- confirm that nothing else mutates the wrapped iterator.
new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a
new mkIterator locale = unsafePerformIO (fmap B (mkIterator locale empty))

-- | Break a string on character boundaries.
--
-- Character boundary analysis identifies the boundaries of \"Extended
-- Grapheme Clusters\", which are groupings of codepoints that should be
-- treated as character-like units for many text operations.  Please see
-- Unicode Standard Annex #29, Unicode Text Segmentation,
-- <http://www.unicode.org/reports/tr29/> for additional information on
-- grapheme clusters and guidelines on their use.
breakCharacter :: LocaleName -> Breaker ()
breakCharacter = new IO.breakCharacter

-- | Break a string on line boundaries.
--
-- Line boundary analysis finds the positions at which a text string may be
-- broken when wrapping lines.  Punctuation and hyphenated words are handled
-- correctly.
breakLine :: LocaleName -> Breaker Line
breakLine = new IO.breakLine

-- | Break a string on sentence boundaries.
--
-- Sentence boundary analysis makes it possible to select sentences while
-- correctly interpreting periods inside numbers and abbreviations, as well
-- as trailing punctuation marks such as quotation marks and parentheses.
breakSentence :: LocaleName -> Breaker ()
breakSentence = new IO.breakSentence

-- | Break a string on word boundaries.
--
-- Word boundary analysis is what search-and-replace functions use, and what
-- text editors use to let the user select a word with a double click.  Word
-- selection interprets punctuation marks within and following words
-- correctly.  Characters that are not part of a word, such as symbols or
-- punctuation marks, have word breaks on both sides.
breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word
breakWord = new IO.breakWord

-- | A break in a string.
data Break a = Break
    { brkPrefix :: {-# UNPACK #-} !Text
      -- ^ Prefix of the current break.
    , brkBreak  :: {-# UNPACK #-} !Text
      -- ^ Text of the current break.
    , brkSuffix :: {-# UNPACK #-} !Text
      -- ^ Suffix of the current break.
    , brkStatus :: !a
      -- ^ Status of the current break (only meaningful if 'Line' or 'Word').
    } deriving (Eq, Show)

-- Only the status is forced with 'rnf': the three 'Text' fields are strict,
-- so they are already evaluated whenever the constructor itself is.
instance (NFData a) => NFData (Break a) where
    rnf (Break _ _ _ status) = rnf status

-- | Return a list of all breaks in a string, from left to right.
--
-- The wrapped iterator is cloned and the clone is given the input text
-- (presumably so that the shared 'Breaker' itself is never mutated).  The
-- walk starts at 'IO.first' and repeatedly calls 'IO.next'; every step
-- yields one 'Break' holding the prefix, the break text, the suffix and the
-- status read from the iterator at that point.
--
-- The traversal is deferred with 'unsafeInterleaveIO' until the list is
-- first demanded; once demanded, the whole traversal runs.
breaks :: Breaker a -> Text -> [Break a]
breaks (B b) t = unsafePerformIO $ do
  bi <- IO.clone b
  IO.setText bi t
  let -- Advance from the boundary at @from@; stop when the iterator is done.
      walk from = IO.next bi >>= maybe (return []) (emit from)
      -- Build the break lying between @from@ and @to@, then keep walking.
      emit from to = do
        status <- IO.getStatus bi
        let width = to - from
            rest  = dropWord from t
            brk   = Break (takeWord from t) (takeWord width rest)
                          (dropWord width rest) status
        more <- walk to
        return (brk : more)
  unsafeInterleaveIO (IO.first bi >>= walk)

-- | Return a list of all breaks in a string, from right to left.
--
-- This mirrors 'breaks': the iterator is cloned, the clone is given the
-- input text, and the walk starts at 'IO.last' and repeatedly calls
-- 'IO.previous'.  The traversal is deferred with 'unsafeInterleaveIO' until
-- the list is first demanded.
--
-- NOTE(review): the status is read after stepping back to the lower
-- boundary of each segment; confirm against the ICU documentation that this
-- is the status intended for the segment between the two boundaries.
breaksRight :: Breaker a -> Text -> [Break a]
breaksRight (B b) t = unsafePerformIO $ do
  bi <- IO.clone b
  IO.setText bi t
  let -- Step back from the boundary at @upper@; stop when the iterator is done.
      walk upper = IO.previous bi >>= maybe (return []) (emit upper)
      -- Build the break lying between @lower@ and @upper@, then keep walking.
      emit upper lower = do
        status <- IO.getStatus bi
        let width = upper - lower
            rest  = dropWord lower t
            brk   = Break (takeWord lower t) (takeWord width rest)
                          (dropWord width rest) status
        more <- walk lower
        return (brk : more)
  unsafeInterleaveIO (IO.last bi >>= walk)