{-# LANGUAGE BangPatterns, RecordWildCards #-}
-- |
-- Module      : Data.Text.ICU.Break.Pure
-- Copyright   : (c) 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- String breaking functions for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The text boundary positions are found according to the rules described in
-- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
-- #14, Line Breaking Properties.  These are available at
-- <http://www.unicode.org/reports/tr14/> and
-- <http://www.unicode.org/reports/tr29/>.

module Data.Text.ICU.Break.Pure
    (
    -- * Types
      Breaker
    , Break
    , brkPrefix
    , brkBreak
    , brkSuffix
    , brkStatus
    , Line(..)
    , Data.Text.ICU.Break.Word(..)
    -- * Breaking functions
    , breakCharacter
    , breakLine
    , breakSentence
    , breakWord
    -- * Iteration
    , breaks
    , breaksRight
    ) where

import Control.DeepSeq (NFData(..))
import Data.Text (Text, empty)
import Data.Text.Foreign (dropWord16, takeWord16)
import Data.Text.ICU.Break (Line, Word)
import Data.Text.ICU.Break.Types (BreakIterator(..))
import Data.Text.ICU.Internal (LocaleName)
import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
import qualified Data.Text.ICU.Break as IO

-- | A boundary analyser.
newtype Breaker a = B (BreakIterator a)

new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a
new :: (LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator a)
act LocaleName
loc = IO (Breaker a) -> Breaker a
forall a. IO a -> a
unsafePerformIO (IO (Breaker a) -> Breaker a) -> IO (Breaker a) -> Breaker a
forall a b. (a -> b) -> a -> b
$ BreakIterator a -> Breaker a
forall a. BreakIterator a -> Breaker a
B (BreakIterator a -> Breaker a)
-> IO (BreakIterator a) -> IO (Breaker a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` LocaleName -> Text -> IO (BreakIterator a)
act LocaleName
loc Text
empty

-- | Break a string on character boundaries.
--
-- Character boundary analysis identifies the boundaries of "Extended
-- Grapheme Clusters", which are groupings of codepoints that should be
-- treated as character-like units for many text operations.  Please see
-- Unicode Standard Annex #29, Unicode Text Segmentation,
-- <http://www.unicode.org/reports/tr29/> for additional information on
-- grapheme clusters and guidelines on their use.
breakCharacter :: LocaleName -> Breaker ()
breakCharacter :: LocaleName -> Breaker ()
breakCharacter = (LocaleName -> Text -> IO (BreakIterator ()))
-> LocaleName -> Breaker ()
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator ())
IO.breakCharacter

-- | Break a string on line boundaries.
--
-- Line boundary analysis determines where a text string can be broken when
-- line wrapping. The mechanism correctly handles punctuation and hyphenated
-- words.
breakLine :: LocaleName -> Breaker Line
breakLine :: LocaleName -> Breaker Line
breakLine = (LocaleName -> Text -> IO (BreakIterator Line))
-> LocaleName -> Breaker Line
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator Line)
IO.breakLine

-- | Break a string on sentence boundaries.
--
-- Sentence boundary analysis allows selection with correct interpretation
-- of periods within numbers and abbreviations, and trailing punctuation
-- marks such as quotation marks and parentheses.
breakSentence :: LocaleName -> Breaker ()
breakSentence :: LocaleName -> Breaker ()
breakSentence = (LocaleName -> Text -> IO (BreakIterator ()))
-> LocaleName -> Breaker ()
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator ())
IO.breakSentence

-- | Break a string on word boundaries.
--
-- Word boundary analysis is used by search and replace functions, as well
-- as within text editing applications that allow the user to select words
-- with a double click. Word selection provides correct interpretation of
-- punctuation marks within and following words. Characters that are not
-- part of a word, such as symbols or punctuation marks, have word breaks on
-- both sides.
breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word
breakWord :: LocaleName -> Breaker Word
breakWord = (LocaleName -> Text -> IO (BreakIterator Word))
-> LocaleName -> Breaker Word
forall a.
(LocaleName -> Text -> IO (BreakIterator a))
-> LocaleName -> Breaker a
new LocaleName -> Text -> IO (BreakIterator Word)
IO.breakWord

-- | A break in a string.
data Break a = Break {
      Break a -> Text
brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break.
    , Break a -> Text
brkBreak :: {-# UNPACK #-} !Text  -- ^ Text of the current break.
    , Break a -> Text
brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break.
    , Break a -> a
brkStatus :: !a
    -- ^ Status of the current break (only meaningful if 'Line' or 'Word').
    } deriving (Break a -> Break a -> Bool
(Break a -> Break a -> Bool)
-> (Break a -> Break a -> Bool) -> Eq (Break a)
forall a. Eq a => Break a -> Break a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Break a -> Break a -> Bool
$c/= :: forall a. Eq a => Break a -> Break a -> Bool
== :: Break a -> Break a -> Bool
$c== :: forall a. Eq a => Break a -> Break a -> Bool
Eq, Int -> Break a -> ShowS
[Break a] -> ShowS
Break a -> String
(Int -> Break a -> ShowS)
-> (Break a -> String) -> ([Break a] -> ShowS) -> Show (Break a)
forall a. Show a => Int -> Break a -> ShowS
forall a. Show a => [Break a] -> ShowS
forall a. Show a => Break a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Break a] -> ShowS
$cshowList :: forall a. Show a => [Break a] -> ShowS
show :: Break a -> String
$cshow :: forall a. Show a => Break a -> String
showsPrec :: Int -> Break a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> Break a -> ShowS
Show)

instance (NFData a) => NFData (Break a) where
    rnf :: Break a -> ()
rnf Break{a
Text
brkStatus :: a
brkSuffix :: Text
brkBreak :: Text
brkPrefix :: Text
brkStatus :: forall a. Break a -> a
brkSuffix :: forall a. Break a -> Text
brkBreak :: forall a. Break a -> Text
brkPrefix :: forall a. Break a -> Text
..} = a -> ()
forall a. NFData a => a -> ()
rnf a
brkStatus

-- | Return a list of all breaks in a string, from left to right.
breaks :: Breaker a -> Text -> [Break a]
breaks :: Breaker a -> Text -> [Break a]
breaks (B BreakIterator a
b) Text
t = IO [Break a] -> [Break a]
forall a. IO a -> a
unsafePerformIO (IO [Break a] -> [Break a]) -> IO [Break a] -> [Break a]
forall a b. (a -> b) -> a -> b
$ do
  BreakIterator a
bi <- BreakIterator a -> IO (BreakIterator a)
forall a. BreakIterator a -> IO (BreakIterator a)
IO.clone BreakIterator a
b
  BreakIterator a -> Text -> IO ()
forall a. BreakIterator a -> Text -> IO ()
IO.setText BreakIterator a
bi Text
t
  let go :: I16 -> IO [Break a]
go I16
p = do
        Maybe I16
mix <- BreakIterator a -> IO (Maybe I16)
forall a. BreakIterator a -> IO (Maybe I16)
IO.next BreakIterator a
bi
        case Maybe I16
mix of
          Maybe I16
Nothing -> [Break a] -> IO [Break a]
forall (m :: * -> *) a. Monad m => a -> m a
return []
          Just I16
n -> do
            a
s <- BreakIterator a -> IO a
forall a. BreakIterator a -> IO a
IO.getStatus BreakIterator a
bi
            let d :: I16
d = I16
nI16 -> I16 -> I16
forall a. Num a => a -> a -> a
-I16
p
                u :: Text
u = I16 -> Text -> Text
dropWord16 I16
p Text
t
            (Text -> Text -> Text -> a -> Break a
forall a. Text -> Text -> Text -> a -> Break a
Break (I16 -> Text -> Text
takeWord16 I16
p Text
t) (I16 -> Text -> Text
takeWord16 I16
d Text
u) (I16 -> Text -> Text
dropWord16 I16
d Text
u) a
s Break a -> [Break a] -> [Break a]
forall a. a -> [a] -> [a]
:) ([Break a] -> [Break a]) -> IO [Break a] -> IO [Break a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` I16 -> IO [Break a]
go I16
n
  IO [Break a] -> IO [Break a]
forall a. IO a -> IO a
unsafeInterleaveIO (IO [Break a] -> IO [Break a]) -> IO [Break a] -> IO [Break a]
forall a b. (a -> b) -> a -> b
$ I16 -> IO [Break a]
go (I16 -> IO [Break a]) -> IO I16 -> IO [Break a]
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< BreakIterator a -> IO I16
forall a. BreakIterator a -> IO I16
IO.first BreakIterator a
bi

-- | Return a list of all breaks in a string, from right to left.
breaksRight :: Breaker a -> Text -> [Break a]
breaksRight :: Breaker a -> Text -> [Break a]
breaksRight (B BreakIterator a
b) Text
t = IO [Break a] -> [Break a]
forall a. IO a -> a
unsafePerformIO (IO [Break a] -> [Break a]) -> IO [Break a] -> [Break a]
forall a b. (a -> b) -> a -> b
$ do
  BreakIterator a
bi <- BreakIterator a -> IO (BreakIterator a)
forall a. BreakIterator a -> IO (BreakIterator a)
IO.clone BreakIterator a
b
  BreakIterator a -> Text -> IO ()
forall a. BreakIterator a -> Text -> IO ()
IO.setText BreakIterator a
bi Text
t
  let go :: I16 -> IO [Break a]
go I16
p = do
        Maybe I16
mix <- BreakIterator a -> IO (Maybe I16)
forall a. BreakIterator a -> IO (Maybe I16)
IO.previous BreakIterator a
bi
        case Maybe I16
mix of
          Maybe I16
Nothing -> [Break a] -> IO [Break a]
forall (m :: * -> *) a. Monad m => a -> m a
return []
          Just I16
n -> do
            a
s <- BreakIterator a -> IO a
forall a. BreakIterator a -> IO a
IO.getStatus BreakIterator a
bi
            let d :: I16
d = I16
pI16 -> I16 -> I16
forall a. Num a => a -> a -> a
-I16
n
                u :: Text
u = I16 -> Text -> Text
dropWord16 I16
n Text
t
            (Text -> Text -> Text -> a -> Break a
forall a. Text -> Text -> Text -> a -> Break a
Break (I16 -> Text -> Text
takeWord16 I16
n Text
t) (I16 -> Text -> Text
takeWord16 I16
d Text
u) (I16 -> Text -> Text
dropWord16 I16
d Text
u) a
s Break a -> [Break a] -> [Break a]
forall a. a -> [a] -> [a]
:) ([Break a] -> [Break a]) -> IO [Break a] -> IO [Break a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` I16 -> IO [Break a]
go I16
n
  IO [Break a] -> IO [Break a]
forall a. IO a -> IO a
unsafeInterleaveIO (IO [Break a] -> IO [Break a]) -> IO [Break a] -> IO [Break a]
forall a b. (a -> b) -> a -> b
$ I16 -> IO [Break a]
go (I16 -> IO [Break a]) -> IO I16 -> IO [Break a]
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< BreakIterator a -> IO I16
forall a. BreakIterator a -> IO I16
IO.last BreakIterator a
bi