{-# LANGUAGE DeriveDataTypeable #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Data.CharSet.Unicode.Category
-- Copyright   :  (c) Edward Kmett 2010-2012
-- License     :  BSD3
-- Maintainer  :  ekmett@gmail.com
-- Stability   :  experimental
-- Portability :  DeriveDataTypeable
--
-- Provides unicode general categories, which are typically connoted by
-- @\\p{Ll}@ or @\\p{Modifier_Letter}@. Lookups can be constructed using 'categories'
-- or individual character sets can be used directly.
--
-- A case, @_@ and @-@ insensitive lookup is provided by 'lookupCategory'
-- and can be used to provide behavior similar to that of Perl or PCRE.
-------------------------------------------------------------------------------

module Data.CharSet.Unicode.Category
    (
    -- * Unicode General Category
      Category(..)
    -- * Lookup
    , categories
    , lookupCategory
    , lookupCategoryCharSet
    -- * CharSets by Category
    -- ** Letter
    , modifierLetter, otherLetter, letter
    -- *** Letter\&
    , lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd
    -- ** Mark
    , nonSpacingMark, spacingCombiningMark, enclosingMark, mark
    -- ** Separator
    , space, lineSeparator, paragraphSeparator, separator
    -- ** Symbol
    , mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol
    -- ** Number
    , decimalNumber, letterNumber, otherNumber, number
    -- ** Punctuation
    , dashPunctuation, openPunctuation, closePunctuation, initialQuote
    , finalQuote, connectorPunctuation, otherPunctuation, punctuation
    -- ** Other
    , control, format, privateUse, surrogate, notAssigned, other
    ) where

import Data.Char
import Data.CharSet
import Data.Data
import Data.HashMap.Lazy (HashMap)
import qualified Data.HashMap.Lazy as HashMap

data Category = Category
    { Category -> String
categoryName :: String
    , Category -> String
categoryAbbreviation :: String
    , Category -> CharSet
categoryCharSet :: CharSet
    , Category -> String
categoryDescription :: String
    } deriving (Int -> Category -> ShowS
[Category] -> ShowS
Category -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Category] -> ShowS
$cshowList :: [Category] -> ShowS
show :: Category -> String
$cshow :: Category -> String
showsPrec :: Int -> Category -> ShowS
$cshowsPrec :: Int -> Category -> ShowS
Show, Typeable Category
Category -> DataType
Category -> Constr
(forall b. Data b => b -> b) -> Category -> Category
forall a.
Typeable a
-> (forall (c :: * -> *).
    (forall d b. Data d => c (d -> b) -> d -> c b)
    -> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
    (forall b r. Data b => c (b -> r) -> c r)
    -> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
    Typeable t =>
    (forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
    Typeable t =>
    (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
    (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
    (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
    Monad m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> Category -> u
forall u. (forall d. Data d => d -> u) -> Category -> [u]
forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> Category -> r
forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> Category -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Category -> m Category
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Category -> m Category
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Category
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Category -> c Category
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Category)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Category)
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Category -> m Category
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Category -> m Category
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Category -> m Category
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Category -> m Category
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Category -> m Category
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Category -> m Category
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Category -> u
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Category -> u
gmapQ :: forall u. (forall d. Data d => d -> u) -> Category -> [u]
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> Category -> [u]
gmapQr :: forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> Category -> r
$cgmapQr :: forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> Category -> r
gmapQl :: forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> Category -> r
$cgmapQl :: forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> Category -> r
gmapT :: (forall b. Data b => b -> b) -> Category -> Category
$cgmapT :: (forall b. Data b => b -> b) -> Category -> Category
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Category)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Category)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Category)
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Category)
dataTypeOf :: Category -> DataType
$cdataTypeOf :: Category -> DataType
toConstr :: Category -> Constr
$ctoConstr :: Category -> Constr
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Category
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Category
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Category -> c Category
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Category -> c Category
Data, Typeable)

-- \p{Letter} or \p{Mc}
categories :: [Category]
categories :: [Category]
categories =
    [ String -> String -> CharSet -> String -> Category
Category String
"Letter" String
"L" CharSet
letter String
"any kind of letter from any language."
    ,     String -> String -> CharSet -> String -> Category
Category String
"Lowercase_Letter" String
"Ll" CharSet
lowercaseLetter String
"a lowercase letter that has an uppercase variant"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Uppercase_Letter" String
"Lu" CharSet
uppercaseLetter String
"an uppercase letter that has a lowercase variant"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Titlecase_Letter" String
"Lt" CharSet
titlecaseLetter String
"a letter that appears at the start of a word when only the first letter of the word is capitalized"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Letter&" String
"L&" CharSet
letterAnd String
"a letter that exists in lowercase and uppercase variants (combination of Ll, Lu and Lt)"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Modifier_Letter" String
"Lm" CharSet
modifierLetter String
"a special character that is used like a letter"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Other_Letter" String
"Lo" CharSet
otherLetter String
"a letter or ideograph that does not have lowercase and uppercase variants"
    , String -> String -> CharSet -> String -> Category
Category String
"Mark" String
"M" CharSet
mark String
"a character intended to be combined with another character (e.g. accents, umlauts, enclosing boxes, etc.)"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Non_Spacing_Mark" String
"Mn" CharSet
nonSpacingMark String
"a character intended to be combined with another character without taking up extra space (e.g. accents, umlauts, etc.)"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Spacing_Combining_Mark" String
"Mc" CharSet
spacingCombiningMark String
"a character intended to be combined with another character that takes up extra space (vowel signs in many Eastern languages)"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Enclosing_Mark" String
"Me" CharSet
enclosingMark String
"a character that encloses the character is is combined with (circle, square, keycap, etc.)"
    , String -> String -> CharSet -> String -> Category
Category String
"Separator" String
"Z" CharSet
separator String
"any kind of whitespace or invisible separator"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Space_Separator" String
"Zs" CharSet
space String
"a whitespace character that is invisible, but does take up space"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Line_Separator" String
"Zl" CharSet
lineSeparator String
"line separator character U+2028"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Paragraph_Separator" String
"Zp" CharSet
paragraphSeparator String
"paragraph separator character U+2029"
    , String -> String -> CharSet -> String -> Category
Category String
"Symbol" String
"S" CharSet
symbol String
"math symbols, currency signs, dingbats, box-drawing characters, etc."
    ,     String -> String -> CharSet -> String -> Category
Category String
"Math_Symbol" String
"Sm" CharSet
mathSymbol String
"any mathematical symbol"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Currency_Symbol" String
"Sc" CharSet
currencySymbol String
"any currency sign"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Modifier_Symbol" String
"Sk" CharSet
modifierSymbol String
"a combining character (mark) as a full character on its own"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Other_Symbol" String
"So" CharSet
otherSymbol String
"various symbols that are not math symbols, currency signs, or combining characters"
    , String -> String -> CharSet -> String -> Category
Category String
"Number" String
"N" CharSet
number String
"any kind of numeric character in any script"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Decimal_Digit_Number" String
"Nd" CharSet
decimalNumber String
"a digit zero through nine in any script except ideographic scripts"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Letter_Number" String
"Nl" CharSet
letterNumber String
"a number that looks like a letter, such as a Roman numeral"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Other_Number" String
"No" CharSet
otherNumber String
"a superscript or subscript digit, or a number that is not a digit 0..9 (excluding numbers from ideographic scripts)"
    , String -> String -> CharSet -> String -> Category
Category String
"Punctuation" String
"P" CharSet
punctuation String
"any kind of punctuation character"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Dash_Punctuation" String
"Pd" CharSet
dashPunctuation String
"any kind of hyphen or dash"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Open_Punctuation" String
"Ps" CharSet
openPunctuation String
"any kind of opening bracket"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Close_Punctuation" String
"Pe" CharSet
closePunctuation String
"any kind of closing bracket"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Initial_Punctuation" String
"Pi" CharSet
initialQuote String
"any kind of opening quote"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Final_Punctuation" String
"Pf" CharSet
finalQuote String
"any kind of closing quote"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Connector_Punctuation" String
"Pc" CharSet
connectorPunctuation String
"a punctuation character such as an underscore that connects words"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Other_Punctuation" String
"Po" CharSet
otherPunctuation String
"any kind of punctuation character that is not a dash, bracket, quote or connector"
    , String -> String -> CharSet -> String -> Category
Category String
"Other" String
"C" CharSet
other String
"invisible control characters and unused code points"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Control" String
"Cc" CharSet
control String
"an ASCII 0x00..0x1F or Latin-1 0x80..0x9F control character"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Format" String
"Cf" CharSet
format String
"invisible formatting indicator"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Private_Use" String
"Co" CharSet
privateUse String
"any code point reserved for private use"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Surrogate" String
"Cs" CharSet
surrogate String
"one half of a surrogate pair in UTF-16 encoding"
    ,     String -> String -> CharSet -> String -> Category
Category String
"Unassigned" String
"Cn" CharSet
notAssigned String
"any code point to which no character has been assigned.properties" ]

lookupTable :: HashMap String Category
lookupTable :: HashMap String Category
lookupTable = forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HashMap.fromList
  [ (ShowS
canonicalize String
x, Category
category)
  | category :: Category
category@(Category String
l String
s CharSet
_ String
_) <- [Category]
categories
  , String
x <- [String
l,String
s]
  ]

lookupCategory :: String -> Maybe Category
lookupCategory :: String -> Maybe Category
lookupCategory String
s = forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup (ShowS
canonicalize String
s) HashMap String Category
lookupTable

lookupCategoryCharSet :: String -> Maybe CharSet
lookupCategoryCharSet :: String -> Maybe CharSet
lookupCategoryCharSet = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Category -> CharSet
categoryCharSet forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Maybe Category
lookupCategory

canonicalize :: String -> String
canonicalize :: ShowS
canonicalize String
s = case forall a b. (a -> b) -> [a] -> [b]
Prelude.map Char -> Char
toLower String
s of
  Char
'i' : Char
's' : String
xs -> ShowS
go String
xs
  String
xs -> ShowS
go String
xs
  where
    go :: ShowS
go (Char
'-':String
xs) = ShowS
go String
xs
    go (Char
'_':String
xs) = ShowS
go String
xs
    go (Char
' ':String
xs) = ShowS
go String
xs
    go (Char
x:String
xs) = Char
x forall a. a -> [a] -> [a]
: ShowS
go String
xs
    go [] = []

cat :: GeneralCategory -> CharSet
cat :: GeneralCategory -> CharSet
cat GeneralCategory
category = (Char -> Bool) -> CharSet
build ((GeneralCategory
category forall a. Eq a => a -> a -> Bool
==) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> GeneralCategory
generalCategory)

-- Letter
lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd, modifierLetter, otherLetter, letter :: CharSet
lowercaseLetter :: CharSet
lowercaseLetter = GeneralCategory -> CharSet
cat GeneralCategory
LowercaseLetter
uppercaseLetter :: CharSet
uppercaseLetter = GeneralCategory -> CharSet
cat GeneralCategory
UppercaseLetter
titlecaseLetter :: CharSet
titlecaseLetter = GeneralCategory -> CharSet
cat GeneralCategory
TitlecaseLetter
letterAnd :: CharSet
letterAnd = CharSet
lowercaseLetter
    CharSet -> CharSet -> CharSet
`union` CharSet
uppercaseLetter
    CharSet -> CharSet -> CharSet
`union` CharSet
titlecaseLetter
modifierLetter :: CharSet
modifierLetter  = GeneralCategory -> CharSet
cat GeneralCategory
ModifierLetter
otherLetter :: CharSet
otherLetter = GeneralCategory -> CharSet
cat GeneralCategory
OtherLetter
letter :: CharSet
letter
          = CharSet
letterAnd
    CharSet -> CharSet -> CharSet
`union` CharSet
modifierLetter
    CharSet -> CharSet -> CharSet
`union` CharSet
otherLetter

-- Marks
nonSpacingMark, spacingCombiningMark, enclosingMark, mark :: CharSet
nonSpacingMark :: CharSet
nonSpacingMark = GeneralCategory -> CharSet
cat GeneralCategory
NonSpacingMark
spacingCombiningMark :: CharSet
spacingCombiningMark = GeneralCategory -> CharSet
cat GeneralCategory
SpacingCombiningMark
enclosingMark :: CharSet
enclosingMark = GeneralCategory -> CharSet
cat GeneralCategory
EnclosingMark
mark :: CharSet
mark
          = CharSet
nonSpacingMark
    CharSet -> CharSet -> CharSet
`union` CharSet
spacingCombiningMark
    CharSet -> CharSet -> CharSet
`union` CharSet
enclosingMark

space, lineSeparator, paragraphSeparator, separator :: CharSet
space :: CharSet
space = GeneralCategory -> CharSet
cat GeneralCategory
Space
lineSeparator :: CharSet
lineSeparator = GeneralCategory -> CharSet
cat GeneralCategory
LineSeparator
paragraphSeparator :: CharSet
paragraphSeparator = GeneralCategory -> CharSet
cat GeneralCategory
ParagraphSeparator
separator :: CharSet
separator
          = CharSet
space
    CharSet -> CharSet -> CharSet
`union` CharSet
lineSeparator
    CharSet -> CharSet -> CharSet
`union` CharSet
paragraphSeparator

mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol :: CharSet
mathSymbol :: CharSet
mathSymbol = GeneralCategory -> CharSet
cat GeneralCategory
MathSymbol
currencySymbol :: CharSet
currencySymbol = GeneralCategory -> CharSet
cat GeneralCategory
CurrencySymbol
modifierSymbol :: CharSet
modifierSymbol = GeneralCategory -> CharSet
cat GeneralCategory
ModifierSymbol
otherSymbol :: CharSet
otherSymbol = GeneralCategory -> CharSet
cat GeneralCategory
OtherSymbol
symbol :: CharSet
symbol
          = CharSet
mathSymbol
    CharSet -> CharSet -> CharSet
`union` CharSet
currencySymbol
    CharSet -> CharSet -> CharSet
`union` CharSet
modifierSymbol
    CharSet -> CharSet -> CharSet
`union` CharSet
otherSymbol

decimalNumber, letterNumber, otherNumber, number :: CharSet
decimalNumber :: CharSet
decimalNumber = GeneralCategory -> CharSet
cat GeneralCategory
DecimalNumber
letterNumber :: CharSet
letterNumber = GeneralCategory -> CharSet
cat GeneralCategory
LetterNumber
otherNumber :: CharSet
otherNumber = GeneralCategory -> CharSet
cat GeneralCategory
OtherNumber
number :: CharSet
number
          = CharSet
decimalNumber
    CharSet -> CharSet -> CharSet
`union` CharSet
letterNumber
    CharSet -> CharSet -> CharSet
`union` CharSet
otherNumber

dashPunctuation, openPunctuation, closePunctuation, initialQuote,
  finalQuote, connectorPunctuation, otherPunctuation, punctuation :: CharSet

dashPunctuation :: CharSet
dashPunctuation = GeneralCategory -> CharSet
cat GeneralCategory
DashPunctuation
openPunctuation :: CharSet
openPunctuation = GeneralCategory -> CharSet
cat GeneralCategory
OpenPunctuation
closePunctuation :: CharSet
closePunctuation = GeneralCategory -> CharSet
cat GeneralCategory
ClosePunctuation
initialQuote :: CharSet
initialQuote = GeneralCategory -> CharSet
cat GeneralCategory
InitialQuote
finalQuote :: CharSet
finalQuote = GeneralCategory -> CharSet
cat GeneralCategory
FinalQuote
connectorPunctuation :: CharSet
connectorPunctuation  = GeneralCategory -> CharSet
cat GeneralCategory
ConnectorPunctuation
otherPunctuation :: CharSet
otherPunctuation = GeneralCategory -> CharSet
cat GeneralCategory
OtherPunctuation
punctuation :: CharSet
punctuation
          = CharSet
dashPunctuation
    CharSet -> CharSet -> CharSet
`union` CharSet
openPunctuation
    CharSet -> CharSet -> CharSet
`union` CharSet
closePunctuation
    CharSet -> CharSet -> CharSet
`union` CharSet
initialQuote
    CharSet -> CharSet -> CharSet
`union` CharSet
finalQuote
    CharSet -> CharSet -> CharSet
`union` CharSet
connectorPunctuation
    CharSet -> CharSet -> CharSet
`union` CharSet
otherPunctuation

control, format, privateUse, surrogate, notAssigned, other :: CharSet
control :: CharSet
control = GeneralCategory -> CharSet
cat GeneralCategory
Control
format :: CharSet
format = GeneralCategory -> CharSet
cat GeneralCategory
Format
privateUse :: CharSet
privateUse = GeneralCategory -> CharSet
cat GeneralCategory
PrivateUse
surrogate :: CharSet
surrogate = GeneralCategory -> CharSet
cat GeneralCategory
Surrogate
notAssigned :: CharSet
notAssigned = GeneralCategory -> CharSet
cat GeneralCategory
NotAssigned
other :: CharSet
other = CharSet
control
    CharSet -> CharSet -> CharSet
`union` CharSet
format
    CharSet -> CharSet -> CharSet
`union` CharSet
privateUse
    CharSet -> CharSet -> CharSet
`union` CharSet
surrogate
    CharSet -> CharSet -> CharSet
`union` CharSet
notAssigned