Safe Haskell	None
Language	Haskell2010

Data.Text.Utf16

Description

This module provides functions that allow treating Text values as a series of UTF-16 code units instead of characters.

Synopsis

type CodeUnit = Word16
newtype CodeUnitIndex = CodeUnitIndex {
- codeUnitIndex :: Int
}
lengthUtf16 :: Text -> CodeUnitIndex
lowerUtf16 :: Text -> Text
lowerCodeUnit :: CodeUnit -> CodeUnit
upperUtf16 :: Text -> Text
upperCodeUnit :: CodeUnit -> CodeUnit
isCaseInvariant :: Text -> Bool
unpackUtf16 :: Text -> [CodeUnit]
unsafeCutUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeSliceUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeIndexUtf16 :: Text -> CodeUnitIndex -> CodeUnit
indexTextArray :: Array -> Int -> CodeUnit

Documentation

type CodeUnit = Word16 Source #

A code unit is a 16-bit integer from which UTF-16 encoded text is built up. The Text type is represented as a UTF-16 string.

newtype CodeUnitIndex Source #

An index into the raw UTF-16 data of a Text. This is not the code point index as conventionally accepted by Text, so we wrap it to avoid confusing the two. Incorrect index manipulation can lead to surrogate pairs being sliced, so manipulate indices with care. This type is also used for lengths.

Constructors

CodeUnitIndex
Fields codeUnitIndex :: Int

Instances

Bounded CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods minBound :: CodeUnitIndex # maxBound :: CodeUnitIndex #
Eq CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods (==) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (/=) :: CodeUnitIndex -> CodeUnitIndex -> Bool #
Num CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods (+) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (-) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (*) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # negate :: CodeUnitIndex -> CodeUnitIndex # abs :: CodeUnitIndex -> CodeUnitIndex # signum :: CodeUnitIndex -> CodeUnitIndex # fromInteger :: Integer -> CodeUnitIndex #
Ord CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering # (<) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (<=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex #
Show CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods showsPrec :: Int -> CodeUnitIndex -> ShowS # show :: CodeUnitIndex -> String # showList :: [CodeUnitIndex] -> ShowS #
Generic CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Associated Types type Rep CodeUnitIndex :: Type -> Type # Methods from :: CodeUnitIndex -> Rep CodeUnitIndex x # to :: Rep CodeUnitIndex x -> CodeUnitIndex #
Hashable CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods hashWithSalt :: Int -> CodeUnitIndex -> Int # hash :: CodeUnitIndex -> Int #
ToJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods toJSON :: CodeUnitIndex -> Value # toEncoding :: CodeUnitIndex -> Encoding # toJSONList :: [CodeUnitIndex] -> Value # toEncodingList :: [CodeUnitIndex] -> Encoding #
FromJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods parseJSON :: Value -> Parser CodeUnitIndex # parseJSONList :: Value -> Parser [CodeUnitIndex] #
NFData CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 Methods rnf :: CodeUnitIndex -> () #
type Rep CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf16 type Rep CodeUnitIndex = D1 (MetaData "CodeUnitIndex" "Data.Text.Utf16" "alfred-margaret-1.1.1.0-C7p4DoDIXY7azqNtjX433" True) (C1 (MetaCons "CodeUnitIndex" PrefixI True) (S1 (MetaSel (Just "codeUnitIndex") NoSourceUnpackedness NoSourceStrictness DecidedLazy) (Rec0 Int)))

lengthUtf16 :: Text -> CodeUnitIndex Source #

Return the length of the text, in number of code units.

lowerUtf16 :: Text -> Text Source #

Lowercase each individual code unit of a text without changing their index. This is not a proper case folding, but it does ensure that indices into the lowercased string correspond to indices into the original string.

Differences from toLower include code points in the BMP that lowercase to multiple code points, and code points outside of the BMP.

For example, İ (U+0130), which toLower converts to "i" (U+0069, U+0307), is converted into U+0069 only by lowerUtf16. Another example is 𑢢 (U+118A2), a code point from the Warang Citi writing system in the Supplementary Multilingual Plane, introduced in 2014 with Unicode 7. toLower would lowercase it to U+118C2, but lowerUtf16 leaves it untouched.

lowerCodeUnit :: CodeUnit -> CodeUnit Source #

Convert CodeUnits that represent a character on their own (i.e. that are not part of a surrogate pair) to their lower case representation.

This function has a special code path for ASCII characters, because Char.toLower is **incredibly** slow. It's implemented here if you want to see for yourself: (https://github.com/ghc/ghc/blob/ghc-8.6.3-release/libraries/base/cbits/WCsubst.c#L4732) (It does a binary search on 1276 casing rules)

upperUtf16 :: Text -> Text Source #

upperCodeUnit :: CodeUnit -> CodeUnit Source #

isCaseInvariant :: Text -> Bool Source #

Return whether the text is the same in lowercase as in uppercase. This function never returns true for a text that Aho–Corasick would treat differently when doing case-insensitive matching.

unpackUtf16 :: Text -> [CodeUnit] Source #

Return a Text as a list of UTF-16 code units.

unsafeCutUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text) Source #

The complement of unsafeSliceUtf16: removes the slice, and returns the part before and after. See unsafeSliceUtf16 for details.

unsafeSliceUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text Source #

Extract a substring from a text, at a code unit offset and length. This is similar to `Text.take length . Text.drop begin`, except that the begin and length are in code *units*, not code points, so we can slice the UTF-16 array, and we don't have to walk the entire text to take surrogate pairs into account. It is the responsibility of the user to not slice surrogate pairs, and to ensure that the length is within bounds, hence this function is unsafe.

unsafeIndexUtf16 :: Text -> CodeUnitIndex -> CodeUnit Source #

Return the code unit (not character) with the given index. Note: The bounds are not checked.

indexTextArray :: Array -> Int -> CodeUnit Source #