Safe Haskell	None
Language	Haskell2010

Data.Text.Utf8

Contents

Decoding
Indexing
Slicing Functions
General Functions

Description

This module provides functions that allow treating Text values as series of UTF-8 code units instead of characters. Currently, it also contains a stub Text type which treats its internal byte array as UTF-8 encoded. We use this as a placeholder until we can use text-2.

Synopsis

type CodePoint = Char
type CodeUnit = Word8
newtype CodeUnitIndex = CodeUnitIndex {
- codeUnitIndex :: Int
}
data Text = Text !ByteArray !Int !Int
lengthUtf8 :: Text -> CodeUnitIndex
lowerCodePoint :: Char -> Char
lowerUtf8 :: Text -> Text
toLowerAscii :: Char -> Char
unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a]
unpackUtf8 :: Text -> [CodeUnit]
decode2 :: CodeUnit -> CodeUnit -> CodePoint
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decodeUtf8 :: [CodeUnit] -> [CodePoint]
stringToByteArray :: String -> ByteArray
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodePoint' :: ByteArray -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' :: ByteArray -> CodeUnitIndex -> CodeUnit
unsafeCutUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
concat :: [Text] -> Text
dropWhile :: (Char -> Bool) -> Text -> Text
null :: Text -> Bool
readFile :: FilePath -> IO Text
replicate :: Int -> Text -> Text
indices :: Text -> Text -> [Int]
isInfixOf :: Text -> Text -> Bool
pack :: String -> Text
unpack :: Text -> String

Documentation

type CodePoint = Char Source #

A Unicode code point.

type CodeUnit = Word8 Source #

A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units.

newtype CodeUnitIndex Source #

An index into the raw UTF-8 data of a Text. This is not the code point index as conventionally accepted by Text, so we wrap it to avoid confusing the two. Incorrect index manipulation can lead to surrogate pairs being sliced, so manipulate indices with care. This type is also used for lengths.

Constructors

CodeUnitIndex
Fields codeUnitIndex :: Int

Instances

Instances details

Bounded CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods minBound :: CodeUnitIndex # maxBound :: CodeUnitIndex #
Eq CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods (==) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (/=) :: CodeUnitIndex -> CodeUnitIndex -> Bool #
Num CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods (+) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (-) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (*) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # negate :: CodeUnitIndex -> CodeUnitIndex # abs :: CodeUnitIndex -> CodeUnitIndex # signum :: CodeUnitIndex -> CodeUnitIndex # fromInteger :: Integer -> CodeUnitIndex #
Ord CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering # (<) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (<=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex #
Show CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods showsPrec :: Int -> CodeUnitIndex -> ShowS # show :: CodeUnitIndex -> String # showList :: [CodeUnitIndex] -> ShowS #
Generic CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Associated Types type Rep CodeUnitIndex :: Type -> Type # Methods from :: CodeUnitIndex -> Rep CodeUnitIndex x # to :: Rep CodeUnitIndex x -> CodeUnitIndex #
Hashable CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods hashWithSalt :: Int -> CodeUnitIndex -> Int # hash :: CodeUnitIndex -> Int #
ToJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods toJSON :: CodeUnitIndex -> Value # toEncoding :: CodeUnitIndex -> Encoding # toJSONList :: [CodeUnitIndex] -> Value # toEncodingList :: [CodeUnitIndex] -> Encoding #
FromJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods parseJSON :: Value -> Parser CodeUnitIndex # parseJSONList :: Value -> Parser [CodeUnitIndex] #
NFData CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods rnf :: CodeUnitIndex -> () #
type Rep CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 type Rep CodeUnitIndex = D1 ('MetaData "CodeUnitIndex" "Data.Text.Utf8" "alfred-margaret-1.1.2.0-XY9PIx1ODVHYQ5PowYWXi" 'True) (C1 ('MetaCons "CodeUnitIndex" 'PrefixI 'True) (S1 ('MetaSel ('Just "codeUnitIndex") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int)))

data Text Source #

Constructors

Text	A placeholder data type for UTF-8 encoded text until we can use text-2.0.
Fields !ByteArray Underlying array encoded using UTF-8. !Int Starting position of the UTF-8 sequence in bytes. !Int Length of the UTF-8 sequence in bytes.

Instances

Instances details

Eq Text Source #
Instance details Defined in Data.Text.Utf8 Methods (==) :: Text -> Text -> Bool # (/=) :: Text -> Text -> Bool #
Ord Text Source #
Instance details Defined in Data.Text.Utf8 Methods compare :: Text -> Text -> Ordering # (<) :: Text -> Text -> Bool # (<=) :: Text -> Text -> Bool # (>) :: Text -> Text -> Bool # (>=) :: Text -> Text -> Bool # max :: Text -> Text -> Text # min :: Text -> Text -> Text #
Show Text Source #
Instance details Defined in Data.Text.Utf8 Methods showsPrec :: Int -> Text -> ShowS # show :: Text -> String # showList :: [Text] -> ShowS #
IsString Text Source #
Instance details Defined in Data.Text.Utf8 Methods fromString :: String -> Text #
Hashable Text Source #
Instance details Defined in Data.Text.Utf8 Methods hashWithSalt :: Int -> Text -> Int # hash :: Text -> Int #
ToJSON Text Source #
Instance details Defined in Data.Text.Utf8 Methods toJSON :: Text -> Value # toEncoding :: Text -> Encoding # toJSONList :: [Text] -> Value # toEncodingList :: [Text] -> Encoding #
FromJSON Text Source #
Instance details Defined in Data.Text.Utf8 Methods parseJSON :: Value -> Parser Text # parseJSONList :: Value -> Parser [Text] #
NFData Text Source #
Instance details Defined in Data.Text.Utf8 Methods rnf :: Text -> () #

lengthUtf8 :: Text -> CodeUnitIndex Source #

The return value of this function is not really an index. However the signature is supposed to make it clear that the length is returned in terms of code units, not code points.

lowerCodePoint :: Char -> Char Source #

Lower-Case a UTF-8 codepoint. Uses toLowerAscii for ASCII and toLower otherwise.

lowerUtf8 :: Text -> Text Source #

toLowerAscii :: Char -> Char Source #

Lower-case the ASCII code points A-Z and leave the rest of ASCII intact.

unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a] Source #

Convert a Unicode Code Point c into a list of UTF-8 code units (bytes).

unpackUtf8 :: Text -> [CodeUnit] Source #

Decoding

Functions that turns code unit sequences into code point sequences.

decode2 :: CodeUnit -> CodeUnit -> CodePoint Source #

Decode 2 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┐
│1 1 0 x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┘

decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint Source #

Decode 3 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┬───────────────┐
│1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┴───────────────┘

decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint Source #

Decode 4 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┬───────────────┬───────────────┐
│1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┴───────────────┴───────────────┘

decodeUtf8 :: [CodeUnit] -> [CodePoint] Source #

Decode a list of UTF-8 code units into a list of code points.

stringToByteArray :: String -> ByteArray Source #

Indexing

Text can be indexed by code units or code points. A CodePoint is a 21-bit Unicode code point and can consist of up to four code units. A CodeUnit is a single byte.

indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit Source #

Get the code unit at the given CodeUnitIndex. Performs bounds checking.

unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) Source #

Does exactly the same thing as unsafeIndexCodePoint', but on Text values.

unsafeIndexCodePoint' :: ByteArray -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) Source #

Decode a code point at the given CodeUnitIndex. Returns garbage if there is no valid code point at that position. Does not perform bounds checking. See decode2, decode3 and decode4 for the expected format of multi-byte code points.

unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit Source #

unsafeIndexCodeUnit' :: ByteArray -> CodeUnitIndex -> CodeUnit Source #

Slicing Functions

unsafeCutUtf8 and unsafeSliceUtf8 are used to retrieve slices of Text values. unsafeSliceUtf8 begin length returns a substring of length length starting at begin. unsafeSliceUtf8 begin length returns a tuple of the "surrounding" substrings.

They satisfy the following property:

let (prefix, suffix) = unsafeCutUtf8 begin length t
in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t

The following diagram visualizes the relevant offsets for begin = CodeUnitIndex 2, length = CodeUnitIndex 6 and t = "BCDEFGHIJKL".

 off                 off+len
  │                     │
  ▼                     ▼
──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬──
 A│B│C│D│E│F│G│H│I│J│K│L│M│N
──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴──
      ▲           ▲
      │           │
 off+begin   off+begin+length

unsafeSliceUtf8 begin length t == "DEFGHI"
unsafeCutUtf8 begin length t == ("BC", "JKL")

The shown array is open at each end because in general, t may be a slice as well.

WARNING: As their name implies, these functions are not (necessarily) bounds-checked. Use at your own risk.

unsafeCutUtf8 Source #

Arguments

:: CodeUnitIndex	Starting position of substring.
-> CodeUnitIndex	Length of substring.
-> Text	Initial string.
-> (Text, Text)

unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text Source #

General Functions

These functions are available in text as well and should be removed once this library moves to text-2. You should be able to use these by doing import qualified Data.Text.Utf8 as Text just like you would with text.

NOTE: The Text instances for Show, Eq, Ord, IsString, FromJSON, ToJSON and Hashable in this file also fall in this category.

concat :: [Text] -> Text Source #

TODO: Inefficient placeholder implementation.

dropWhile :: (Char -> Bool) -> Text -> Text Source #

See dropWhile.

null :: Text -> Bool Source #

Checks whether a text is the empty string.

readFile :: FilePath -> IO Text Source #

See readFile. TODO: Uses readFile and loops through each byte individually. Use copyPtrToMutableByteArray here if possible.

replicate :: Int -> Text -> Text Source #

TODO: Inefficient placeholder implementation. See replicate

indices :: Text -> Text -> [Int] Source #

TODO: Inefficient placeholder implementation. This function implements very basic string search. It's text counterpart is indices, which implements the Boyer-Moore algorithm. Since we have this function only to check whether our own Boyer-Moore implementation works, it would not make much sense to implement it using the same algorithm. Once we can use text-2, we can compare our implementation to the official text one which presumably works.

isInfixOf :: Text -> Text -> Bool Source #

TODO: Inefficient placeholder implementation.

pack :: String -> Text Source #

TODO: Inefficient placeholder implementation.

unpack :: Text -> String Source #

TODO: Inefficient placeholder implementation.