Copyright | © Herbert Valerio Riedel 2017 |
---|---|
License | BSD3 |
Maintainer | hvr@gnu.org |
Stability | stable |
Safe Haskell | Trustworthy |
Language | Haskell2010 |
Memory-efficient representation of Unicode text strings.
This module is intended to be imported qualified, to avoid name clashes with Prelude functions, e.g.
import qualified Data.Text.Short as TS
import Data.Text.Short (ShortText)
This module deliberately omits (common) partial functions, which can be found in Data.Text.Short.Partial instead.
Since: 0.1
Synopsis
- data ShortText
- empty :: ShortText
- singleton :: Char -> ShortText
- pack :: [Char] -> ShortText
- append :: ShortText -> ShortText -> ShortText
- concat :: [ShortText] -> ShortText
- cons :: Char -> ShortText -> ShortText
- snoc :: ShortText -> Char -> ShortText
- replicate :: Int -> ShortText -> ShortText
- unpack :: ShortText -> [Char]
- uncons :: ShortText -> Maybe (Char, ShortText)
- unsnoc :: ShortText -> Maybe (ShortText, Char)
- null :: ShortText -> Bool
- length :: ShortText -> Int
- isAscii :: ShortText -> Bool
- all :: (Char -> Bool) -> ShortText -> Bool
- any :: (Char -> Bool) -> ShortText -> Bool
- find :: (Char -> Bool) -> ShortText -> Maybe Char
- isPrefixOf :: ShortText -> ShortText -> Bool
- isSuffixOf :: ShortText -> ShortText -> Bool
- (!?) :: ShortText -> Int -> Maybe Char
- indexMaybe :: ShortText -> Int -> Maybe Char
- indexEndMaybe :: ShortText -> Int -> Maybe Char
- findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
- take :: Int -> ShortText -> ShortText
- takeEnd :: Int -> ShortText -> ShortText
- drop :: Int -> ShortText -> ShortText
- dropEnd :: Int -> ShortText -> ShortText
- takeWhile :: (Char -> Bool) -> ShortText -> ShortText
- takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
- dropWhile :: (Char -> Bool) -> ShortText -> ShortText
- dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
- dropAround :: (Char -> Bool) -> ShortText -> ShortText
- splitAt :: Int -> ShortText -> (ShortText, ShortText)
- splitAtEnd :: Int -> ShortText -> (ShortText, ShortText)
- span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- split :: (Char -> Bool) -> ShortText -> [ShortText]
- stripPrefix :: ShortText -> ShortText -> Maybe ShortText
- stripSuffix :: ShortText -> ShortText -> Maybe ShortText
- intersperse :: Char -> ShortText -> ShortText
- intercalate :: ShortText -> [ShortText] -> ShortText
- reverse :: ShortText -> ShortText
- filter :: (Char -> Bool) -> ShortText -> ShortText
- foldl :: (a -> Char -> a) -> a -> ShortText -> a
- foldl' :: (a -> Char -> a) -> a -> ShortText -> a
- foldr :: (Char -> a -> a) -> a -> ShortText -> a
- fromString :: String -> ShortText
- toString :: ShortText -> String
- fromText :: Text -> ShortText
- toText :: ShortText -> Text
- fromShortByteString :: ShortByteString -> Maybe ShortText
- toShortByteString :: ShortText -> ShortByteString
- fromByteString :: ByteString -> Maybe ShortText
- toByteString :: ShortText -> ByteString
- toBuilder :: ShortText -> Builder
The ShortText
type
A compact representation of Unicode strings.
A ShortText
value is a sequence of Unicode scalar values, as defined in
§3.9, definition D76 of the Unicode 5.2 standard.
This means that a ShortText
is a list of (scalar) Unicode code-points (i.e. code-points in the range [U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]
).
This type relates to Text
as ShortByteString
relates to ByteString
by providing a more compact type. Please consult the documentation of Data.ByteString.Short for more information.
Currently, a boxed unshared Text
has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each Text
value which can share its payload with another Text
requires only 4 words additionally. Unlike ByteString, Text uses unpinned memory.
In comparison, the footprint of a boxed ShortText
is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
It can be shown that for realistic data UTF-16 has a space overhead of 50% over UTF-8.
NOTE: The Typeable
instance isn't defined for GHC 7.8 (and older) prior to text-short-0.1.3
Since: 0.1
Instances
IsList ShortText Source # | Note: Surrogate pairs ([U+D800 .. U+DFFF]) are replaced by U+FFFD. Since: 0.1.2 |
Eq ShortText Source # | |
Data ShortText Source # | It exposes a similar Since: 0.1.3 |
Defined in Data.Text.Short.Internal gfoldl :: (forall d b. Data d => c (d -> b) -> d -> c b) -> (forall g. g -> c g) -> ShortText -> c ShortText # gunfold :: (forall b r. Data b => c (b -> r) -> c r) -> (forall r. r -> c r) -> Constr -> c ShortText # toConstr :: ShortText -> Constr # dataTypeOf :: ShortText -> DataType # dataCast1 :: Typeable t => (forall d. Data d => c (t d)) -> Maybe (c ShortText) # dataCast2 :: Typeable t => (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c ShortText) # gmapT :: (forall b. Data b => b -> b) -> ShortText -> ShortText # gmapQl :: (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> ShortText -> r # gmapQr :: (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> ShortText -> r # gmapQ :: (forall d. Data d => d -> u) -> ShortText -> [u] # gmapQi :: Int -> (forall d. Data d => d -> u) -> ShortText -> u # gmapM :: Monad m => (forall d. Data d => d -> m d) -> ShortText -> m ShortText # gmapMp :: MonadPlus m => (forall d. Data d => d -> m d) -> ShortText -> m ShortText # gmapMo :: MonadPlus m => (forall d. Data d => d -> m d) -> ShortText -> m ShortText # | |
Ord ShortText Source # | |
Defined in Data.Text.Short.Internal | |
Read ShortText Source # | |
Show ShortText Source # | |
IsString ShortText Source # | Note: Surrogate pairs ([U+D800 .. U+DFFF]) in string literals are replaced by U+FFFD. |
Defined in Data.Text.Short.Internal fromString :: String -> ShortText # | |
Semigroup ShortText Source # | |
Monoid ShortText Source # | |
PrintfArg ShortText Source # | Since: 0.1.2 |
Defined in Data.Text.Short.Internal formatArg :: ShortText -> FieldFormatter # parseFormat :: ShortText -> ModifierParser # | |
Binary ShortText Source # | |
NFData ShortText Source # | |
Defined in Data.Text.Short.Internal | |
Hashable ShortText Source # | |
Defined in Data.Text.Short.Internal | |
type Item ShortText Source # | |
Defined in Data.Text.Short.Internal |
Basic operations
Construction
singleton :: Char -> ShortText Source #
\(\mathcal{O}(1)\) Construct a ShortText from a single code-point.
singleton c == pack [c]
length (singleton c) == 1
>>>
singleton 'A'
"A"
>>>
map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
["\55295","\65533","\65533","\57344"]
Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1.2
pack :: [Char] -> ShortText Source #
\(\mathcal{O}(n)\) Construct a ShortText
from a list of Char
s.
This is an alias for fromString
.
Since: 0.1.2
snoc :: ShortText -> Char -> ShortText Source #
\(\mathcal{O}(n)\) Append a character to the end of a ShortText.
snoc t c == t <> singleton c
Since: 0.1.2
replicate :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n*m)\) Replicate a ShortText
.
A repetition count smaller than 1 results in an empty ShortText.
>>>
replicate 3 "jobs!"
"jobs!jobs!jobs!"
>>>
replicate 10000 ""
""
>>>
replicate 0 "nothing"
""
length (replicate n t) == max 0 n * length t
Since: 0.1.2
Deconstruction
Querying & predicates
null :: ShortText -> Bool Source #
\(\mathcal{O}(1)\) Test whether a ShortText
is empty.
>>>
null ""
True
null (singleton c) == False
null t == (length t == 0)
Since: 0.1
length :: ShortText -> Int Source #
\(\mathcal{O}(n)\) Count the number of Unicode code-points in a ShortText
.
>>>
length "abcd€"
5
>>>
length ""
0
length t >= 0
Since: 0.1
all :: (Char -> Bool) -> ShortText -> Bool Source #
\(\mathcal{O}(n)\) Test whether all code points in ShortText
satisfy a predicate.
>>>
all (const False) ""
True
>>>
all (> 'c') "abcdabcd"
False
>>>
all (/= 'c') "abdabd"
True
Since: 0.1.2
any :: (Char -> Bool) -> ShortText -> Bool Source #
\(\mathcal{O}(n)\) Test whether any code points in ShortText
satisfy a predicate.
>>>
any (> 'c') "abcdabcd"
True
>>>
any (const True) ""
False
>>>
any (== 'c') "abdabd"
False
any p t == not (all (not . p) t)
Since: 0.1.2
find :: (Char -> Bool) -> ShortText -> Maybe Char Source #
\(\mathcal{O}(n)\) Return the left-most codepoint in ShortText
that satisfies the given predicate.
>>>
find (> 'b') "abcdabcd"
Just 'c'
>>>
find (> 'b') "ababab"
Nothing
Since: 0.1.2
Lookup & indexing
(!?) :: ShortText -> Int -> Maybe Char Source #
\(\mathcal{O}(n)\) Index i-th code-point in ShortText
.
Infix operator alias of indexMaybe
>>>
"abcdefg" !? 2
Just 'c'
Since: 0.1.2
findIndex :: (Char -> Bool) -> ShortText -> Maybe Int Source #
\(\mathcal{O}(n)\) Return the index of the left-most codepoint in ShortText
that satisfies the given predicate.
>>>
findIndex (> 'b') "abcdabcdef"
Just 2
>>>
findIndex (> 'b') "ababab"
Nothing
(indexMaybe t =<< findIndex p t) == find p t
Since: 0.1.2
Splitting ShortText
s
Basic functions
take :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take prefix of given length or return whole ShortText
if too short.
>>>
take 3 "abcdef"
"abc"
>>>
take 3 "ab"
"ab"
Since: 0.1.2
takeEnd :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take suffix of given length or return whole ShortText
if too short.
>>>
takeEnd 3 "abcdefg"
"efg"
>>>
takeEnd 3 "ab"
"ab"
Since: 0.1.2
takeWhile :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take longest prefix satisfying given predicate.
takeWhile p t == fst (span p t)
>>>
takeWhile (< 'c') "abcdabcd"
"ab"
Since: 0.1.2
takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take longest suffix satisfying given predicate.
takeWhileEnd p t == snd (spanEnd p t)
>>>
takeWhileEnd (>= 'c') "abcdabcd"
"cd"
Since: 0.1.2
dropWhile :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate.
dropWhile p t == snd (span p t)
>>>
dropWhile (< 'c') "abcdabcd"
"cdabcd"
Since: 0.1.2
dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate.
dropWhileEnd p t == fst (spanEnd p t)
>>>
dropWhileEnd (>= 'c') "abcdabcd"
"abcdab"
Since: 0.1.2
dropAround :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Strip characters which satisfy the given predicate from the beginning and end of a ShortText.
>>>
dropAround (== ' ') " white space "
"white space"
>>>
dropAround (> 'a') "bcdefghi"
""
Since: 0.1.2
Pair-valued functions
splitAt :: Int -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText
into two halves.
splitAt n t returns a pair of ShortText with the following properties:
length (fst (splitAt n t)) == min (length t) (max 0 n)
fst (splitAt n t) <> snd (splitAt n t) == t
>>>
splitAt 2 "abcdef"
("ab","cdef")
>>>
splitAt 10 "abcdef"
("abcdef","")
>>>
splitAt (-1) "abcdef"
("","abcdef")
Since: 0.1.2
splitAtEnd :: Int -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText
into two halves.
splitAtEnd n t returns a pair of ShortText with the following properties:
length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
splitAtEnd n t == splitAt (length t - n) t
>>>
splitAtEnd 2 "abcdef"
("abcd","ef")
>>>
splitAtEnd 10 "abcdef"
("","abcdef")
>>>
splitAtEnd (-1) "abcdef"
("abcdef","")
Since: 0.1.2
span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText
into longest prefix satisfying the given predicate and the remaining suffix.
>>>
span (< 'c') "abcdabcd"
("ab","cdabcd")
fst (span p t) <> snd (span p t) == t
Since: 0.1.2
break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Variant of span
with negated predicate.
>>>
break (> 'c') "abcdabcd"
("abc","dabcd")
break p t == span (not . p) t
fst (break p t) <> snd (break p t) == t
Since: 0.1.2
spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText
into longest suffix satisfying the given predicate and the preceding prefix.
>>>
spanEnd (> 'c') "abcdabcd"
("abcdabc","d")
fst (spanEnd p t) <> snd (spanEnd p t) == t
Since: 0.1.2
breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Variant of spanEnd
with negated predicate.
>>>
breakEnd (< 'c') "abcdabcd"
("abcdab","cd")
breakEnd p t == spanEnd (not . p) t
fst (breakEnd p t) <> snd (breakEnd p t) == t
Since: 0.1.2
Breaking into many substrings
split :: (Char -> Bool) -> ShortText -> [ShortText] Source #
\(\mathcal{O}(n)\) Splits a string into components delimited by separators, where the predicate returns True for a separator element. The resulting components do not contain the separators. Two adjacent separators result in an empty component in the output, e.g.
>>>
split (=='a') "aabbaca"
["","","bb","c",""]
>>>
split (=='a') ""
[""]
intercalate (singleton c) (split (== c) t) == t
NOTE: split
never returns an empty list to match the semantics of its counterpart from Data.Text.
Since: 0.1.3
Suffix & Prefix operations
Transformations
intersperse :: Char -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Insert character between characters of ShortText
.
>>>
intersperse '*' "_"
"_"
>>>
intersperse '*' "MASH"
"M*A*S*H"
Since: 0.1.2
reverse :: ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Reverse characters in ShortText
.
>>>
reverse "star live desserts"
"stressed evil rats"
reverse (singleton c) == singleton c
reverse (reverse t) == t
Since: 0.1.2
filter :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove characters from ShortText
which don't satisfy given predicate.
>>>
filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!"
"Y dn't nd vwls t cnvy nfrmtn!"
filter (const False) t == ""
filter (const True) t == t
length (filter p t) <= length t
filter p t == pack [ c | c <- unpack t, p c ]
Since: 0.1.2
Folds
foldl :: (a -> Char -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\) Reduces the characters of the ShortText with the binary operator and an initial value in forward direction (i.e. from left to right).
>>>
foldl (\_ _ -> True) False ""
False
>>>
foldl (\s c -> c : s) ['.'] "abcd"
"dcba."
Since: 0.1.2
foldl' :: (a -> Char -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\) Strict version of foldl
.
Since: 0.1.2
foldr :: (Char -> a -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\) Reduces the characters of the ShortText with the binary operator and an initial value in reverse direction (i.e. from right to left).
>>>
foldr (\_ _ -> True) False ""
False
>>>
foldr (:) ['.'] "abcd"
"abcd."
Since: 0.1.2
Conversions
String
fromString :: String -> ShortText Source #
\(\mathcal{O}(n)\) Construct/pack from String
>>>
fromString []
""
>>>
fromString ['a','b','c']
"abc"
>>>
fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
"\55295\65533\65533\57344"
Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1
toString :: ShortText -> String Source #
\(\mathcal{O}(n)\) Convert to String
(fromString . toString) t == t
Note: See documentation of fromString for why (toString . fromString) is not an identity function.
Since: 0.1
Text
toText :: ShortText -> Text Source #
\(\mathcal{O}(n)\) Convert to Text
(fromText . toText) t == t
(toText . fromText) t == t
This is currently not \(\mathcal{O}(1)\) because currently Text
uses UTF-16 as its internal representation.
In the event that Text
will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\).
Since: 0.1
ByteString
fromShortByteString :: ShortByteString -> Maybe ShortText Source #
\(\mathcal{O}(n)\) Construct ShortText
from UTF-8 encoded ShortByteString
This operation doesn't copy the input ShortByteString
but it
cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding.
Returns Nothing
in case of invalid UTF-8 encoding.
>>>
fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
Just "\NUL8\66330"
>>>
fromShortByteString "\xC0\x80" -- invalid denormalised U+00
Nothing
>>>
fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
Nothing
>>>
fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
Just "\1114111"
>>>
fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
Nothing
fromShortByteString (toShortByteString t) == Just t
Since: 0.1
toShortByteString :: ShortText -> ShortByteString Source #
\(\mathcal{O}(0)\) Converts to UTF-8 encoded ShortByteString
This operation has effectively no overhead, as it's currently merely a newtype
-cast.
Since: 0.1
fromByteString :: ByteString -> Maybe ShortText Source #
\(\mathcal{O}(n)\) Construct ShortText
from UTF-8 encoded ByteString
fromByteString
accepts (or rejects) the same input data as fromShortByteString
.
Returns Nothing
in case of invalid UTF-8 encoding.
Since: 0.1
toByteString :: ShortText -> ByteString Source #
\(\mathcal{O}(n)\) Converts to UTF-8 encoded ByteString
Since: 0.1