{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RecordWildCards #-}
module Tiktoken
(
Encoding
, tiktokenToEncoding
, addSpecialTokens
, r50k_base
, p50k_base
, p50k_edit
, cl100k_base
, o200k_base
, toTokens
, toRanks
, toTokensAndRanks
, fromTokens
, fromRanks
) where
import Control.Applicative (many)
import Control.DeepSeq (NFData)
import Data.ByteString (ByteString)
import Data.Function (on)
import Data.HashMap.Strict (HashMap)
import Data.IntMap (IntMap, Key)
import Data.List.NonEmpty (NonEmpty(..))
import Data.Map (Map)
import Data.Text (Text)
import Data.Void (Void)
import Data.Word (Word8)
import GHC.Generics (Generic)
import Prelude hiding (lookup)
import System.FilePath ((</>))
import Text.Megaparsec (ParseErrorBundle, ParsecT)
import Text.RawString.QQ (r)
import qualified Control.Exception as Exception
import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Base64 as Base64.Encoding
import qualified Data.ByteString.Char8 as Char8
import qualified Data.HashMap.Strict as HashMap
import qualified Data.IntMap.Strict as IntMap
import qualified Data.List as List
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Map as Map
import qualified Data.Ord as Ord
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text.Encoding
import qualified Data.Text.IO as Text.IO
import qualified Paths_tiktoken as Paths
import qualified System.IO.Unsafe as Unsafe
import qualified Text.Megaparsec as Megaparsec
import qualified Text.Megaparsec.Char as Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as Megaparsec.Lexer
import qualified Text.Regex.PCRE.Light as Regex
data Encoding = Encoding
{ Encoding -> HashMap ByteString Int
encode :: HashMap ByteString Int
, Encoding -> IntMap ByteString
decode :: IntMap ByteString
, Encoding -> Map ByteString Int
specialTokens :: Map ByteString Int
, Encoding -> ByteString
regex :: ByteString
} deriving stock ((forall x. Encoding -> Rep Encoding x)
-> (forall x. Rep Encoding x -> Encoding) -> Generic Encoding
forall x. Rep Encoding x -> Encoding
forall x. Encoding -> Rep Encoding x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. Encoding -> Rep Encoding x
from :: forall x. Encoding -> Rep Encoding x
$cto :: forall x. Rep Encoding x -> Encoding
to :: forall x. Rep Encoding x -> Encoding
Generic)
deriving anyclass (Encoding -> ()
(Encoding -> ()) -> NFData Encoding
forall a. (a -> ()) -> NFData a
$crnf :: Encoding -> ()
rnf :: Encoding -> ()
NFData)
parseToken :: ParsecT Void Text m (Int, ByteString)
parseToken :: forall (m :: * -> *). ParsecT Void Text m (Int, ByteString)
parseToken = do
Text
base64Text <- Maybe String
-> (Token Text -> Bool) -> ParsecT Void Text m (Tokens Text)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
Megaparsec.takeWhileP (String -> Maybe String
forall a. a -> Maybe a
Just String
"Base64 character") (Token Text -> Token Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
Token Text
' ')
let base64Bytes :: ByteString
base64Bytes = Text -> ByteString
Text.Encoding.encodeUtf8 Text
base64Text
ByteString
token <- case ByteString -> Either Text ByteString
Base64.Encoding.decodeBase64Untyped ByteString
base64Bytes of
Left Text
text -> String -> ParsecT Void Text m ByteString
forall a. String -> ParsecT Void Text m a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (Text -> String
Text.unpack Text
text)
Right ByteString
token -> ByteString -> ParsecT Void Text m ByteString
forall a. a -> ParsecT Void Text m a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
token
Char
_ <- Token Text -> ParsecT Void Text m (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
Megaparsec.Char.char Char
Token Text
' '
Int
rank <- ParsecT Void Text m Int
forall e s (m :: * -> *) a.
(MonadParsec e s m, Token s ~ Char, Num a) =>
m a
Megaparsec.Lexer.decimal
Char
_ <- Token Text -> ParsecT Void Text m (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
Megaparsec.Char.char Char
Token Text
'\n'
(Int, ByteString) -> ParsecT Void Text m (Int, ByteString)
forall a. a -> ParsecT Void Text m a
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
rank, ByteString
token)
tokensToEncoding
:: ByteString
-> IntMap ByteString
-> Encoding
tokensToEncoding :: ByteString -> IntMap ByteString -> Encoding
tokensToEncoding ByteString
regex IntMap ByteString
decode = Encoding{ByteString
Map ByteString Int
IntMap ByteString
HashMap ByteString Int
$sel:encode:Encoding :: HashMap ByteString Int
$sel:decode:Encoding :: IntMap ByteString
$sel:specialTokens:Encoding :: Map ByteString Int
$sel:regex:Encoding :: ByteString
regex :: ByteString
decode :: IntMap ByteString
encode :: HashMap ByteString Int
specialTokens :: Map ByteString Int
..}
where
encode :: HashMap ByteString Int
encode = [(ByteString, Int)] -> HashMap ByteString Int
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HashMap.fromList (((Int, ByteString) -> (ByteString, Int))
-> [(Int, ByteString)] -> [(ByteString, Int)]
forall a b. (a -> b) -> [a] -> [b]
map (Int, ByteString) -> (ByteString, Int)
forall {b} {a}. (b, a) -> (a, b)
swap (IntMap ByteString -> [(Int, ByteString)]
forall a. IntMap a -> [(Int, a)]
IntMap.toList IntMap ByteString
decode))
where
swap :: (b, a) -> (a, b)
swap (b
rank, a
token) = (a
token, b
rank)
specialTokens :: Map ByteString Int
specialTokens = Map ByteString Int
forall a. Monoid a => a
mempty
tiktokenToEncoding
:: ByteString
-> Text
-> Either (ParseErrorBundle Text Void) Encoding
tiktokenToEncoding :: ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
tiktokenToEncoding ByteString
regex Text
text = Parsec Void Text Encoding
-> String -> Text -> Either (ParseErrorBundle Text Void) Encoding
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
Megaparsec.runParser Parsec Void Text Encoding
forall {m :: * -> *}. ParsecT Void Text m Encoding
parser String
"" Text
text
where
parser :: ParsecT Void Text m Encoding
parser = do
[(Int, ByteString)]
keyValues <- ParsecT Void Text m (Int, ByteString)
-> ParsecT Void Text m [(Int, ByteString)]
forall a. ParsecT Void Text m a -> ParsecT Void Text m [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many ParsecT Void Text m (Int, ByteString)
forall (m :: * -> *). ParsecT Void Text m (Int, ByteString)
parseToken
Encoding -> ParsecT Void Text m Encoding
forall a. a -> ParsecT Void Text m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ByteString -> IntMap ByteString -> Encoding
tokensToEncoding ByteString
regex ([(Int, ByteString)] -> IntMap ByteString
forall a. [(Int, a)] -> IntMap a
IntMap.fromList [(Int, ByteString)]
keyValues))
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
addSpecialTokens Map ByteString Int
tokens Encoding{ $sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
specialTokens = Map ByteString Int
oldSpecialTokens, ByteString
IntMap ByteString
HashMap ByteString Int
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> IntMap ByteString
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: IntMap ByteString
regex :: ByteString
.. } =
Encoding{ $sel:specialTokens:Encoding :: Map ByteString Int
specialTokens = Map ByteString Int -> Map ByteString Int -> Map ByteString Int
forall k a. Ord k => Map k a -> Map k a -> Map k a
Map.union Map ByteString Int
tokens Map ByteString Int
oldSpecialTokens, ByteString
IntMap ByteString
HashMap ByteString Int
$sel:encode:Encoding :: HashMap ByteString Int
$sel:decode:Encoding :: IntMap ByteString
$sel:regex:Encoding :: ByteString
encode :: HashMap ByteString Int
decode :: IntMap ByteString
regex :: ByteString
.. }
_ENDOFTEXT :: ByteString
_ENDOFTEXT :: ByteString
_ENDOFTEXT = ByteString
"<|endoftext|>"
_FIM_PREFIX :: ByteString
_FIM_PREFIX :: ByteString
_FIM_PREFIX = ByteString
"<|fim_prefix|>"
_FIM_MIDDLE :: ByteString
_FIM_MIDDLE :: ByteString
_FIM_MIDDLE = ByteString
"<|fim_middle|>"
_FIM_SUFFIX :: ByteString
_FIM_SUFFIX :: ByteString
_FIM_SUFFIX = ByteString
"<|fim_suffix|>"
_ENDOFPROMPT :: ByteString
_ENDOFPROMPT :: ByteString
_ENDOFPROMPT = ByteString
"<|endofprompt|>"
loadEncoding :: FilePath -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding :: String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
file ByteString
regex Map ByteString Int
specialTokens = do
String
dataDirectory <- IO String
Paths.getDataDir
Text
text <- String -> IO Text
Text.IO.readFile (String
dataDirectory String -> String -> String
</> String
file)
Encoding
encoding <- case ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
tiktokenToEncoding ByteString
regex Text
text of
Left ParseErrorBundle Text Void
exception -> ParseErrorBundle Text Void -> IO Encoding
forall e a. Exception e => e -> IO a
Exception.throwIO ParseErrorBundle Text Void
exception
Right Encoding
encoding -> Encoding -> IO Encoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Encoding
encoding
Encoding -> IO Encoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Map ByteString Int -> Encoding -> Encoding
addSpecialTokens Map ByteString Int
specialTokens Encoding
encoding)
r50k_base :: Encoding
r50k_base :: Encoding
r50k_base =
IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
(String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"r50k_base.tiktoken" ByteString
regex [ (ByteString
_ENDOFTEXT, Int
50256) ])
where
regex :: ByteString
regex =
ByteString
[r|'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+|]
{-# NOINLINE r50k_base #-}
p50k_base :: Encoding
p50k_base :: Encoding
p50k_base =
IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
(String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"p50k_base.tiktoken" ByteString
regex [ (ByteString
_ENDOFTEXT, Int
50256) ])
where
regex :: ByteString
regex =
ByteString
[r|'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+|]
{-# NOINLINE p50k_base #-}
p50k_edit :: Encoding
p50k_edit :: Encoding
p50k_edit =
IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
(String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"p50k_base.tiktoken"
ByteString
regex
[ (ByteString
_ENDOFTEXT , Int
50256)
, (ByteString
_FIM_PREFIX, Int
50281)
, (ByteString
_FIM_MIDDLE, Int
50282)
, (ByteString
_FIM_SUFFIX, Int
50283)
]
)
where
regex :: ByteString
regex =
ByteString
[r|'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+|]
{-# NOINLINE p50k_edit #-}
cl100k_base :: Encoding
cl100k_base :: Encoding
cl100k_base =
IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
(String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"cl100k_base.tiktoken"
ByteString
regex
[ (ByteString
_ENDOFTEXT , Int
100257)
, (ByteString
_FIM_PREFIX , Int
100258)
, (ByteString
_FIM_MIDDLE , Int
100259)
, (ByteString
_FIM_SUFFIX , Int
100260)
, (ByteString
_ENDOFPROMPT, Int
100276)
]
)
where
regex :: ByteString
regex =
ByteString
[r|'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+|]
{-# NOINLINE cl100k_base #-}
o200k_base :: Encoding
o200k_base :: Encoding
o200k_base =
IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
(String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"o200k_base.tiktoken"
ByteString
regex
[ (ByteString
_ENDOFTEXT , Int
199999)
, (ByteString
_ENDOFPROMPT, Int
200018)
]
)
where
regex :: ByteString
regex =
ByteString -> [ByteString] -> ByteString
Char8.intercalate ByteString
"|"
[ Item [ByteString]
[r|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|]
, Item [ByteString]
[r|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|]
, Item [ByteString]
[r|\p{N}{1,3}|]
, Item [ByteString]
[r| ?[^\s\x{3000}\p{L}\p{N}]+[\r\n/]*|]
, Item [ByteString]
[r|[\s\x{3000}]*[\r\n]+|]
, Item [ByteString]
[r|[\s\x{3000}]+(?![^\s\x{3000}])|]
, Item [ByteString]
[r|[\s\x{3000}]+|]
]
{-# NOINLINE o200k_base #-}
minimumBy :: (a -> a -> Ordering) -> IntMap a -> Maybe (Int, a)
minimumBy :: forall a. (a -> a -> Ordering) -> IntMap a -> Maybe (Int, a)
minimumBy a -> a -> Ordering
comparison IntMap a
intMap
| IntMap a -> Bool
forall a. IntMap a -> Bool
IntMap.null IntMap a
intMap =
Maybe (Int, a)
forall a. Maybe a
Nothing
| Bool
otherwise =
(Int, a) -> Maybe (Int, a)
forall a. a -> Maybe a
Just (((Int, a) -> (Int, a) -> Ordering) -> [(Int, a)] -> (Int, a)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
List.minimumBy (a -> a -> Ordering
comparison (a -> a -> Ordering)
-> ((Int, a) -> a) -> (Int, a) -> (Int, a) -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Int, a) -> a
forall a b. (a, b) -> b
snd) (IntMap a -> [(Int, a)]
forall a. IntMap a -> [(Int, a)]
IntMap.toList IntMap a
intMap))
drop1 :: [a] -> [a]
drop1 :: forall a. [a] -> [a]
drop1 (a
_ : [a]
xs) = [a]
xs
drop1 [] = []
data Ranked = Ranked Int | Unranked
deriving (Ranked -> Ranked -> Bool
(Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool) -> Eq Ranked
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Ranked -> Ranked -> Bool
== :: Ranked -> Ranked -> Bool
$c/= :: Ranked -> Ranked -> Bool
/= :: Ranked -> Ranked -> Bool
Eq, Eq Ranked
Eq Ranked =>
(Ranked -> Ranked -> Ordering)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Ranked)
-> (Ranked -> Ranked -> Ranked)
-> Ord Ranked
Ranked -> Ranked -> Bool
Ranked -> Ranked -> Ordering
Ranked -> Ranked -> Ranked
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Ranked -> Ranked -> Ordering
compare :: Ranked -> Ranked -> Ordering
$c< :: Ranked -> Ranked -> Bool
< :: Ranked -> Ranked -> Bool
$c<= :: Ranked -> Ranked -> Bool
<= :: Ranked -> Ranked -> Bool
$c> :: Ranked -> Ranked -> Bool
> :: Ranked -> Ranked -> Bool
$c>= :: Ranked -> Ranked -> Bool
>= :: Ranked -> Ranked -> Bool
$cmax :: Ranked -> Ranked -> Ranked
max :: Ranked -> Ranked -> Ranked
$cmin :: Ranked -> Ranked -> Ranked
min :: Ranked -> Ranked -> Ranked
Ord)
data Chunk = Chunk
{ Chunk -> Int
rank :: Int
, Chunk -> Ranked
rank2 :: Ranked
}
bytePairEncode
:: HashMap ByteString Int -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncode :: HashMap ByteString Int -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncode HashMap ByteString Int
hashMap ByteString
bytes
| Just Int
rank <- ByteString -> HashMap ByteString Int -> Maybe Int
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup ByteString
bytes HashMap ByteString Int
hashMap =
[(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
Just [ (Int
rank, ByteString
bytes) ]
| ByteString -> Bool
ByteString.null ByteString
bytes =
[(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
| Bool
otherwise = do
let lookupByte :: Word8 -> Maybe Int
lookupByte :: Word8 -> Maybe Int
lookupByte Word8
word8 = ByteString -> HashMap ByteString Int -> Maybe Int
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup (Word8 -> ByteString
ByteString.singleton Word8
word8) HashMap ByteString Int
hashMap
let toChunk :: Word8 -> Word8 -> Maybe Chunk
toChunk Word8
w0 Word8
w1 = do
Int
rank <- Word8 -> Maybe Int
lookupByte Word8
w0
let rank2 :: Ranked
rank2 = ByteString -> Ranked
lookupSlice ([Word8] -> ByteString
ByteString.pack [ Word8
Item [Word8]
w0, Word8
Item [Word8]
w1 ])
Chunk -> Maybe Chunk
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Chunk{ Int
$sel:rank:Chunk :: Int
rank :: Int
rank, Ranked
$sel:rank2:Chunk :: Ranked
rank2 :: Ranked
rank2 }
[Chunk]
initChunks <- do
[Maybe Chunk] -> Maybe [Chunk]
forall (t :: * -> *) (m :: * -> *) a.
(Traversable t, Monad m) =>
t (m a) -> m (t a)
forall (m :: * -> *) a. Monad m => [m a] -> m [a]
sequence ((Word8 -> Word8 -> Maybe Chunk)
-> ByteString -> ByteString -> [Maybe Chunk]
forall a. (Word8 -> Word8 -> a) -> ByteString -> ByteString -> [a]
ByteString.zipWith Word8 -> Word8 -> Maybe Chunk
toChunk ByteString
bytes (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
ByteString.tail ByteString
bytes))
Chunk
lastChunk <- do
Int
rank <- Word8 -> Maybe Int
lookupByte (HasCallStack => ByteString -> Word8
ByteString -> Word8
ByteString.last ByteString
bytes)
Chunk -> Maybe Chunk
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Chunk{ Int
$sel:rank:Chunk :: Int
rank :: Int
rank, $sel:rank2:Chunk :: Ranked
rank2 = Ranked
Unranked }
let initialMap :: IntMap Chunk
initialMap :: IntMap Chunk
initialMap =
[(Int, Chunk)] -> IntMap Chunk
forall a. [(Int, a)] -> IntMap a
IntMap.fromList ([Int] -> [Chunk] -> [(Int, Chunk)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
Item [Int]
0 ..] ([Chunk]
initChunks [Chunk] -> [Chunk] -> [Chunk]
forall a. Semigroup a => a -> a -> a
<> [ Item [Chunk]
Chunk
lastChunk ]))
let keyValues :: [(Int, Chunk)]
keyValues = IntMap Chunk -> [(Int, Chunk)]
forall a. IntMap a -> [(Int, a)]
IntMap.toAscList (IntMap Chunk -> IntMap Chunk
loop IntMap Chunk
initialMap)
[(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure do
let adapt :: (Int, Chunk) -> Int -> (Int, ByteString)
adapt (Int
index, Chunk{ Int
$sel:rank:Chunk :: Chunk -> Int
rank :: Int
rank }) Int
nextIndex =
(Int
rank, Int -> Int -> ByteString
slice Int
index Int
nextIndex)
((Int, Chunk) -> Int -> (Int, ByteString))
-> [(Int, Chunk)] -> [Int] -> [(Int, ByteString)]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (Int, Chunk) -> Int -> (Int, ByteString)
adapt [(Int, Chunk)]
keyValues ([Int] -> [Int]
forall a. [a] -> [a]
drop1 (((Int, Chunk) -> Int) -> [(Int, Chunk)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Int, Chunk) -> Int
forall a b. (a, b) -> a
fst [(Int, Chunk)]
keyValues) [Int] -> [Int] -> [Int]
forall a. Semigroup a => a -> a -> a
<> [ Int
Item [Int]
size ])
where
size :: Int
size :: Int
size = ByteString -> Int
ByteString.length ByteString
bytes
lookupSlice :: ByteString -> Ranked
lookupSlice :: ByteString -> Ranked
lookupSlice ByteString
b = case ByteString -> HashMap ByteString Int -> Maybe Int
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup ByteString
b HashMap ByteString Int
hashMap of
Maybe Int
Nothing -> Ranked
Unranked
Just Int
int -> Int -> Ranked
Ranked Int
int
slice :: Int -> Int -> ByteString
slice :: Int -> Int -> ByteString
slice Int
begin Int
end = Int -> ByteString -> ByteString
ByteString.take (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
begin) (Int -> ByteString -> ByteString
ByteString.drop Int
begin ByteString
bytes)
loop :: IntMap Chunk -> IntMap Chunk
loop :: IntMap Chunk -> IntMap Chunk
loop IntMap Chunk
chunks0 = case (Chunk -> Chunk -> Ordering) -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. (a -> a -> Ordering) -> IntMap a -> Maybe (Int, a)
minimumBy ((Chunk -> Ranked) -> Chunk -> Chunk -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
Ord.comparing Chunk -> Ranked
rank2) IntMap Chunk
chunks0 of
Just (Int
index, Chunk{ $sel:rank2:Chunk :: Chunk -> Ranked
rank2 = Ranked Int
rank }) -> IntMap Chunk -> IntMap Chunk
loop IntMap Chunk
chunks3
where
chunks1 :: IntMap Chunk
chunks1 = Int -> Int -> IntMap Chunk -> IntMap Chunk
rerank Int
index Int
rank IntMap Chunk
chunks0
chunks2 :: IntMap Chunk
chunks2 = case Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupLT Int
index IntMap Chunk
chunks1 of
Just (Int
prevIndex, Chunk{ $sel:rank:Chunk :: Chunk -> Int
rank = Int
prevRank }) ->
Int -> Int -> IntMap Chunk -> IntMap Chunk
rerank Int
prevIndex Int
prevRank IntMap Chunk
chunks1
Maybe (Int, Chunk)
_ ->
IntMap Chunk
chunks1
chunks3 :: IntMap Chunk
chunks3 = case Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index IntMap Chunk
chunks2 of
Maybe (Int, Chunk)
Nothing ->
String -> IntMap Chunk
forall a. HasCallStack => String -> a
error String
"Tiktoken.bytePairEncode: Internal error - a ranked byte pair is missing the second byte in the pair"
Just (Int
nextIndex, Chunk
_) ->
Int -> IntMap Chunk -> IntMap Chunk
forall a. Int -> IntMap a -> IntMap a
IntMap.delete Int
nextIndex IntMap Chunk
chunks2
Maybe (Int, Chunk)
_ ->
IntMap Chunk
chunks0
rerank :: Key -> Int -> IntMap Chunk -> IntMap Chunk
rerank :: Int -> Int -> IntMap Chunk -> IntMap Chunk
rerank Int
index0 Int
rank IntMap Chunk
chunks = Int -> Chunk -> IntMap Chunk -> IntMap Chunk
forall a. Int -> a -> IntMap a -> IntMap a
IntMap.insert Int
index0 Chunk
newChunk IntMap Chunk
chunks
where
maybeIndex3 :: Maybe Int
maybeIndex3 = do
(Int
index1, Chunk
_) <- Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index0 IntMap Chunk
chunks
(Int
index2, Chunk
_) <- Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index1 IntMap Chunk
chunks
Int -> Maybe Int
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure case Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index2 IntMap Chunk
chunks of
Just (Int
index3, Chunk
_) -> Int
index3
Maybe (Int, Chunk)
Nothing -> Int
size
rank2 :: Ranked
rank2 = case Maybe Int
maybeIndex3 of
Maybe Int
Nothing -> Ranked
Unranked
Just Int
index3 -> ByteString -> Ranked
lookupSlice (Int -> Int -> ByteString
slice Int
index0 Int
index3)
newChunk :: Chunk
newChunk = Chunk{ Int
$sel:rank:Chunk :: Int
rank :: Int
rank, Ranked
$sel:rank2:Chunk :: Ranked
rank2 :: Ranked
rank2 }
splitUsingRegex
:: ByteString
-> ByteString
-> Maybe [ByteString]
splitUsingRegex :: ByteString -> ByteString -> Maybe [ByteString]
splitUsingRegex ByteString
pattern = ([ByteString] -> [ByteString]) -> ByteString -> Maybe [ByteString]
forall {c}. ([ByteString] -> c) -> ByteString -> Maybe c
loop [ByteString] -> [ByteString]
forall a. a -> a
Prelude.id
where
loop :: ([ByteString] -> c) -> ByteString -> Maybe c
loop [ByteString] -> c
diff ByteString
bytes
| ByteString -> Bool
ByteString.null ByteString
bytes =
c -> Maybe c
forall a. a -> Maybe a
Just ([ByteString] -> c
diff [])
| Bool
otherwise =
case Regex -> ByteString -> [PCREExecOption] -> Maybe [ByteString]
Regex.match Regex
regex ByteString
bytes [ Item [PCREExecOption]
PCREExecOption
Regex.exec_no_utf8_check ] of
Just (ByteString
prefix : [ByteString]
_) ->
let suffix :: ByteString
suffix = Int -> ByteString -> ByteString
ByteString.drop (ByteString -> Int
ByteString.length ByteString
prefix) ByteString
bytes
in ([ByteString] -> c) -> ByteString -> Maybe c
loop ([ByteString] -> c
diff ([ByteString] -> c)
-> ([ByteString] -> [ByteString]) -> [ByteString] -> c
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString
prefix ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:)) ByteString
suffix
Maybe [ByteString]
_ -> Maybe c
forall a. Maybe a
Nothing
regex :: Regex
regex = ByteString -> [PCREOption] -> Regex
Regex.compile ByteString
pattern [ Item [PCREOption]
PCREOption
Regex.utf8, Item [PCREOption]
PCREOption
Regex.anchored ]
bytePairEncodeWithSplitting :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncodeWithSplitting :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncodeWithSplitting Encoding{ByteString
Map ByteString Int
IntMap ByteString
HashMap ByteString Int
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> IntMap ByteString
$sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: IntMap ByteString
specialTokens :: Map ByteString Int
regex :: ByteString
..} ByteString
bytes = do
[ByteString]
chunks <- ByteString -> ByteString -> Maybe [ByteString]
splitUsingRegex ByteString
regex ByteString
bytes
[[(Int, ByteString)]]
tokenss <- (ByteString -> Maybe [(Int, ByteString)])
-> [ByteString] -> Maybe [[(Int, ByteString)]]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse (HashMap ByteString Int -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncode HashMap ByteString Int
encode) [ByteString]
chunks
[(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([[(Int, ByteString)]] -> [(Int, ByteString)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[(Int, ByteString)]]
tokenss)
splitOnSeparator
:: ByteString
-> ByteString
-> NonEmpty ByteString
splitOnSeparator :: ByteString -> ByteString -> NonEmpty ByteString
splitOnSeparator ByteString
separator ByteString
initialBytes = ByteString
initialPrefix ByteString -> [ByteString] -> NonEmpty ByteString
forall a. a -> [a] -> NonEmpty a
:| ByteString -> [ByteString]
loop ByteString
initialSuffix
where
split :: ByteString -> (ByteString, ByteString)
split = ByteString -> ByteString -> (ByteString, ByteString)
ByteString.breakSubstring ByteString
separator
(ByteString
initialPrefix, ByteString
initialSuffix) = ByteString -> (ByteString, ByteString)
split ByteString
initialBytes
loop :: ByteString -> [ByteString]
loop ByteString
bytes
| ByteString -> Bool
ByteString.null ByteString
bytes = []
| Bool
otherwise = ByteString
prefix ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: ByteString -> [ByteString]
loop ByteString
suffix
where
rest :: ByteString
rest = Int -> ByteString -> ByteString
ByteString.drop (ByteString -> Int
ByteString.length ByteString
separator) ByteString
bytes
(ByteString
prefix, ByteString
suffix) = ByteString -> (ByteString, ByteString)
split ByteString
rest
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks encoding :: Encoding
encoding@Encoding{ByteString
Map ByteString Int
IntMap ByteString
HashMap ByteString Int
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> IntMap ByteString
$sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: IntMap ByteString
specialTokens :: Map ByteString Int
regex :: ByteString
..} ByteString
initialBytes =
((ByteString, Int)
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [(Int, ByteString)])
-> (ByteString -> Maybe [(Int, ByteString)])
-> [(ByteString, Int)]
-> ByteString
-> Maybe [(Int, ByteString)]
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (ByteString, Int)
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [(Int, ByteString)]
forall {f :: * -> *} {b}.
Applicative f =>
(ByteString, b)
-> (ByteString -> f [(b, ByteString)])
-> ByteString
-> f [(b, ByteString)]
cons ByteString -> Maybe [(Int, ByteString)]
nil (Map ByteString Int -> [(ByteString, Int)]
forall k a. Map k a -> [(k, a)]
Map.toList Map ByteString Int
specialTokens) ByteString
initialBytes
where
cons :: (ByteString, b)
-> (ByteString -> f [(b, ByteString)])
-> ByteString
-> f [(b, ByteString)]
cons (ByteString
token, b
rank) ByteString -> f [(b, ByteString)]
tokenizer ByteString
bytes = do
(NonEmpty [(b, ByteString)] -> [(b, ByteString)])
-> f (NonEmpty [(b, ByteString)]) -> f [(b, ByteString)]
forall a b. (a -> b) -> f a -> f b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap NonEmpty [(b, ByteString)] -> [(b, ByteString)]
joinSegments ((ByteString -> f [(b, ByteString)])
-> NonEmpty ByteString -> f (NonEmpty [(b, ByteString)])
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> NonEmpty a -> f (NonEmpty b)
traverse ByteString -> f [(b, ByteString)]
tokenizer (ByteString -> ByteString -> NonEmpty ByteString
splitOnSeparator ByteString
token ByteString
bytes))
where
joinSegments :: NonEmpty [(b, ByteString)] -> [(b, ByteString)]
joinSegments =
[[(b, ByteString)]] -> [(b, ByteString)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat
([[(b, ByteString)]] -> [(b, ByteString)])
-> (NonEmpty [(b, ByteString)] -> [[(b, ByteString)]])
-> NonEmpty [(b, ByteString)]
-> [(b, ByteString)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. NonEmpty [(b, ByteString)] -> [[(b, ByteString)]]
forall a. NonEmpty a -> [a]
NonEmpty.toList
(NonEmpty [(b, ByteString)] -> [[(b, ByteString)]])
-> (NonEmpty [(b, ByteString)] -> NonEmpty [(b, ByteString)])
-> NonEmpty [(b, ByteString)]
-> [[(b, ByteString)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(b, ByteString)]
-> NonEmpty [(b, ByteString)] -> NonEmpty [(b, ByteString)]
forall a. a -> NonEmpty a -> NonEmpty a
NonEmpty.intersperse [ (b
rank, ByteString
token) ]
nil :: ByteString -> Maybe [(Int, ByteString)]
nil ByteString
bytes = Encoding -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncodeWithSplitting Encoding
encoding ByteString
bytes
toTokens :: Encoding -> ByteString -> Maybe [ByteString]
toTokens :: Encoding -> ByteString -> Maybe [ByteString]
toTokens = ((ByteString -> Maybe [(Int, ByteString)])
-> ByteString -> Maybe [ByteString])
-> (Encoding -> ByteString -> Maybe [(Int, ByteString)])
-> Encoding
-> ByteString
-> Maybe [ByteString]
forall a b. (a -> b) -> (Encoding -> a) -> Encoding -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Maybe [(Int, ByteString)] -> Maybe [ByteString])
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [ByteString]
forall a b. (a -> b) -> (ByteString -> a) -> ByteString -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (([(Int, ByteString)] -> [ByteString])
-> Maybe [(Int, ByteString)] -> Maybe [ByteString]
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (((Int, ByteString) -> ByteString)
-> [(Int, ByteString)] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int, ByteString) -> ByteString
forall a b. (a, b) -> b
snd))) Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks
toRanks :: Encoding -> ByteString -> Maybe [Int]
toRanks :: Encoding -> ByteString -> Maybe [Int]
toRanks = ((ByteString -> Maybe [(Int, ByteString)])
-> ByteString -> Maybe [Int])
-> (Encoding -> ByteString -> Maybe [(Int, ByteString)])
-> Encoding
-> ByteString
-> Maybe [Int]
forall a b. (a -> b) -> (Encoding -> a) -> Encoding -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Maybe [(Int, ByteString)] -> Maybe [Int])
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [Int]
forall a b. (a -> b) -> (ByteString -> a) -> ByteString -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (([(Int, ByteString)] -> [Int])
-> Maybe [(Int, ByteString)] -> Maybe [Int]
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (((Int, ByteString) -> Int) -> [(Int, ByteString)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int, ByteString) -> Int
forall a b. (a, b) -> a
fst))) Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks
fromTokens :: [ByteString] -> ByteString
fromTokens :: [ByteString] -> ByteString
fromTokens = [ByteString] -> ByteString
ByteString.concat
fromRanks :: Encoding -> [Int] -> Maybe ByteString
fromRanks :: Encoding -> [Int] -> Maybe ByteString
fromRanks Encoding{ByteString
Map ByteString Int
IntMap ByteString
HashMap ByteString Int
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> IntMap ByteString
$sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: IntMap ByteString
specialTokens :: Map ByteString Int
regex :: ByteString
..} [Int]
vector = ([ByteString] -> ByteString)
-> Maybe [ByteString] -> Maybe ByteString
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [ByteString] -> ByteString
fromTokens ((Int -> Maybe ByteString) -> [Int] -> Maybe [ByteString]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse Int -> Maybe ByteString
lookup [Int]
vector)
where
lookup :: Int -> Maybe ByteString
lookup Int
rank = Int -> IntMap ByteString -> Maybe ByteString
forall a. Int -> IntMap a -> Maybe a
IntMap.lookup Int
rank IntMap ByteString
decode