{-# OPTIONS_GHC -Wno-orphans #-} module Data.Function.FastMemo.Char () where import Data.Bits (complement, countLeadingZeros) import qualified Data.ByteString as ByteString import qualified Data.ByteString.UTF8 as UTF8 import Data.Function.FastMemo.Class (Memoizable (..)) import Data.Function.FastMemo.Util (memoizeFixedLen) import Data.Function.FastMemo.Word () import Data.List.NonEmpty (NonEmpty (..)) import qualified Data.List.NonEmpty as NonEmpty import Data.Word (Word8) -- We want ASCII Chars to require only a single Vector lookup, so let's encode as UTF-8 instance Memoizable Char where memoize f = memoize (f . codePointToChar) . charToCodePoint newtype CodePoint = CodePoint {getCodePoint :: NonEmpty Word8} -- In UTF-8, the first byte of a codepoint tells us how many more bytes that codepoint contains. -- We can use this fact to reduce lookups. instance Memoizable CodePoint where memoize f = let f' = memoize (\w -> memoizeFixedLen (extraBytes w) (f . CodePoint . (w :|))) in \(CodePoint (w :| ws)) -> f' w ws extraBytes :: Word8 -> Int extraBytes x = case countLeadingOnes x of 0 -> 0 n -> n - 1 countLeadingOnes :: Word8 -> Int countLeadingOnes = countLeadingZeros . complement charToCodePoint :: Char -> CodePoint charToCodePoint = CodePoint . NonEmpty.fromList . ByteString.unpack . UTF8.fromString . (: []) codePointToChar :: CodePoint -> Char codePointToChar = head . UTF8.toString . ByteString.pack . NonEmpty.toList . getCodePoint