{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Language.PureScript.PSString
( PSString
, toUTF16CodeUnits
, decodeString
, decodeStringEither
, decodeStringWithReplacement
, prettyPrintString
, prettyPrintStringJS
, mkString
) where
import Prelude.Compat
import GHC.Generics (Generic)
import Control.DeepSeq (NFData)
import Control.Exception (try, evaluate)
import Control.Applicative ((<|>))
import Data.Char (chr)
import Data.Bits (shiftR)
import Data.List (unfoldr)
import Data.Monoid ((<>))
import Data.Scientific (toBoundedInteger)
import Data.String (IsString(..))
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf16BE)
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Vector as V
import Data.Word (Word16, Word8)
import Numeric (showHex)
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.Aeson as A
import qualified Data.Aeson.Types as A
newtype PSString = PSString { toUTF16CodeUnits :: [Word16] }
deriving (Eq, Ord, Monoid, Generic)
instance NFData PSString
instance Show PSString where
show = show . codePoints
codePoints :: PSString -> String
codePoints = map (either (chr . fromIntegral) id) . decodeStringEither
decodeStringWithReplacement :: PSString -> String
decodeStringWithReplacement = map (either (const '\xFFFD') id) . decodeStringEither
decodeStringEither :: PSString -> [Either Word16 Char]
decodeStringEither = unfoldr decode . toUTF16CodeUnits
where
decode :: [Word16] -> Maybe (Either Word16 Char, [Word16])
decode (h:l:rest) | isLead h && isTrail l = Just (Right (unsurrogate h l), rest)
decode (c:rest) | isSurrogate c = Just (Left c, rest)
decode (c:rest) = Just (Right (toChar c), rest)
decode [] = Nothing
unsurrogate :: Word16 -> Word16 -> Char
unsurrogate h l = toEnum ((toInt h - 0xD800) * 0x400 + (toInt l - 0xDC00) + 0x10000)
prettyPrintString :: PSString -> Text
prettyPrintString = T.pack . show
decodeString :: PSString -> Maybe Text
decodeString = hush . decodeEither . BS.pack . concatMap unpair . toUTF16CodeUnits
where
unpair w = [highByte w, lowByte w]
lowByte :: Word16 -> Word8
lowByte = fromIntegral
highByte :: Word16 -> Word8
highByte = fromIntegral . (`shiftR` 8)
decodeEither :: ByteString -> Either UnicodeException Text
decodeEither = unsafePerformIO . try . evaluate . decodeUtf16BE
hush = either (const Nothing) Just
instance IsString PSString where
fromString a = PSString $ concatMap encodeUTF16 a
where
surrogates :: Char -> (Word16, Word16)
surrogates c = (toWord (h + 0xD800), toWord (l + 0xDC00))
where (h, l) = divMod (fromEnum c - 0x10000) 0x400
encodeUTF16 :: Char -> [Word16]
encodeUTF16 c | fromEnum c > 0xFFFF = [high, low]
where (high, low) = surrogates c
encodeUTF16 c = [toWord $ fromEnum c]
instance A.ToJSON PSString where
toJSON str =
case decodeString str of
Just t -> A.toJSON t
Nothing -> A.toJSON (toUTF16CodeUnits str)
instance A.FromJSON PSString where
parseJSON a = jsonString <|> arrayOfCodeUnits
where
jsonString = fromString <$> A.parseJSON a
arrayOfCodeUnits = PSString <$> parseArrayOfCodeUnits a
parseArrayOfCodeUnits :: A.Value -> A.Parser [Word16]
parseArrayOfCodeUnits = A.withArray "array of UTF-16 code units" (traverse parseCodeUnit . V.toList)
parseCodeUnit :: A.Value -> A.Parser Word16
parseCodeUnit b = A.withScientific "two-byte non-negative integer" (maybe (A.typeMismatch "" b) return . toBoundedInteger) b
prettyPrintStringJS :: PSString -> Text
prettyPrintStringJS s = "\"" <> foldMap encodeChar (toUTF16CodeUnits s) <> "\""
where
encodeChar :: Word16 -> Text
encodeChar c | c > 0xFF = "\\u" <> hex 4 c
encodeChar c | c > 0x7E || c < 0x20 = "\\x" <> hex 2 c
encodeChar c | toChar c == '\b' = "\\b"
encodeChar c | toChar c == '\t' = "\\t"
encodeChar c | toChar c == '\n' = "\\n"
encodeChar c | toChar c == '\v' = "\\v"
encodeChar c | toChar c == '\f' = "\\f"
encodeChar c | toChar c == '\r' = "\\r"
encodeChar c | toChar c == '"' = "\\\""
encodeChar c | toChar c == '\\' = "\\\\"
encodeChar c = T.singleton $ toChar c
hex :: (Enum a) => Int -> a -> Text
hex width c =
let hs = showHex (fromEnum c) "" in
T.pack (replicate (width - length hs) '0' <> hs)
isLead :: Word16 -> Bool
isLead h = h >= 0xD800 && h <= 0xDBFF
isTrail :: Word16 -> Bool
isTrail l = l >= 0xDC00 && l <= 0xDFFF
isSurrogate :: Word16 -> Bool
isSurrogate c = isLead c || isTrail c
toChar :: Word16 -> Char
toChar = toEnum . fromIntegral
toWord :: Int -> Word16
toWord = fromIntegral
toInt :: Word16 -> Int
toInt = fromIntegral
mkString :: Text -> PSString
mkString = fromString . T.unpack