{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Language.PureScript.PSString
( PSString
, toUTF16CodeUnits
, decodeString
, decodeStringEither
, decodeStringWithReplacement
, prettyPrintString
, prettyPrintStringJS
, mkString
) where
import Prelude.Compat
import GHC.Generics (Generic)
import Codec.Serialise (Serialise)
import Control.DeepSeq (NFData)
import Control.Exception (try, evaluate)
import Control.Applicative ((<|>))
import qualified Data.Char as Char
import Data.Bits (shiftR)
import Data.List (unfoldr)
import Data.Scientific (toBoundedInteger)
import Data.String (IsString(..))
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf16BE)
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Vector as V
import Data.Word (Word16, Word8)
import Numeric (showHex)
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.Aeson as A
import qualified Data.Aeson.Types as A
newtype PSString = PSString { toUTF16CodeUnits :: [Word16] }
deriving (Eq, Ord, Semigroup, Monoid, Generic)
instance NFData PSString
instance Serialise PSString
instance Show PSString where
show = show . codePoints
codePoints :: PSString -> String
codePoints = map (either (Char.chr . fromIntegral) id) . decodeStringEither
decodeStringWithReplacement :: PSString -> String
decodeStringWithReplacement = map (either (const '\xFFFD') id) . decodeStringEither
decodeStringEither :: PSString -> [Either Word16 Char]
decodeStringEither = unfoldr decode . toUTF16CodeUnits
where
decode :: [Word16] -> Maybe (Either Word16 Char, [Word16])
decode (h:l:rest) | isLead h && isTrail l = Just (Right (unsurrogate h l), rest)
decode (c:rest) | isSurrogate c = Just (Left c, rest)
decode (c:rest) = Just (Right (toChar c), rest)
decode [] = Nothing
unsurrogate :: Word16 -> Word16 -> Char
unsurrogate h l = toEnum ((toInt h - 0xD800) * 0x400 + (toInt l - 0xDC00) + 0x10000)
decodeString :: PSString -> Maybe Text
decodeString = hush . decodeEither . BS.pack . concatMap unpair . toUTF16CodeUnits
where
unpair w = [highByte w, lowByte w]
lowByte :: Word16 -> Word8
lowByte = fromIntegral
highByte :: Word16 -> Word8
highByte = fromIntegral . (`shiftR` 8)
decodeEither :: ByteString -> Either UnicodeException Text
decodeEither = unsafePerformIO . try . evaluate . decodeUtf16BE
hush = either (const Nothing) Just
instance IsString PSString where
fromString a = PSString $ concatMap encodeUTF16 a
where
surrogates :: Char -> (Word16, Word16)
surrogates c = (toWord (h + 0xD800), toWord (l + 0xDC00))
where (h, l) = divMod (fromEnum c - 0x10000) 0x400
encodeUTF16 :: Char -> [Word16]
encodeUTF16 c | fromEnum c > 0xFFFF = [high, low]
where (high, low) = surrogates c
encodeUTF16 c = [toWord $ fromEnum c]
instance A.ToJSON PSString where
toJSON str =
case decodeString str of
Just t -> A.toJSON t
Nothing -> A.toJSON (toUTF16CodeUnits str)
instance A.FromJSON PSString where
parseJSON a = jsonString <|> arrayOfCodeUnits
where
jsonString = fromString <$> A.parseJSON a
arrayOfCodeUnits = PSString <$> parseArrayOfCodeUnits a
parseArrayOfCodeUnits :: A.Value -> A.Parser [Word16]
parseArrayOfCodeUnits = A.withArray "array of UTF-16 code units" (traverse parseCodeUnit . V.toList)
parseCodeUnit :: A.Value -> A.Parser Word16
parseCodeUnit b = A.withScientific "two-byte non-negative integer" (maybe (A.typeMismatch "" b) return . toBoundedInteger) b
prettyPrintString :: PSString -> Text
prettyPrintString s = "\"" <> foldMap encodeChar (decodeStringEither s) <> "\""
where
encodeChar :: Either Word16 Char -> Text
encodeChar (Left c) = "\\x" <> showHex' 6 c
encodeChar (Right c)
| c == '\t' = "\\t"
| c == '\r' = "\\r"
| c == '\n' = "\\n"
| c == '"' = "\\\""
| c == '\'' = "\\\'"
| c == '\\' = "\\\\"
| shouldPrint c = T.singleton c
| otherwise = "\\x" <> showHex' 6 (Char.ord c)
shouldPrint :: Char -> Bool
shouldPrint ' ' = True
shouldPrint c =
Char.generalCategory c `elem`
[ Char.UppercaseLetter
, Char.LowercaseLetter
, Char.TitlecaseLetter
, Char.OtherLetter
, Char.DecimalNumber
, Char.LetterNumber
, Char.OtherNumber
, Char.ConnectorPunctuation
, Char.DashPunctuation
, Char.OpenPunctuation
, Char.InitialQuote
, Char.FinalQuote
, Char.OtherPunctuation
, Char.MathSymbol
, Char.CurrencySymbol
, Char.ModifierSymbol
, Char.OtherSymbol
]
prettyPrintStringJS :: PSString -> Text
prettyPrintStringJS s = "\"" <> foldMap encodeChar (toUTF16CodeUnits s) <> "\""
where
encodeChar :: Word16 -> Text
encodeChar c | c > 0xFF = "\\u" <> showHex' 4 c
encodeChar c | c > 0x7E || c < 0x20 = "\\x" <> showHex' 2 c
encodeChar c | toChar c == '\b' = "\\b"
encodeChar c | toChar c == '\t' = "\\t"
encodeChar c | toChar c == '\n' = "\\n"
encodeChar c | toChar c == '\v' = "\\v"
encodeChar c | toChar c == '\f' = "\\f"
encodeChar c | toChar c == '\r' = "\\r"
encodeChar c | toChar c == '"' = "\\\""
encodeChar c | toChar c == '\\' = "\\\\"
encodeChar c = T.singleton $ toChar c
showHex' :: Enum a => Int -> a -> Text
showHex' width c =
let hs = showHex (fromEnum c) "" in
T.pack (replicate (width - length hs) '0' <> hs)
isLead :: Word16 -> Bool
isLead h = h >= 0xD800 && h <= 0xDBFF
isTrail :: Word16 -> Bool
isTrail l = l >= 0xDC00 && l <= 0xDFFF
isSurrogate :: Word16 -> Bool
isSurrogate c = isLead c || isTrail c
toChar :: Word16 -> Char
toChar = toEnum . fromIntegral
toWord :: Int -> Word16
toWord = fromIntegral
toInt :: Word16 -> Int
toInt = fromIntegral
mkString :: Text -> PSString
mkString = fromString . T.unpack