{-# LANGUAGE ForeignFunctionInterface, CPP #-}
-- |
-- Module      : Data.Text.ICU.Convert
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- Character set conversion functions for Unicode, implemented as
-- bindings to the International Components for Unicode (ICU)
-- libraries.
module Data.Text.ICU.Convert
    (
    -- * Character set conversion
      Converter
    -- ** Basic functions
    , open
    , fromUnicode
    , toUnicode
    -- ** Converter metadata
    , getName
    , usesFallback
    , isAmbiguous
    -- * Functions for controlling global behavior
    , getDefaultName
    , setDefaultName
    -- * Miscellaneous functions
    , compareNames
    , aliases
    -- * Metadata
    , converterNames
    , standardNames
    ) where

import Data.ByteString.Internal (ByteString, createAndTrim)
import Data.ByteString.Unsafe (unsafeUseAsCStringLen)
import Data.Int (Int32)
import Data.Text (Text)
import Data.Text.Foreign (fromPtr, useAsPtr)
#if !MIN_VERSION_text(2,0,0)
import Data.Text.ICU.Internal (UChar)
#endif
import Data.Text.ICU.Internal (lengthWord)
import Data.Text.ICU.Convert.Internal
import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
import Data.Word (Word8, Word16)
import Foreign.C.String (CString, peekCString, withCString)
import Foreign.C.Types (CInt(..))
import Foreign.Marshal.Array (allocaArray)
import Foreign.Ptr (FunPtr, Ptr)
import System.IO.Unsafe (unsafePerformIO)
import Data.Text.ICU.Internal (UBool, asBool, asOrdering, withName, newICUPtr)

-- | Do a fuzzy compare of two converter/alias names.  The comparison
-- is case-insensitive, ignores leading zeroes if they are not
-- followed by further digits, and ignores all but letters and digits.
-- Thus the strings @\"UTF-8\"@, @\"utf_8\"@, @\"u*T\@f08\"@ and
-- @\"Utf 8\"@ are exactly equivalent.  See section 1.4, Charset Alias
-- Matching in Unicode Technical Standard #22 at
-- <http://www.unicode.org/reports/tr22/>
compareNames :: String -> String -> Ordering
compareNames :: String -> String -> Ordering
compareNames String
a String
b =
  forall a. IO a -> a
unsafePerformIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. String -> (CString -> IO a) -> IO a
withCString String
a forall a b. (a -> b) -> a -> b
$ \CString
aptr ->
    forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a. Integral a => a -> Ordering
asOrdering forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. String -> (CString -> IO a) -> IO a
withCString String
b forall a b. (a -> b) -> a -> b
$ CString -> CString -> IO CInt
ucnv_compareNames CString
aptr

-- | Create a 'Converter' with the name of a coded character set
-- specified as a string.  The actual name will be resolved with the
-- alias file using a case-insensitive string comparison that ignores
-- leading zeroes and all non-alphanumeric characters.  E.g., the
-- names @\"UTF8\"@, @\"utf-8\"@, @\"u*T\@f08\"@ and @\"Utf 8\"@ are
-- all equivalent (see also 'compareNames').  If an empty string is
-- passed for the converter name, it will create one with the
-- 'getDefaultName' return value.
--
-- A converter name may contain options like a locale specification to
-- control the specific behavior of the newly instantiated converter.
-- The meaning of the options depends on the particular converter.  If
-- an option is not defined for or recognized by a given converter,
-- then it is ignored.
--
-- Options are appended to the converter name string, with a comma
-- between the name and the first option and also between adjacent
-- options.
--
-- If the alias is ambiguous, then the preferred converter is used.
--
-- The conversion behavior and names can vary between platforms. ICU
-- may convert some characters differently from other
-- platforms. Details on this topic are in the ICU User's Guide at
-- <http://icu-project.org/userguide/conversion.html>. Aliases
-- starting with a @\"cp\"@ prefix have no specific meaning other than
-- its an alias starting with the letters @\"cp\"@. Please do not
-- associate any meaning to these aliases.
open :: String                  -- ^ Name of the converter to use.
     -> Maybe Bool              -- ^ Whether to use fallback mappings
                                -- (see 'usesFallback' for details).
     -> IO Converter
open :: String -> Maybe Bool -> IO Converter
open String
name Maybe Bool
mf = do
  Converter
c <- forall a i.
(ForeignPtr a -> i) -> FinalizerPtr a -> IO (Ptr a) -> IO i
newICUPtr ForeignPtr UConverter -> Converter
Converter FunPtr (Ptr UConverter -> IO ())
ucnv_close forall a b. (a -> b) -> a -> b
$ forall a. String -> (CString -> IO a) -> IO a
withName String
name (forall a. (Ptr CInt -> IO a) -> IO a
handleError forall b c a. (b -> c) -> (a -> b) -> a -> c
. CString -> Ptr CInt -> IO (Ptr UConverter)
ucnv_open)
  case Maybe Bool
mf of
    Just Bool
f -> forall a. Converter -> (Ptr UConverter -> IO a) -> IO a
withConverter Converter
c forall a b. (a -> b) -> a -> b
$ \Ptr UConverter
p -> Ptr UConverter -> UBool -> IO ()
ucnv_setFallback Ptr UConverter
p forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Enum a => a -> Int
fromEnum forall a b. (a -> b) -> a -> b
$ Bool
f
    Maybe Bool
_ -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
  forall (m :: * -> *) a. Monad m => a -> m a
return Converter
c

-- | Encode a Unicode string into a code page string using the given converter.
fromUnicode :: Converter -> Text -> ByteString
fromUnicode :: Converter -> Text -> ByteString
fromUnicode Converter
cnv Text
t =
  forall a. IO a -> a
unsafePerformIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Text -> (Ptr Word16 -> I16 -> IO a) -> IO a
useAsPtr Text
t forall a b. (a -> b) -> a -> b
$ \Ptr Word16
tptr I16
tlen ->
    forall a. Converter -> (Ptr UConverter -> IO a) -> IO a
withConverter Converter
cnv forall a b. (a -> b) -> a -> b
$ \Ptr UConverter
cptr -> do
      let capacity :: Int32
capacity = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UConverter -> CInt -> CInt
ucnv_max_bytes_for_string Ptr UConverter
cptr forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$
                     Text -> Int
lengthWord Text
t
      Int -> (Ptr Word8 -> IO Int) -> IO ByteString
createAndTrim (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int32
capacity) forall a b. (a -> b) -> a -> b
$ \Ptr Word8
sptr ->
        forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. (Ptr CInt -> IO a) -> IO a
handleError forall a b. (a -> b) -> a -> b
$
#if MIN_VERSION_text(2,0,0)
           ucnv_fromAlgorithmic_UTF8
#else
           Ptr UConverter
-> Ptr Word8
-> Int32
-> Ptr Word16
-> Int32
-> Ptr CInt
-> IO Int32
ucnv_fromUChars
#endif
           Ptr UConverter
cptr Ptr Word8
sptr Int32
capacity Ptr Word16
tptr (forall a b. (Integral a, Num b) => a -> b
fromIntegral I16
tlen)

-- | Decode an encoded string into a Unicode string using the given converter.
toUnicode :: Converter -> ByteString -> Text
toUnicode :: Converter -> ByteString -> Text
toUnicode Converter
cnv ByteString
bs =
  forall a. IO a -> a
unsafePerformIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. ByteString -> (CStringLen -> IO a) -> IO a
unsafeUseAsCStringLen ByteString
bs forall a b. (a -> b) -> a -> b
$ \(CString
sptr, Int
slen) ->
    forall a. Converter -> (Ptr UConverter -> IO a) -> IO a
withConverter Converter
cnv forall a b. (a -> b) -> a -> b
$ \Ptr UConverter
cptr -> do
      let (Int
capacity, Ptr UConverter
-> Ptr Word16 -> Int32 -> CString -> Int32 -> Ptr CInt -> IO Int32
conv) =
#if MIN_VERSION_text(2,0,0)
              (slen * 4, ucnv_toAlgorithmic_UTF8)
#else
              (Int
slen forall a. Num a => a -> a -> a
* Int
2, Ptr UConverter
-> Ptr Word16 -> Int32 -> CString -> Int32 -> Ptr CInt -> IO Int32
ucnv_toUChars)
#endif
      forall a b. Storable a => Int -> (Ptr a -> IO b) -> IO b
allocaArray Int
capacity forall a b. (a -> b) -> a -> b
$ \Ptr Word16
tptr ->
        Ptr Word16 -> I16 -> IO Text
fromPtr Ptr Word16
tptr forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. (Ptr CInt -> IO a) -> IO a
handleError forall a b. (a -> b) -> a -> b
$
                          Ptr UConverter
-> Ptr Word16 -> Int32 -> CString -> Int32 -> Ptr CInt -> IO Int32
conv Ptr UConverter
cptr Ptr Word16
tptr (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
capacity) CString
sptr
                                        (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
slen))

-- | Determines whether the converter uses fallback mappings or not.
-- This flag has restrictions.  Regardless of this flag, the converter
-- will always use fallbacks from Unicode Private Use codepoints, as
-- well as reverse fallbacks (to Unicode).  For details see \".ucm
-- File Format\" in the Conversion Data chapter of the ICU User Guide:
-- <http://www.icu-project.org/userguide/conversion-data.html#ucmformat>
usesFallback :: Converter -> Bool
usesFallback :: Converter -> Bool
usesFallback Converter
cnv = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$
                   forall a. Integral a => a -> Bool
asBool forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` forall a. Converter -> (Ptr UConverter -> IO a) -> IO a
withConverter Converter
cnv Ptr UConverter -> IO UBool
ucnv_usesFallback

-- | Returns the current default converter name. If you want to 'open'
-- a default converter, you do not need to use this function.  It is
-- faster to pass the empty string to 'open' the default converter.
getDefaultName :: IO String
getDefaultName :: IO String
getDefaultName = CString -> IO String
peekCString forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< IO CString
ucnv_getDefaultName

-- | Indicates whether the converter contains ambiguous mappings of
-- the same character or not.
isAmbiguous :: Converter -> Bool
isAmbiguous :: Converter -> Bool
isAmbiguous Converter
cnv = forall a. Integral a => a -> Bool
asBool forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ forall a. Converter -> (Ptr UConverter -> IO a) -> IO a
withConverter Converter
cnv Ptr UConverter -> IO UBool
ucnv_isAmbiguous

-- | Sets the current default converter name. If this function needs
-- to be called, it should be called during application
-- initialization. Most of the time, the results from 'getDefaultName'
-- or 'open' with an empty string argument is sufficient for your
-- application.
--
-- /Note/: this function is not thread safe. /Do not/ call this
-- function when /any/ ICU function is being used from more than one
-- thread!
setDefaultName :: String -> IO ()
setDefaultName :: String -> IO ()
setDefaultName String
s = forall a. String -> (CString -> IO a) -> IO a
withCString String
s forall a b. (a -> b) -> a -> b
$ CString -> IO ()
ucnv_setDefaultName

-- | A list of the canonical names of all available converters.
converterNames :: [String]
{-# NOINLINE converterNames #-}
converterNames :: [String]
converterNames = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM ((CString -> IO String
peekCString forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<<) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int32 -> IO CString
ucnv_getAvailableName) [Int32
0..Int32
ucnv_countAvailableforall a. Num a => a -> a -> a
-Int32
1]

-- | The list of supported standard names.
standardNames :: [String]
{-# NOINLINE standardNames #-}
standardNames :: [String]
standardNames = forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Foldable t => t a -> Bool
null) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM ((CString -> IO String
peekCString forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<<) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. (Ptr CInt -> IO a) -> IO a
handleError forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word16 -> Ptr CInt -> IO CString
ucnv_getStandard) [Word16
0..Word16
ucnv_countStandardsforall a. Num a => a -> a -> a
-Word16
1]

-- | Return the aliases for a given converter or alias name.
aliases :: String -> [String]
aliases :: String -> [String]
aliases String
name = forall a. IO a -> a
unsafePerformIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. String -> (CString -> IO a) -> IO a
withCString String
name forall a b. (a -> b) -> a -> b
$ \CString
ptr -> do
  Word16
count <- forall a. (Ptr CInt -> IO a) -> IO a
handleError forall a b. (a -> b) -> a -> b
$ CString -> Ptr CInt -> IO Word16
ucnv_countAliases CString
ptr
  if Word16
count forall a. Eq a => a -> a -> Bool
== Word16
0
    then forall (m :: * -> *) a. Monad m => a -> m a
return []
    else forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM ((CString -> IO String
peekCString forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<<) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. (Ptr CInt -> IO a) -> IO a
handleError forall b c a. (b -> c) -> (a -> b) -> a -> c
. CString -> Word16 -> Ptr CInt -> IO CString
ucnv_getAlias CString
ptr) [Word16
0..Word16
countforall a. Num a => a -> a -> a
-Word16
1]

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_open" ucnv_open
    :: CString -> Ptr UErrorCode -> IO (Ptr UConverter)

foreign import ccall unsafe "hs_text_icu.h &__hs_ucnv_close" ucnv_close
    :: FunPtr (Ptr UConverter -> IO ())

foreign import ccall unsafe "__hs_ucnv_get_max_bytes_for_string" ucnv_max_bytes_for_string
    :: Ptr UConverter -> CInt -> CInt

#if MIN_VERSION_text(2,0,0)

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toAlgorithmic_UTF8" ucnv_toAlgorithmic_UTF8
    :: Ptr UConverter -> Ptr Word8 -> Int32 -> CString -> Int32
    -> Ptr UErrorCode -> IO Int32

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromAlgorithmic_UTF8" ucnv_fromAlgorithmic_UTF8
    :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32
    -> Ptr UErrorCode -> IO Int32

#else

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toUChars" ucnv_toUChars
    :: Ptr UConverter -> Ptr UChar -> Int32 -> CString -> Int32
    -> Ptr UErrorCode -> IO Int32

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromUChars" ucnv_fromUChars
    :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr UChar -> Int32
    -> Ptr UErrorCode -> IO Int32

#endif

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_compareNames" ucnv_compareNames
    :: CString -> CString -> IO CInt

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getDefaultName" ucnv_getDefaultName
    :: IO CString

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setDefaultName" ucnv_setDefaultName
    :: CString -> IO ()

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAvailable" ucnv_countAvailable
    :: Int32

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAvailableName" ucnv_getAvailableName
    :: Int32 -> IO CString

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAliases" ucnv_countAliases
    :: CString -> Ptr UErrorCode -> IO Word16

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAlias" ucnv_getAlias
    :: CString -> Word16 -> Ptr UErrorCode -> IO CString

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countStandards" ucnv_countStandards
    :: Word16

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getStandard" ucnv_getStandard
    :: Word16 -> Ptr UErrorCode -> IO CString

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_usesFallback" ucnv_usesFallback
    :: Ptr UConverter -> IO UBool

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setFallback" ucnv_setFallback
    :: Ptr UConverter -> UBool -> IO ()

foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_isAmbiguous" ucnv_isAmbiguous
    :: Ptr UConverter -> IO UBool