module Bio.Util.Text
    ( Unpack(..)
    , w2c, c2w
    , decodeBytes
    , encodeBytes
    , decompressGzip
    ) where

import BasePrelude
import Data.ByteString.Internal     ( c2w, w2c )
import Data.Text.Encoding           ( encodeUtf8, decodeUtf8With )

import qualified Codec.Compression.Zlib.Internal as Z
import qualified Data.ByteString.Char8           as S
import qualified Data.ByteString.Lazy            as L
import qualified Data.ByteString.Lazy.Char8      as C ( unpack )
import qualified Data.ByteString.Lazy.Internal   as L ( ByteString(..) )
import qualified Data.Text                       as T

-- | Class of things that can be unpacked into 'String's.  Kind of the
-- opposite of 'IsString'.
class Unpack s where unpack :: s -> String

instance Unpack L.ByteString where unpack = C.unpack
instance Unpack S.ByteString where unpack = S.unpack
instance Unpack T.Text       where unpack = T.unpack
instance Unpack String       where unpack = id


-- | Converts 'Bytes' into 'Text'.  This uses UTF8, but if there is an
-- error, it pretends it was Latin1.  Evil as this is, it tends to Just
-- Work on files where nobody ever wasted a thought on encodings.
decodeBytes :: S.ByteString -> T.Text
decodeBytes = decodeUtf8With (const $ fmap w2c)

-- | Converts 'Text' into 'Bytes'.  This uses UTF8.
encodeBytes :: T.Text -> S.ByteString
encodeBytes = encodeUtf8


-- | Decompresses Gzip or Bgzf and passes everything else on.  In
-- reality, it simply decompresses Gzip, and when done, looks for
-- another Gzip stream.  Since there is a small chance to attempt
-- decompression of an uncompressed stream, the original data is
-- returned in case of an error.
decompressGzip :: L.ByteString -> L.ByteString
decompressGzip s = case L.uncons s of
    Just (31, s') -> case L.uncons s' of
        Just (139,_) -> Z.foldDecompressStreamWithInput L.Chunk decompressGzip (const s)
                        (Z.decompressST Z.gzipOrZlibFormat Z.defaultDecompressParams) s
        _            -> s
    _                -> s