module Bio.Util.Text ( Unpack(..) , w2c, c2w , decodeBytes , encodeBytes , decompressGzip ) where import BasePrelude import Data.ByteString.Internal ( c2w, w2c ) import Data.Text.Encoding ( encodeUtf8, decodeUtf8With ) import qualified Codec.Compression.Zlib.Internal as Z import qualified Data.ByteString.Char8 as S import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Internal as L ( ByteString(..) ) import qualified Data.Text as T -- | Class of things that can be unpacked into 'String's. Kind of the -- opposite of 'IsString'. class Unpack s where unpack :: s -> String instance Unpack S.ByteString where unpack = S.unpack instance Unpack T.Text where unpack = T.unpack instance Unpack String where unpack = id -- | Converts 'Bytes' into 'Text'. This uses UTF8, but if there is an -- error, it pretends it was Latin1. Evil as this is, it tends to Just -- Work on files where nobody ever wasted a thought on encodings. decodeBytes :: S.ByteString -> T.Text decodeBytes = decodeUtf8With (const $ fmap w2c) -- | Converts 'Text' into 'Bytes'. This uses UTF8. encodeBytes :: T.Text -> S.ByteString encodeBytes = encodeUtf8 -- | Decompresses Gzip or Bgzf and passes everything else on. In -- reality, it simply decompresses Gzip, and when done, looks for -- another Gzip stream. Since there is a small chance to attempt -- decompression of an uncompressed stream, the original data is -- returned in case of an error. decompressGzip :: L.ByteString -> L.ByteString decompressGzip s = case L.uncons s of Just (31, s') -> case L.uncons s' of Just (139,_) -> Z.foldDecompressStreamWithInput L.Chunk decompressGzip (const s) (Z.decompressST Z.gzipOrZlibFormat Z.defaultDecompressParams) s _ -> s _ -> s