src/Darcs/Util/ByteString.hs

{-# LANGUAGE CPP #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Darcs.Util.ByteString
-- Copyright   :  (c) The University of Glasgow 2001,
--                    David Roundy 2003-2005
-- License : GPL (I'm happy to also license this file BSD style but don't
--           want to bother distributing two license files with darcs.
--
-- Maintainer  :  droundy@abridgegame.org
-- Stability   :  experimental
-- Portability :  portable
--
-- GZIp and MMap IO for ByteStrings, encoding utilities, and miscellaneous
-- functions for Data.ByteString
--
module Darcs.Util.ByteString
    (
    -- * IO with mmap or gzip
      gzReadFilePS
    , mmapFilePS
    , gzWriteFilePS
    , gzWriteFilePSs
    , gzReadStdin
    , gzWriteHandle
    , FileSegment
    , readSegment
    -- * gzip handling
    , isGZFile
    , gzDecompress
    -- * list utilities
    , dropSpace
    , breakSpace
    , linesPS
    , unlinesPS
    , hashPS
    , breakFirstPS
    , breakLastPS
    , substrPS
    , readIntPS
    , isFunky
    , fromHex2PS
    , fromPS2Hex
    , betweenLinesPS
    , intercalate
    -- * encoding and unicode utilities
    , isAscii
    , decodeLocale
    , encodeLocale
    , unpackPSFromUTF8
    , packStringToUTF8
    -- * properties
    , prop_unlinesPS_linesPS_left_inverse
    , prop_linesPS_length
    , prop_unlinesPS_length
    , propHexConversion
    , spec_betweenLinesPS
    ) where

import Prelude ()
import Darcs.Prelude

import Codec.Binary.Base16 ( b16Enc, b16Dec )

import qualified Data.ByteString            as B
import qualified Data.ByteString.Char8      as BC
import qualified Data.ByteString.Lazy       as BL
import Data.ByteString (intercalate)

import System.IO ( withFile, IOMode(ReadMode)
                 , hSeek, SeekMode(SeekFromEnd,AbsoluteSeek)
                 , openBinaryFile, hClose, Handle, hGetChar
                 , stdin)
import System.IO.Error          ( catchIOError )
import System.IO.Unsafe         ( unsafePerformIO )

import Data.Bits                ( rotateL )
import Data.Char                ( ord, isSpace, toLower, toUpper )
import Data.Word                ( Word8 )
import Data.Int                 ( Int32, Int64 )
import Data.List                ( intersperse )
import Control.Monad            ( when )
import Control.Monad.ST.Lazy    ( ST )

import qualified Codec.Compression.GZip as GZ
import qualified Codec.Compression.Zlib.Internal as ZI
import Darcs.Util.Encoding ( decode, encode, decodeUtf8, encodeUtf8 )
import Darcs.Util.Global ( addCRCWarning )

#if mingw32_HOST_OS
#else
import System.IO.MMap( mmapFileByteString )
#endif
import System.Mem( performGC )
import System.Posix.Files( fileSize, getSymbolicLinkStatus )

-- | readIntPS skips any whitespace at the beginning of its argument, and
-- reads an Int from the beginning of the PackedString.  If there is no
-- integer at the beginning of the string, it returns Nothing, otherwise it
-- just returns the int read, along with a B.ByteString containing the
-- remainder of its input.

readIntPS :: B.ByteString -> Maybe (Int, B.ByteString)
readIntPS = BC.readInt . BC.dropWhile isSpace

------------------------------------------------------------------------
-- A locale-independent isspace(3) so patches are interpreted the same everywhere.
-- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
isSpaceWord8 :: Word8 -> Bool
isSpaceWord8 = (`elem` [0x20, 0x09, 0x0A, 0x0D])
{-# INLINE isSpaceWord8 #-}

-- | Drop leading white space, where white space is defined as
-- consisting of ' ', '\t', '\n', or '\r'.
dropSpace :: B.ByteString -> B.ByteString
dropSpace bs = B.dropWhile isSpaceWord8 bs

-- | Split at first occurrence of ' ', '\t', '\n', or '\r'.
breakSpace :: B.ByteString -> (B.ByteString, B.ByteString)
breakSpace bs = B.break isSpaceWord8 bs

------------------------------------------------------------------------

{-# INLINE isFunky #-}
isFunky :: B.ByteString -> Bool
isFunky ps = 0 `B.elem` ps || 26 `B.elem` ps

------------------------------------------------------------------------

{-# INLINE hashPS #-}
hashPS :: B.ByteString -> Int32
hashPS = B.foldl' hashByte 0

{-# INLINE hashByte #-}
hashByte :: Int32 -> Word8 -> Int32
hashByte h x = fromIntegral x + rotateL h 8

{-# INLINE substrPS #-}
substrPS :: B.ByteString -> B.ByteString -> Maybe Int
substrPS tok str
    | B.null tok = Just 0
    | B.length tok > B.length str = Nothing
    | otherwise = do n <- B.elemIndex (B.head tok) str
                     let ttok = B.tail tok
                         reststr = B.drop (n+1) str
                     if ttok == B.take (B.length ttok) reststr
                        then Just n
                        else ((n+1)+) `fmap` substrPS tok reststr

------------------------------------------------------------------------

-- TODO: replace breakFirstPS and breakLastPS with definitions based on
-- ByteString's break/breakEnd
{-# INLINE breakFirstPS #-}
breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
breakFirstPS c p = case BC.elemIndex c p of
                   Nothing -> Nothing
                   Just n -> Just (B.take n p, B.drop (n+1) p)

{-# INLINE breakLastPS #-}
breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
breakLastPS c p = case BC.elemIndexEnd c p of
                  Nothing -> Nothing
                  Just n -> Just (B.take n p, B.drop (n+1) p)

------------------------------------------------------------------------
-- linesPS and unlinesPS

{-# INLINE linesPS #-}
linesPS :: B.ByteString -> [B.ByteString]
linesPS ps
     | B.null ps = [B.empty]
     | otherwise = BC.split '\n' ps

{-# INLINE unlinesPS #-}
unlinesPS :: [B.ByteString] -> B.ByteString
unlinesPS [] = B.empty
unlinesPS x  = B.concat $ intersperse (BC.singleton '\n') x

-- properties of linesPS and unlinesPS

prop_unlinesPS_linesPS_left_inverse :: B.ByteString -> Bool
prop_unlinesPS_linesPS_left_inverse x = unlinesPS (linesPS x) == x

prop_linesPS_length :: B.ByteString -> Bool
prop_linesPS_length x = length (linesPS x) == length (BC.elemIndices '\n' x) + 1

prop_unlinesPS_length :: [B.ByteString] -> Bool
prop_unlinesPS_length xs =
  B.length (unlinesPS xs) == if null xs then 0 else sum (map B.length xs) + length xs - 1

-- -----------------------------------------------------------------------------
-- gzReadFilePS

-- |Decompress the given bytestring into a lazy list of chunks, along with a boolean
-- flag indicating (if True) that the CRC was corrupted.
-- Inspecting the flag will cause the entire list of chunks to be evaluated (but if
-- you throw away the list immediately this should run in constant space).
gzDecompress :: Maybe Int -> BL.ByteString -> ([B.ByteString], Bool)
gzDecompress mbufsize =
    -- This is what the code would be without the bad CRC recovery logic:
    -- return . BL.toChunks . GZ.decompressWith decompressParams
    decompressWarn (ZI.decompressST ZI.gzipFormat decompressParams)
  where
        decompressParams = case mbufsize of
                              Just bufsize -> GZ.defaultDecompressParams { GZ.decompressBufferSize = bufsize }
                              Nothing -> GZ.defaultDecompressParams

        decompressWarn :: (forall s . ZI.DecompressStream (ST s)) -> BL.ByteString -> ([B.ByteString], Bool)
        decompressWarn = ZI.foldDecompressStreamWithInput
                           (\x ~(xs, b) -> (x:xs, b))
                           (\xs -> if BL.null xs
                                      then ([], False)
                                      else error "trailing data at end of compressed stream"
                           )
                           handleBad

        -- For a while a bug in darcs caused gzip files with good data but bad CRCs to be
        -- produced. Trap bad CRC messages, run the specified action to report that it happened,
        -- but continue on the assumption that the data is valid.
        handleBad (ZI.DataFormatError "incorrect data check") = ([], True)
        handleBad e = error (show e)

isGZFile :: FilePath -> IO (Maybe Int)
isGZFile f = do
    h <- openBinaryFile f ReadMode
    header <- B.hGet h 2
    if header /= B.pack [31,139]
       then do hClose h
               return Nothing
       else do hSeek h SeekFromEnd (-4)
               len <- hGetLittleEndInt h
               hClose h
               return (Just len)

-- | Read an entire file, which may or may not be gzip compressed, directly
-- into a 'B.ByteString'.
gzReadFilePS :: FilePath -> IO B.ByteString
gzReadFilePS f = do
    mlen <- isGZFile f
    case mlen of
       Nothing -> mmapFilePS f
       Just len ->
            do -- Passing the length to gzDecompress means that it produces produces one chunk,
               -- which in turn means that B.concat won't need to copy data.
               -- If the length is wrong this will just affect efficiency, not correctness
               let doDecompress buf = let (res, bad) = gzDecompress (Just len) buf
                                      in do when bad $ addCRCWarning f
                                            return res
               compressed <- (BL.fromChunks . return) `fmap` mmapFilePS f
               B.concat `fmap` doDecompress compressed

hGetLittleEndInt :: Handle -> IO Int
hGetLittleEndInt h = do
    b1 <- ord `fmap` hGetChar h
    b2 <- ord `fmap` hGetChar h
    b3 <- ord `fmap` hGetChar h
    b4 <- ord `fmap` hGetChar h
    return $ b1 + 256*b2 + 65536*b3 + 16777216*b4

gzWriteFilePS :: FilePath -> B.ByteString -> IO ()
gzWriteFilePS f ps = gzWriteFilePSs f [ps]

gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO ()
gzWriteFilePSs f pss  =
    BL.writeFile f $ GZ.compress $ BL.fromChunks pss

gzWriteHandle :: Handle -> [B.ByteString] -> IO ()
gzWriteHandle h pss  =
    BL.hPut h $ GZ.compress $ BL.fromChunks pss

-- | Read standard input, which may or may not be gzip compressed, directly
-- into a 'B.ByteString'.
gzReadStdin :: IO B.ByteString
gzReadStdin = do
    header <- B.hGet stdin 2
    rest   <- B.hGetContents stdin
    let allStdin = B.concat [header,rest]
    return $
     if header /= B.pack [31,139]
      then allStdin
      else let decompress = fst . gzDecompress Nothing
               compressed = BL.fromChunks [allStdin]
           in
           B.concat $ decompress compressed

-- | Pointer to a filesystem, possibly with start/end offsets. Supposed to be
-- fed to (uncurry mmapFileByteString) or similar.
type FileSegment = (FilePath, Maybe (Int64, Int))

-- | Read in a FileSegment into a Lazy ByteString. Implemented using mmap.
readSegment :: FileSegment -> IO BL.ByteString
readSegment (f,range) = do
    bs <- tryToRead
       `catchIOError` (\_ -> do
                     size <- fileSize `fmap` getSymbolicLinkStatus f
                     if size == 0
                        then return B.empty
                        else performGC >> tryToRead)
    return $ BL.fromChunks [bs]
  where
    tryToRead =
        case range of
            Nothing -> B.readFile f
            Just (off, size) -> withFile f ReadMode $ \h -> do
                hSeek h AbsoluteSeek $ fromIntegral off
                B.hGet h size
{-# INLINE readSegment #-}

-- -----------------------------------------------------------------------------
-- mmapFilePS

-- | Like readFilePS, this reads an entire file directly into a
-- 'B.ByteString', but it is even more efficient.  It involves directly
-- mapping the file to memory.  This has the advantage that the contents of
-- the file never need to be copied.  Also, under memory pressure the page
-- may simply be discarded, wile in the case of readFilePS it would need to
-- be written to swap.  If you read many small files, mmapFilePS will be
-- less memory-efficient than readFilePS, since each mmapFilePS takes up a
-- separate page of memory.  Also, you can run into bus errors if the file
-- is modified.

mmapFilePS :: FilePath -> IO B.ByteString
#if mingw32_HOST_OS
mmapFilePS = B.readFile
#else
mmapFilePS f =
  mmapFileByteString f Nothing
   `catchIOError` (\_ -> do
                     size <- fileSize `fmap` getSymbolicLinkStatus f
                     if size == 0
                        then return B.empty
                        else performGC >> mmapFileByteString f Nothing)
#endif

-- -------------------------------------------------------------------------
-- fromPS2Hex

fromPS2Hex :: B.ByteString -> B.ByteString
fromPS2Hex = BC.map toLower . b16Enc

-- -------------------------------------------------------------------------
-- fromHex2PS

fromHex2PS :: B.ByteString -> B.ByteString
fromHex2PS s =
  case b16Dec $ BC.map toUpper s of
    Right (result, remaining)
      | B.null remaining -> result
    _ -> error "fromHex2PS: input is not hex encoded"

propHexConversion :: B.ByteString -> Bool
propHexConversion x = fromHex2PS (fromPS2Hex x) == x

-- -------------------------------------------------------------------------
-- betweenLinesPS

-- | Return the B.ByteString between the two lines given,
-- or Nothing if they do not appear.
betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString
               -> Maybe B.ByteString
betweenLinesPS start end ps =
  case B.breakSubstring start_line ps of
    (before_start, at_start)
      | not (B.null at_start)
      , B.null before_start || BC.last before_start == '\n' ->
          case B.breakSubstring end_line (B.drop (B.length start_line) at_start) of
            (before_end, at_end)
              | not (B.null at_end)
              , B.null before_end || BC.last before_end == '\n' -> Just before_end
              | otherwise -> Nothing
      | otherwise -> Nothing
  where
    start_line = BC.snoc start '\n'
    end_line = BC.snoc end '\n'

-- | Simpler but less efficient variant of 'betweenLinesPS'.
spec_betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString
                    -> Maybe B.ByteString
spec_betweenLinesPS start end ps =
  case break (start ==) (linesPS ps) of
    (_, _:after_start) ->
      case break (end ==) after_start of
        (before_end, _:_) ->
          Just $ BC.unlines before_end
        _ -> Nothing
    _ -> Nothing

-- | Test if a ByteString is made of ascii characters
isAscii :: B.ByteString -> Bool
isAscii = B.all (< 128)

-- * Encoding functions

-- Use of 'unsafePerformIO' is ratified by the fact that these
-- really are pure functions.

-- | Decode a 'ByteString' containing UTF-8 to a 'String'. Decoding errors
-- are flagged with the U+FFFD character.
unpackPSFromUTF8 :: B.ByteString -> String
unpackPSFromUTF8  = unsafePerformIO . decodeUtf8

-- | Encode a 'String' to a 'ByteString' using UTF-8.
packStringToUTF8 :: String -> B.ByteString
packStringToUTF8 = unsafePerformIO . encodeUtf8

-- | Decode a 'ByteString' to a 'String' according to the current locale,
-- using lone surrogates for un-decodable bytes.
decodeLocale :: B.ByteString -> String
decodeLocale = unsafePerformIO . decode

-- | Encode a 'String' to a 'ByteString' according to the current locale,
-- converting lone surrogates back to the original byte. If that
-- fails (because the locale does not support the full unicode range)
-- then encode using utf-8, assuming that the un-ecodable characters
-- come from patch meta data.
--
-- See also 'Darcs.UI.Commands.setEnvCautiously'.
encodeLocale :: String -> B.ByteString
encodeLocale s = unsafePerformIO $ encode s `catchIOError` (\_ -> encodeUtf8 s)