src/Blip/Marshal.hs

{-# LANGUAGE RecordWildCards #-}
-----------------------------------------------------------------------------
-- |
-- Module      : Blip.Marshal
-- Copyright   : (c) 2012, 2013 Bernie Pope
-- License     : BSD-style
-- Maintainer  : florbitous@gmail.com
-- Stability   : experimental
-- Portability : ghc
--
-- Reading, writing and representation of Python bytecode files.
--
-----------------------------------------------------------------------------

module Blip.Marshal
   ( readPyc, writePyc, PycFile (..), PyObject (..), CodeObjectFlagMask
   , co_optimized, co_newlocals, co_varargs, co_varkeywords
   , co_nested, co_generator, co_nofree )
   where

import Blip.MarshalDouble (bytesToDouble, doubleToBytes)
import Blip.Bytecode (decode, BytecodeSeq (..))
import Blip.Pretty (Pretty (..), prettyList, prettyTuple)
import Control.Applicative ((<$>), (<*>))
import Data.Map as Map hiding (map, size, empty)
import Data.Word (Word8, Word16, Word32)
import Control.Monad.Error (ErrorT (..), lift, replicateM)
import System.IO (Handle)
import qualified Data.ByteString.Lazy as B 
   (ByteString, hGetContents, unpack, hPutStr, length)
import Data.ByteString.Lazy.UTF8 as UTF8 (toString, fromString) 
import Data.Binary.Get (Get, runGet, getLazyByteString, getWord32le, getWord8, getWord16le)
import Data.Binary.Put (PutM, putWord32le, putLazyByteString, runPutM, putWord8, putWord16le)
import Data.Bits ((.&.))
import Data.Int (Int64, Int32)
import Data.Char (chr, ord)
import Text.PrettyPrint
   (text, (<+>), ($$), (<>), Doc , vcat, int, equals, doubleQuotes, hsep, empty)

data PycFile =
   PycFile
   { magic :: Word32
   , modified_time :: Word32
   , size :: Word32           -- the size in bytes of the original Python source
   , object :: PyObject       -- a code object
   }
   deriving Show

instance Pretty PycFile where
   pretty pycFile = 
      text "magic =" <+> pretty (magic pycFile) $$
      text "modified time =" <+> pretty (modified_time pycFile) $$
      text "size =" <+> pretty (size pycFile) $$
      text "object =" <+> pretty (object pycFile)

data PyObject
   = Code
        { argcount :: !Word32 -- #arguments, except *args
        , kwonlyargcount :: !Word32 -- #keyword only arguments 
        , nlocals :: !Word32 -- #local variables
        , stacksize :: !Word32 -- #entries needed for evaluation stack
        , flags :: !Word32 -- CO_..., see below
        , code :: !PyObject -- instruction opcodes (a string)
        , consts :: !PyObject -- list (constants used) XXX seems to be a tuple
        , names :: !PyObject -- list of strings (names used)
        , varnames :: !PyObject -- tuple of strings (local variable names)
        , freevars :: !PyObject -- tuple of strings (free variable names)
        , cellvars :: !PyObject -- tuple of strings (cell variable names)
        , filename :: !PyObject -- unicode (where it was loaded from)
        , name :: !PyObject -- unicode (name, for reference)
        , firstlineno :: !Word32 -- first source line number
        , lnotab :: !PyObject -- string (encoding addr<->lineno mapping)
        }
   | String { string :: !B.ByteString }
   | Tuple { elements :: ![PyObject] }
   | Int { int_value :: !Word32 }  -- XXX should that be Int32?
   | Float { float_value :: !Double }
   | None
   | Ellipsis
   | Unicode { unicode :: !String } -- should be decoded into a String
   | TrueObj
   | FalseObj
   | Complex { real :: !Double, imaginary :: !Double }
   | Long { long_value :: !Integer }
   deriving (Eq, Ord, Show)

instance Pretty PyObject where
   pretty (String {..}) = doubleQuotes $ pretty string
   pretty (Tuple {..}) = prettyTuple $ map pretty elements
   pretty (Int {..}) = pretty int_value
   pretty (Long {..}) = pretty long_value
   pretty (Float {..}) = pretty float_value
   pretty None = text "None"
   pretty Ellipsis = text "..."
   pretty TrueObj = text "True"
   pretty FalseObj = text "False"
   pretty (Unicode {..}) = doubleQuotes $ text unicode
   pretty (Code {..}) =
      text "argcount =" <+> pretty argcount $$
      text "kwonlyargcount =" <+> pretty kwonlyargcount $$
      text "nlocals =" <+> pretty nlocals $$
      text "stacksize =" <+> pretty stacksize $$
      text "flags =" <+> prettyFlags flags $$
      text "varnames =" <+> pretty varnames $$
      text "freevars =" <+> pretty freevars $$
      text "cellvars =" <+> pretty cellvars $$
      text "filename =" <+> pretty filename $$
      text "name =" <+> pretty name $$
      text "firstlineno =" <+> pretty firstlineno $$
      text "lnotab =" <+> prettyLnotab lnotab $$
      text "names =" <+> pretty names $$
      prettyConsts consts $$ 
      text "code =" <+> pretty (BytecodeSeq $ decode $ string code)
   pretty (Complex {..}) = pretty real <+> text "+" <+> pretty imaginary <> text "j"

prettyConsts :: PyObject -> Doc
prettyConsts obj =
   case obj of
      Tuple {..} -> 
         vcat $ map prettyConst $ zip [0..] elements
      _other -> text ("consts not a tuple: " ++ show obj)
   where
   prettyConst :: (Int, PyObject) -> Doc
   prettyConst (i, obj) = text "const" <+> int i <+> equals <+> pretty obj 

prettyLnotab :: PyObject -> Doc
prettyLnotab obj =
   case obj of
      String {..} -> prettyList $ map pretty $ pairs $ B.unpack string
      _other -> text ("lnotab not a string: " ++ show obj)
   where
   pairs :: [Word8] -> [(Word8, Word8)]
   pairs [] = []
   pairs [_] = error $ "Odd numbered linenotab"
   pairs (nextCode:nextLine:rest) = (nextCode, nextLine) : pairs rest

readPyc :: Handle -> IO PycFile
readPyc handle = do
   bytes <- B.hGetContents handle
   runGetDataCheck getPycFile bytes

writePyc :: Handle -> PycFile -> IO ()
writePyc handle pycFile = do
   bytes <- runPutDataCheck $ putPycFile pycFile
   B.hPutStr handle bytes

getPycFile :: GetData PycFile
getPycFile = PycFile <$> getU32 <*> getU32 <*> getU32 <*> readObject

putPycFile :: PycFile -> PutData
putPycFile pycFile = do
   putU32 $ magic pycFile
   putU32 $ modified_time pycFile
   putU32 $ size pycFile
   writeObject $ object pycFile

readObject :: GetData PyObject
readObject = do
   object_type <- decodeObjectType <$> getU8
   case object_type of
       CODE -> readCodeObject
       STRING -> readStringObject
       TUPLE -> readTupleObject
       INT -> readIntObject
       NONE -> return None 
       ELLIPSIS -> return Ellipsis
       TRUE -> return TrueObj
       FALSE -> return FalseObj
       UNICODE -> readUnicodeObject
       BINARY_FLOAT -> readFloatObject
       BINARY_COMPLEX -> readComplexObject
       LONG -> readLongObject
       _other -> error ("readObject: unsupported object type: " ++ show object_type)

writeObject :: PyObject -> PutData
writeObject object =
   case object of
      Code {..} -> writeCodeObject object
      String {..} -> writeStringObject object
      Tuple {..} -> writeTupleObject object
      Int {..} -> writeIntObject object
      None -> putU8 $ encodeObjectType NONE
      Ellipsis -> putU8 $ encodeObjectType ELLIPSIS
      Unicode {..} -> writeUnicodeObject object
      TrueObj -> putU8 $ encodeObjectType TRUE
      FalseObj -> putU8 $ encodeObjectType FALSE
      Float {..} -> writeFloatObject object
      Complex {..} -> writeComplexObject object
      Long {..} -> writeLongObject object

writeObjectType :: ObjectType -> PutData
writeObjectType = putU8 . encodeObjectType

readCodeObject :: GetData PyObject
readCodeObject =
   Code <$> getU32 <*> getU32 <*> getU32 <*> getU32 <*> getU32 <*> 
      readObject <*> readObject <*> readObject <*> readObject <*>
      readObject <*> readObject <*> readObject <*> readObject <*>
      getU32 <*> readObject

writeCodeObject :: PyObject -> PutData
writeCodeObject (Code {..}) =
   writeObjectType CODE >>
   mapM_ putU32 [argcount, kwonlyargcount, nlocals, stacksize, flags] >>
   mapM_ writeObject [code, consts, names, varnames, freevars, cellvars,
      filename, name] >>
   putU32 firstlineno >>
   writeObject lnotab
writeCodeObject other = error $ "writeCodeObject called on non code object: " ++ show other

readStringObject :: GetData PyObject
readStringObject = do
   len <- getU32
   String <$> (getBS $ fromIntegral len)

writeStringObject :: PyObject -> PutData
writeStringObject (String {..}) = 
   writeObjectType STRING >>
   putU32 (fromIntegral $ B.length string) >>
   putBS string
writeStringObject other = error $ "writStringObject called on non string object: " ++ show other

readTupleObject :: GetData PyObject
readTupleObject = do
   len <- getU32
   Tuple <$> replicateM (fromIntegral len) readObject

writeTupleObject :: PyObject -> PutData
writeTupleObject (Tuple {..}) =
   writeObjectType TUPLE >>
   putU32 (fromIntegral $ length elements) >>
   mapM_ writeObject elements
writeTupleObject other = error $ "writeTupleObject called on non tuple object: " ++ show other

readIntObject :: GetData PyObject
readIntObject = Int <$> getU32

writeIntObject :: PyObject -> PutData
writeIntObject (Int {..}) = 
   writeObjectType INT >> putU32 int_value
writeIntObject other = error $ "writeIntObject called on non int object: " ++ show other

readFloatObject :: GetData PyObject
readFloatObject = Float <$> getDouble

readComplexObject :: GetData PyObject
readComplexObject = Complex <$> getDouble <*> getDouble

writeFloatObject :: PyObject -> PutData
writeFloatObject (Float {..}) = 
   writeObjectType BINARY_FLOAT >> putDouble float_value
writeFloatObject other = error $ "writeFloatObject called on non float object: " ++ show other

writeComplexObject :: PyObject -> PutData
writeComplexObject (Complex {..}) =
   writeObjectType BINARY_COMPLEX >> putDouble real >> putDouble imaginary
writeComplexObject other = error $ "writeComplexObject called on non complex object: " ++ show other

readUnicodeObject :: GetData PyObject
readUnicodeObject = do
   len <- getU32
   bs <- getBS $ fromIntegral len
   return $ Unicode $ UTF8.toString bs

writeUnicodeObject :: PyObject -> PutData
writeUnicodeObject (Unicode {..}) = do
   writeObjectType UNICODE
   let uc = UTF8.fromString unicode 
   putU32 (fromIntegral $ B.length uc)
   putBS uc
writeUnicodeObject other = error $ "writeUnicodeObject called on non unicode object: " ++ show other

longDigitBase :: Integer
longDigitBase = 2^(15::Integer)

readLongObject :: GetData PyObject
readLongObject = do
   -- Read the len as a signed 32bit integer.
   -- The sign tells us whether the number is positive or negative.
   -- The magnitude tells us how many digits are in the number.
   -- Digits are stored in base 2^15, in 16 bit chunks.
   len <- getI32 
   if len == 0
      then return $ Long 0
      else do
         -- read 'len' digits
         base15digits <- replicateM (fromIntegral (abs len)) getU16
         -- pair each digit with its exponent
         -- [(d0, 0), (d1, 1), ... (dn, n)]
         let digitsExponents = zip (map fromIntegral base15digits) [(0::Integer) ..]
         -- (d0 * base^0) + (d1 * base^1) + ... + ... (dn * base^n)
             val = sum [(longDigitBase ^ exp) * digit | (digit, exp) <- digitsExponents]
         -- Check if the result is positive or negative based on the sign of len.
         if len < 0
            then return $! Long $! negate val
            else return $! Long val

writeLongObject :: PyObject -> PutData
writeLongObject (Long {..}) = do
   writeObjectType LONG 
   case compare long_value 0 of
      EQ -> putI32 0
      GT -> do 
         putI32 numDigits 
         mapM_ putU16 digits
      LT -> do
         putI32 $ negate numDigits 
         mapM_ putU16 digits
   where
   digits :: [Word16]
   digits = getDigits (abs long_value) longDigitBase
   numDigits :: Int32 
   numDigits = fromIntegral $ length digits
   -- We assume n > 0
   getDigits :: Integer -> Integer -> [Word16]
   getDigits 0 _base = []
   getDigits n base = (fromIntegral (n `mod` base)) : getDigits (n `div` base) base
writeLongObject other = error $ "writeLongObject called on non long object: " ++ show other

data ObjectType
   = NULL               -- '0'
   | NONE               -- 'N'
   | FALSE              -- 'F'
   | TRUE               -- 'T'
   | STOPITER           -- 'S'
   | ELLIPSIS           -- '.'
   | INT                -- 'i'

   | INT64              -- 'I' INT64 is deprecated. It is not,
                        -- generated anymore, and support for reading it
                        -- will be removed in Python 3.4.

   | FLOAT              -- 'f'
   | BINARY_FLOAT       -- 'g'
   | COMPLEX            -- 'x'
   | BINARY_COMPLEX     -- 'y'
   | LONG               -- 'l'
   | STRING             -- 's'
   | TUPLE              -- '('
   | LIST               -- '['
   | DICT               -- '{'
   | CODE               -- 'c'
   | UNICODE            -- 'u'
   | UNKNOWN            -- '?'
   | SET                -- '<'
   | FROZENSET          -- '>'
   deriving (Eq, Ord, Show)

charToObjectType :: Map.Map Char ObjectType
charToObjectType = Map.fromList objectTypeList 

objectTypeToChar :: Map.Map ObjectType Char
objectTypeToChar = Map.fromList [ (y, x) | (x, y) <- objectTypeList ]

objectTypeList :: [(Char, ObjectType)]
objectTypeList = [
   ('0', NULL), 
   ('N', NONE),
   ('F', FALSE), 
   ('T', TRUE),
   ('S', STOPITER),
   ('.', ELLIPSIS), 
   ('i', INT),
   ('I', INT64),
   ('f', FLOAT),
   ('g', BINARY_FLOAT),
   ('x', COMPLEX),
   ('y', BINARY_COMPLEX),
   ('l', LONG),
   ('s', STRING),
   ('(', TUPLE),
   ('[', LIST),
   ('{', DICT),
   ('c', CODE),
   ('u', UNICODE),
   ('?', UNKNOWN),
   ('<', SET),
   ('>', FROZENSET) ]

encodeObjectType :: ObjectType -> Word8
encodeObjectType objectType =
   case Map.lookup objectType objectTypeToChar of
      Nothing -> error $ "bad object type: " ++ show objectType
      Just chr -> fromIntegral $ ord chr 

decodeObjectType :: Word8 -> ObjectType
decodeObjectType byte =
   case Map.lookup byteChar charToObjectType of
      Nothing -> error $ "bad object type: " ++ show byteChar
      Just t -> t
   where
   byteChar = chr $ fromIntegral byte

-- utilities for reading binary data from a sequence of bytes
type GetData a = ErrorT String Get a

getDouble :: GetData Double
getDouble = do
   bs <- replicateM 8 getU8
   return $ bytesToDouble bs

getBS :: Int64 -> GetData B.ByteString
getBS = lift . getLazyByteString

-- read an unsigned 8 bit word
getU8 :: GetData Word8
getU8 = lift getWord8

-- XXX is it always little endian?
-- read an unsigned 32 bit word
getU32 :: GetData Word32
getU32 = lift getWord32le

getI32 :: GetData Int32
getI32 = fromIntegral `fmap` lift getWord32le

getU16 :: GetData Word16
getU16 = lift getWord16le

runGetData :: GetData a -> B.ByteString -> Either String a
runGetData = runGet . runErrorT

runGetDataCheck  :: GetData a -> B.ByteString -> IO a
runGetDataCheck g b =
   case runGetData g b of
      Left e -> fail e
      Right v -> return v

-- utilities for writing binary data to a sequence of bytes
type PutData = ErrorT String PutM ()

putDouble :: Double -> PutData
putDouble d = mapM_ putU8 $ doubleToBytes d

-- write a bytestring
putBS :: B.ByteString -> PutData
putBS = lift . putLazyByteString

-- write an unsigned 8 bit word
putU8 :: Word8 -> PutData
putU8 = lift . putWord8

-- write an unsigned 16 bit word
putU16 :: Word16 -> PutData
putU16 = lift . putWord16le

-- XXX is it always little endian?
-- write an unsigned 32 bit word
putU32 :: Word32 -> PutData
putU32 = lift . putWord32le

putI32 :: Int32 -> PutData
putI32 = putU32 . fromIntegral

runPutData :: PutData -> Either String B.ByteString
runPutData comp = 
   case runPutM (runErrorT comp) of
      (Left err, _) -> Left err
      (Right (), bs) -> Right bs 

runPutDataCheck  :: PutData -> IO B.ByteString
runPutDataCheck comp =
   case runPutData comp of
      Left e -> fail e
      Right bs -> return bs

-- masks for the code object flags
type CodeObjectFlagMask = Word32

co_optimized :: CodeObjectFlagMask
co_optimized = 0x0001
co_newlocals :: CodeObjectFlagMask 
co_newlocals = 0x0002
co_varargs :: CodeObjectFlagMask 
co_varargs = 0x0004
co_varkeywords :: CodeObjectFlagMask 
co_varkeywords = 0x0008
co_nested :: CodeObjectFlagMask 
co_nested = 0x0010
co_generator :: CodeObjectFlagMask 
co_generator = 0x0020
co_nofree :: CodeObjectFlagMask 
co_nofree = 0x0040

prettyFlags :: Word32 -> Doc
prettyFlags bits = 
   hsep $ map (uncurry showFlag) masks
   where
   checkFlag :: CodeObjectFlagMask -> Bool
   checkFlag mask = (bits .&. mask) /= 0
   showFlag :: CodeObjectFlagMask -> String -> Doc
   showFlag mask name
      | checkFlag mask = text name
      | otherwise = empty
   masks = [ (co_optimized, "CO_OPTIMIZED")
           , (co_newlocals, "CO_NEWLOCALS")
           , (co_varargs, "CO_VARARGS") 
           , (co_varkeywords, "CO_VARKEYWORDS")
           , (co_nested, "CO_NESTED") 
           , (co_generator, "CO_GENERATOR") 
           , (co_nofree, "CO_NOFREE") ]