{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- | Reverse bits
--
-- There are several algorithms performing the same thing here (reversing bits
-- into words of different sizes). There are benchmarks for them in the 
-- "bench" directory. The fastest one for the current architecture should be
-- selected below. If you find that another algorithm is faster on your
-- architecture, please report it.
module Haskus.Format.Binary.Bits.Reverse
   ( 
   -- * Generic
     ReversableBits (..)
   , reverseBitsGeneric
   -- * Algorithms
   , reverseBitsObvious
   , reverseBits3Ops
   , reverseBits4Ops
   , reverseBitsTable
   , reverseBits7Ops
   , reverseBits5LgN
   , liftReverseBits
   )
where

import Haskus.Format.Binary.Buffer
import Haskus.Format.Binary.Word
import Haskus.Format.Binary.Bits.Finite
import Haskus.Format.Binary.Bits.Shift
import Haskus.Format.Binary.Bits.Bitwise
import Haskus.Format.Binary.Bits.Index
import Haskus.Utils.Types (KnownNat)

---------------------------------------------------
-- Generic and specialized reverseBits
---------------------------------------------------


-- | Reverse bits in a Word
reverseBitsGeneric ::
   ( FiniteBits a
   , Integral a
   , ShiftableBits a
   , Bitwise a
   , KnownNat (BitSize a)
   ) => a -> a
reverseBitsGeneric = liftReverseBits reverseBits4Ops

-- | Data whose bits can be reversed
class ReversableBits w where
   reverseBits :: w -> w

instance ReversableBits Word8 where
   reverseBits = reverseBits4Ops

instance ReversableBits Word16 where
   reverseBits = reverseBits5LgN

instance ReversableBits Word32 where
   reverseBits = reverseBits5LgN

instance ReversableBits Word64 where
   reverseBits = reverseBits5LgN

instance ReversableBits Word where
   reverseBits = reverseBits5LgN

instance ReversableBits Int8 where
   reverseBits = fromIntegral . reverseBits4Ops . fromIntegral

instance ReversableBits Int16 where
   reverseBits = reverseBits5LgN

instance ReversableBits Int32 where
   reverseBits = reverseBits5LgN

instance ReversableBits Int64 where
   reverseBits = reverseBits5LgN

instance ReversableBits Int where
   reverseBits = reverseBits5LgN


---------------------------------------------------
-- Bit reversal algorithms
---------------------------------------------------

-- Algorithms and explanations adapted from:
-- http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits

-- Reverse the bits the obvious way
-- ================================
--
--
-- unsigned int v;     // input bits to be reversed
-- unsigned int r = v; // r will be reversed bits of v; first get LSB of v
-- int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end
-- 
-- for (v >>= 1; v; v >>= 1)
-- {   
--   r <<= 1;
--   r |= v & 1;
--   s--;
-- }
-- r <<= s; // shift when v's highest bits are zero
--
-- On October 15, 2004, Michael Hoisie pointed out a bug in the original
-- version. Randal E. Bryant suggested removing an extra operation on May 3,
-- 2005. Behdad Esfabod suggested a slight change that eliminated one iteration
-- of the loop on May 18, 2005. Then, on February 6, 2007, Liyong Zhou
-- suggested a better version that loops while v is not 0, so rather than
-- iterating over all bits it stops early. 

-- | Obvious recursive version
reverseBitsObvious :: forall a.
   ( FiniteBits a
   , ShiftableBits a
   , IndexableBits a
   , Bitwise a
   , KnownNat (BitSize a)
   , Eq a
   ) => a -> a
reverseBitsObvious x = rec x (x `shiftR` 1) (bitSize x - 1)
   where
      rec :: FiniteBits a => a -> a -> Word -> a
      rec !r !v !s 
         | v == zeroBits = r `shiftL` s
         | otherwise     = rec ((r `shiftL` 1) .|. (v .&. bit 0)) (v `shiftR` 1) (s - 1)

{-# SPECIALIZE reverseBitsObvious :: Word8  -> Word8  #-}
{-# SPECIALIZE reverseBitsObvious :: Word16 -> Word16 #-}
{-# SPECIALIZE reverseBitsObvious :: Word32 -> Word32 #-}
{-# SPECIALIZE reverseBitsObvious :: Word64 -> Word64 #-}

-- Reverse the bits in a byte with 3 operations (64-bit multiply and modulus division) 
-- ===================================================================================
-- 
-- unsigned char b; // reverse this (8-bit) byte
--  
-- b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
-- 
-- The multiply operation creates five separate copies of the 8-bit byte
-- pattern to fan-out into a 64-bit value. The AND operation selects the bits
-- that are in the correct (reversed) positions, relative to each 10-bit groups
-- of bits. The multiply and the AND operations copy the bits from the original
-- byte so they each appear in only one of the 10-bit sets. The reversed
-- positions of the bits from the original byte coincide with their relative
-- positions within any 10-bit set. The last step, which involves modulus
-- division by 2^10 - 1, has the effect of merging together each set of 10 bits
-- (from positions 0-9, 10-19, 20-29, ...) in the 64-bit value. They do not
-- overlap, so the addition steps underlying the modulus division behave like
-- or operations.
-- 
-- This method was attributed to Rich Schroeppel in the Programming Hacks
-- section of Beeler, M., Gosper, R. W., and Schroeppel, R. HAKMEM. MIT AI Memo
-- 239, Feb. 29, 1972.

-- | Reverse bits in a Word8 (3 64-bit operations, modulus division)
reverseBits3Ops :: Word8 -> Word8
{-# INLINE reverseBits3Ops #-}
reverseBits3Ops x = fromIntegral x'
   where
      !x' = ((fromIntegral x * 0x0202020202 :: Word64) .&. 0x010884422010) `mod` 1023


-- Reverse the bits in a byte with 4 operations (64-bit multiply, no division) 
-- ===========================================================================
--
-- unsigned char b; // reverse this (8-bit) byte
--  
-- b = ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-- 
-- The following shows the flow of the bit values with the boolean variables a,
-- b, c, d, e, f, g, and h, which comprise an 8-bit byte. Notice how the first
-- multiply fans out the bit pattern to multiple copies, while the last
-- multiply combines them in the fifth byte from the right. 
--
--
--                                                                                         abcd efgh (-> hgfe dcba)
-- *                                                      1000 0000  0010 0000  0000 1000  0000 0010 (0x80200802)
-- -------------------------------------------------------------------------------------------------
--                                             0abc defg  h00a bcde  fgh0 0abc  defg h00a  bcde fgh0
-- &                                           0000 1000  1000 0100  0100 0010  0010 0001  0001 0000 (0x0884422110)
-- -------------------------------------------------------------------------------------------------
--                                             0000 d000  h000 0c00  0g00 00b0  00f0 000a  000e 0000
-- *                                           0000 0001  0000 0001  0000 0001  0000 0001  0000 0001 (0x0101010101)
-- -------------------------------------------------------------------------------------------------
--                                             0000 d000  h000 0c00  0g00 00b0  00f0 000a  000e 0000
--                                  0000 d000  h000 0c00  0g00 00b0  00f0 000a  000e 0000
--                       0000 d000  h000 0c00  0g00 00b0  00f0 000a  000e 0000
--            0000 d000  h000 0c00  0g00 00b0  00f0 000a  000e 0000
-- 0000 d000  h000 0c00  0g00 00b0  00f0 000a  000e 0000
-- -------------------------------------------------------------------------------------------------
-- 0000 d000  h000 dc00  hg00 dcb0  hgf0 dcba  hgfe dcba  hgfe 0cba  0gfe 00ba  00fe 000a  000e 0000
-- >> 32
-- -------------------------------------------------------------------------------------------------
--                                             0000 d000  h000 dc00  hg00 dcb0  hgf0 dcba  hgfe dcba  
-- &                                                                                       1111 1111
-- -------------------------------------------------------------------------------------------------
--                                                                                         hgfe dcba
-- Note that the last two steps can be combined on some processors because the
-- registers can be accessed as bytes; just multiply so that a register stores
-- the upper 32 bits of the result and the take the low byte. Thus, it may take
-- only 6 operations.
-- 
-- Devised by Sean Anderson, July 13, 2001. 

-- | Reverse bits in a Word8 (4 64-bit operations, no division)
reverseBits4Ops :: Word8 -> Word8
{-# INLINE reverseBits4Ops #-}
reverseBits4Ops x = fromIntegral x'
   where
      !x' = (((fromIntegral x * 0x80200802 :: Word64) .&. 0x0884422110) * 0x0101010101) `shiftR` 32


-- Reverse bits using a lookup table
-- =================================

-- | Reverse bits using a lookup table
reverseBitsTable :: Word8 -> Word8
{-# INLINE reverseBitsTable #-}
reverseBitsTable x = bitsTable `bufferIndex` (fromIntegral x)


-- fill the table by using another method
bitsTable :: Buffer
bitsTable = bufferPackByteList $ fmap reverseBits4Ops [0..255]

-- Reverse the bits in a byte with 7 operations (no 64-bit)
-- ========================================================
-- 
-- b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16; 
-- 
-- Make sure you assign or cast the result to an unsigned char to remove
-- garbage in the higher bits. Devised by Sean Anderson, July 13, 2001. Typo
-- spotted and correction supplied by Mike Keith, January 3, 2002. 


-- | Reverse bits in a Word8 (7 no 64-bit operations, no division)
reverseBits7Ops :: Word8 -> Word8
{-# INLINE reverseBits7Ops #-}
reverseBits7Ops b' = fromIntegral x'
   where
      b   = fromIntegral b' :: Word32
      !x' = ((((b * 0x0802) .&. 0x22110) .|. ((b * 0x8020) .&. 0x88440)) * 0x10101) `shiftR` 16


-- Reverse an N-bit quantity in parallel in 5 * lg(N) operations
-- =============================================================
-- 
-- unsigned int v; // 32-bit word to reverse bit order
-- 
-- // swap odd and even bits
-- v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-- // swap consecutive pairs
-- v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-- // swap nibbles ... 
-- v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-- // swap bytes
-- v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-- // swap 2-byte long pairs
-- v = ( v >> 16             ) | ( v               << 16);
-- 
-- The following variation is also O(lg(N)), however it requires more
-- operations to reverse v. Its virtue is in taking less slightly memory by
-- computing the constants on the fly.
-- 
-- unsigned int s = sizeof(v) * CHAR_BIT; // bit size; must be power of 2 
-- unsigned int mask = ~0;         
-- while ((s >>= 1) > 0) 
-- {
--   mask ^= (mask << s);
--   v = ((v >> s) & mask) | ((v << s) & ~mask);
-- }
-- 
-- These methods above are best suited to situations where N is large. If you
-- use the above with 64-bit ints (or larger), then you need to add more lines
-- (following the pattern); otherwise only the lower 32 bits will be reversed
-- and the result will be in the lower 32 bits.
-- 
-- See Dr. Dobb's Journal 1983, Edwin Freed's article on Binary Magic Numbers
-- for more information. The second variation was suggested by Ken Raeburn on
-- September 13, 2005. Veldmeijer mentioned that the first version could do
-- without ANDS in the last line on March 19, 2006. 

-- | "Parallel" recursive version
reverseBits5LgN :: forall a.
   ( FiniteBits a
   , ShiftableBits a
   , Bitwise a
   , KnownNat (BitSize a)
   ) => a -> a
reverseBits5LgN x = rec (bitSize x `shiftR` 1) (complement zeroBits) x
   where
      rec :: FiniteBits a => Word -> a -> a -> a
      rec !s !mask !v
         | s <= 0        = v
         | otherwise     = rec (s `shiftR` 1) mask' v'
            where
               mask' = mask `xor` (mask `shiftL` s)
               v'    =      ((v `shiftR` s) .&. mask')
                        .|. ((v `shiftL` s) .&. complement mask')

{-# SPECIALIZE reverseBits5LgN :: Word8  -> Word8  #-}
{-# SPECIALIZE reverseBits5LgN :: Word16 -> Word16 #-}
{-# SPECIALIZE reverseBits5LgN :: Word32 -> Word32 #-}
{-# SPECIALIZE reverseBits5LgN :: Word64 -> Word64 #-}


-- | Convert a function working on Word8 to one working on any Word
--
-- The number of bits in the Word must be a multiple of 8
liftReverseBits ::
   ( ShiftableBits a
   , Bitwise a
   , FiniteBits a
   , Integral a
   , KnownNat (BitSize a)
   ) => (Word8 -> Word8) -> a -> a
liftReverseBits f w = rec zeroBits 0
   where
      nb = bitSize w `shiftR` 3 -- div 8
      f' = fromIntegral . f . fromIntegral
      rec !v !o
         | o == nb    = v
         | otherwise = rec v' (o+1)
               where
                  -- multiplication by 8 replaced with (`shiftL` 3)
                  v' = v .|. ((f' (w `shiftR` (o `shiftL` 3))) `shiftL` ((nb-1-o) `shiftL` 3))

{-# SPECIALIZE liftReverseBits :: (Word8 -> Word8) -> Word8  -> Word8  #-}
{-# SPECIALIZE liftReverseBits :: (Word8 -> Word8) -> Word16 -> Word16 #-}
{-# SPECIALIZE liftReverseBits :: (Word8 -> Word8) -> Word32 -> Word32 #-}
{-# SPECIALIZE liftReverseBits :: (Word8 -> Word8) -> Word64 -> Word64 #-}