{- Joel Svensson 2012, 2013, 2014 
   Mary Sheeran  2012

   Notes:
   2014-03-31: Merged Library and LibraryG 
   2013-01-24: GlobPull nolonger exists
               GlobPush is Push Grid
              
   2013-01-08: Renamed GlobArray to GlobPush 
   2013-01-02: Added toGlobArray and toGlobArrayN
   2012-12-10: Refactoring
               (adherence to new Array types and program types)  
-}

{-# LANGUAGE FlexibleInstances,
             TypeSynonymInstances,
             ScopedTypeVariables,
             TypeFamilies,
             GADTs #-}

module Obsidian.Library where 

import Obsidian.Array 
import Obsidian.Exp 
import Obsidian.Program
import Obsidian.Types

import Obsidian.Force

-- needed for threadsPerBlock analysis 
-- import qualified Obsidian.CodeGen.Program as P 

import Control.Monad

import Data.Bits 
import Data.Word

import Prelude hiding (splitAt,zipWith,replicate,reverse)

---------------------------------------------------------------------------
-- Reverse an array by indexing in it backwards
---------------------------------------------------------------------------

-- | Reverses a Pull array.
reverse :: ASize l => Pull l a -> Pull l a 
reverse arr = mkPull n (\ix -> arr ! ((sizeConv m) - ix))  
   where m = n-1
         n = len arr
         
---------------------------------------------------------------------------
-- splitAt (name clashes with Prelude.splitAt)
---------------------------------------------------------------------------

-- | Splits a Pull array at a given point. Performs no bounds checks. 
splitAt :: (Integral i, ASize l) => i -> Pull l a -> (Pull l a, Pull l a) 
splitAt n arr = (mkPull m (\ix -> arr ! ix), 
                 mkPull  (len arr - m) (\ix -> arr ! (ix + pos)))
  where pos = fromIntegral n
        m   = fromIntegral n

-- | Splits a Pull array in the middle. 
halve :: ASize l => Pull l a -> (Pull l a, Pull l a) 
halve arr = splitAt n2 arr
  where 
    n = len arr
    n2 = n `div` 2

-- | Splits a Pull array into chunks of size n. Result is a Pull of Pull arrays.
splitUp :: ASize l => Word32 -> Pull l a -> Pull l (SPull a)
splitUp n arr {-(Pull m ixf)-} =
  mkPull (len arr `div` fromIntegral n) $ \i ->
    mkPull n $ \j -> arr ! (i * (sizeConv n) + j)                                               

-- | Same as @splitUp@ but also performs a permutation of the elements. 
coalesce :: ASize l
         => Word32 -> Pull l a -> Pull l (Pull Word32 a)
coalesce n arr =
  mkPull s $ \i ->
    mkPull n $ \j -> arr ! (i + (sizeConv s) * j)
  where s = len arr `div` fromIntegral n
        
---------------------------------------------------------------------------
-- elements at even indices to fst output, odd to snd.
---------------------------------------------------------------------------
-- | Split a Pull array into its even and odd indexed elements.
evenOdds :: ASize l => Pull l a -> (Pull l a, Pull l a)
evenOdds arr = (mkPull (n-n2) (\ix -> arr ! (2*ix)) ,
                mkPull n2     (\ix -> arr ! (2*ix + 1)))
  where
    n  = len arr
    n2 = div n 2
-- | Extract the elements at even indices from a Pull array
evens :: ASize l => Pull l a -> Pull l a
evens = fst . evenOdds

-- | Extract the elements at odd indices from a Pull array 
odds :: ASize l => Pull l a -> Pull l a
odds  = snd . evenOdds

---------------------------------------------------------------------------
-- everyNth 
---------------------------------------------------------------------------
-- | Extract every nth element from a Pull array.
everyNth :: ASize l => Word32 -> Word32 -> Pull l a -> Pull l a
everyNth n m arr = mkPull n' $ \ix -> arr ! (ix * (fromIntegral n) + fromIntegral m)
  where
    n' = len arr `div` (fromIntegral n) 
  

---------------------------------------------------------------------------
-- replicate 
---------------------------------------------------------------------------
-- | Generates a Pull array of length one, containing @a@. 
singleton :: (Array a, ASize l) => e -> a l e
singleton a = replicate 1 a

-- | Generate a pull or push array usign a function from Index to element.
generate :: (Functor (a s), Array a, ASize s)
         => s -> (EWord32 -> b) -> a s b
generate n f = fmap f (iota n) 
---------------------------------------------------------------------------
-- last and first 
---------------------------------------------------------------------------
-- | Extract last element from a Pull array.
last :: ASize l => Pull l a -> a 
last arr = arr ! fromIntegral ( len arr - 1)

-- | Extract the first element from a Pull array.
first :: ASize l => Pull l a -> a
first arr = arr ! 0 

---------------------------------------------------------------------------
-- Take and Drop (what about strange sizes ? fix) 
---------------------------------------------------------------------------
-- | Take the first @n@ elements from a Pull array
take :: ASize l => l -> Pull l a -> Pull l a
take n arr = setSize n arr

-- | Drop the first @n@ elements from a Pull array
drop :: ASize l => l -> Pull l a -> Pull l a
drop n arr = setSize (len arr - n) $ ixMap (\ix -> ix + sizeConv n) arr

---------------------------------------------------------------------------
-- fold (sequential , unrolled)  
---------------------------------------------------------------------------
-- | Fold a nonempty pull array using a given operator. The result a singleton array (push or pull). 
fold1 :: Array a => (e -> e -> e) -> Pull Word32 e -> a Word32 e
fold1  f arr = replicate 1
               $ foldl1 f [arr ! (fromIntegral i) | i <- [0..(n-1)]]   
  where n = len arr


---------------------------------------------------------------------------
-- Shift arrays
---------------------------------------------------------------------------
-- shiftRight :: (ASize l, Choice a) => Word32 -> a -> Pull l a -> Pull l a
-- shiftRight dist elt arr = setSize (len arr)
--                           $ replicate (fromIntegral dist) elt `conc` arr

-- -- TODO: incorrect! 
-- shiftLeft :: (ASize l, Choice a) => Word32 -> a -> Pull l a -> Pull l a
-- shiftLeft dist elt arr = mkPull (len arr)
--                          $ \ix -> (arr `conc`  replicate (fromIntegral dist) elt)
--                                   ! (ix + fromIntegral dist) 
                         

-- opposite of evenOdds 
--shuffle :: ASize l => PT t -> Pull l a -> Pull l a -> Push t l a
--shuffle Block a1 a2 =
--  Push (len a1 + len a2) $
--    \ wf -> ForAll (sizeConv (len a1)) $
--            \ tid -> do
--              wf (a1 ! tid) (tid * 2) 
--              wf (a2 ! tid) (tid * 2 + 1) 


---------------------------------------------------------------------------
-- Concatenate the arrays
---------------------------------------------------------------------------
-- -- | Concatenate two Pull arrays (Potentially inefficient).
-- conc :: (ASize l, Choice a) => Pull l a -> Pull l a -> Pull l a 
-- conc a1 a2 = mkPull (n1+n2)
--                $ \ix -> ifThenElse (ix <* (sizeConv n1)) 
--                        (a1 ! ix) 
--                        (a2 ! (ix - (sizeConv n1)))
--   where 
--     n1 = len a1
--     n2 = len a2 

    
---------------------------------------------------------------------------
-- zipp unzipp
---------------------------------------------------------------------------
-- | Unzip implemented on Pull arrays    
unzip :: ASize l =>  Pull l (a,b) -> (Pull l a, Pull l b)       
unzip arr = (mkPull (len arr) (\ix -> fst (arr ! ix)) ,
             mkPull (len arr) (\ix -> snd (arr ! ix)) )

-- | Zip implemented on Pull arrays
zip :: ASize l => Pull l a -> Pull l b -> Pull l (a, b)             
zip arr1 arr2 = mkPull (min (len arr1) (len arr2))
                     $ \ix -> (arr1 ! ix, arr2 ! ix) 

-- | Unzip tripples. 
unzip3 :: ASize l => Pull l (a,b,c) 
           -> (Pull l a, Pull l b, Pull l c)       
unzip3 arr = (fmap (\(x,_,_) -> x) arr,
               fmap (\(_,y,_) -> y) arr,
               fmap (\(_,_,z) -> z)  arr)
             
-- | Zip three arrays
zip3 :: ASize l
        => Pull l a
        -> Pull l b
        -> Pull l c 
        -> Pull l (a,b,c)             
zip3 arr1 arr2 arr3 = 
  mkPull (minimum [len arr1, len arr2, len arr3])
  (\ix -> (arr1 ! ix, arr2 ! ix, arr3 ! ix))
    

-- | Perform elementwise operation.
zipWith :: ASize l => (a -> b -> c) -> Pull l a -> Pull l b -> Pull l c
zipWith op a1 a2 =  
  mkPull (min (len a1) (len a2))
  (\ix -> (a1 ! ix) `op` (a2 ! ix))

-- | Perform elementwise operation. 
zipWith3 :: ASize l => (a -> b -> c-> d) -> Pull l a -> Pull l b -> Pull l c -> Pull l d
zipWith3 f a1 a2 a3 =  
  mkPull (minimum [len a1,len a2,len a3]) $ 
    \ix -> f (a1 ! ix)  (a2 ! ix) (a3 ! ix)

  
---------------------------------------------------------------------------
-- pair 
---------------------------------------------------------------------------
-- | Pair up consecutive elements in a Pull array.
pair :: ASize l => Pull l a -> Pull l (a,a)
pair arr = 
  mkPull n' (\ix -> (arr ! (ix*2),arr ! (ix*2+1))) 
  where 
    n' = len arr `div` 2 


-- | Flatten a Pull array of pairs. 
unpair :: ASize l => Choice a => Pull l (a,a) -> Pull l a
unpair arr = 
    let n = len arr
    in  mkPull (2*n) (\ix -> ifThenElse ((mod ix 2) ==* 0) 
                                  (fst (arr ! (ix `shiftR` 1)))
                                  (snd (arr ! (ix `shiftR` 1)))) 


---------------------------------------------------------------------------
-- twoK (untested for proper functionality) 
---------------------------------------------------------------------------

-- | Recursively split an array in the middle. Apply an array to array computation
--   on each part. @binSplit 3@ divides the array into 8 pieces. 
binSplit ::Int
           -> (Pull Word32 a -> Pull Word32 b)
           -> Pull Word32 a
           -> Pull Word32 b 
binSplit = twoK

-- See if this should be specifically for Static size pull arrays
twoK :: Int -> (Pull Word32 a -> Pull Word32 b) -> Pull Word32 a -> Pull Word32 b 
twoK 0 f = f  -- divide 0 times and apply f
twoK n f = \arr -> 
              let arr' = mkPull lt (\i -> (f (mkPull  m (\j -> (arr ! (g i j)))) ! (h i))) 
                  m    = (len arr `shiftR` n)   --pow of two           
                  g i j = i .&. (fromIntegral (complement (m-1))) .|. j  
                  h i   = i .&. (fromIntegral (nl2-1))   -- optimize 

                  nl2   = len (f (mkPull  m (\j -> arr ! variable "X")))
                  lt    = nl2 `shiftL` n 
              in arr'

---------------------------------------------------------------------------
-- ***                          PUSHY LIBRARY                       *** ---
---------------------------------------------------------------------------

---------------------------------------------------------------------------
-- Concatenate on Push arrays 
---------------------------------------------------------------------------

-- | Concatenate two push arrays.
concP :: ASize l
         => Push t l a -> Push t l a -> Push t l a 
concP p1 p2  =
  mkPush (n1 + n2) $ \wf ->
  do
    p1 <: wf
    p2 <: \a i -> wf a (sizeConv n1 + i) 
 where 
   n1 = len p1
   n2 = len p2 


-- Implement unpair on pusharrays,
-- Impement for more tuples than pairs. 
--unpair :: ASize l => Choice a => Pull l (a,a) -> Pull l a
--unpair arr = 
--    let n = len arr
--    in  mkPull (2*n) (\ix -> ifThenElse ((mod ix 2) ==* 0) 
--                                  (fst (arr ! (ix `shiftR` 1)))
--                                  (snd (arr ! (ix `shiftR` 1)))) 


---------------------------------------------------------------------------
-- load / Store 
---------------------------------------------------------------------------
load :: Word32 -> Pull Word32 a -> Push Block Word32 a 
load n arr =
  mkPush m (\wf ->
  forAll (fromIntegral n') (\tid ->
  do
    seqFor (fromIntegral n) (\ix -> 
      wf (arr ! (tid + (ix*fromIntegral n'))) (tid + (ix*fromIntegral n')))))

  where
    m = len arr
    n' = m `div` n

store :: Word32 -> SPull a -> SPush Block a 
store = load 


-- ########################################################################
--
-- Programming the Hierarchy
--
-- ########################################################################


---------------------------------------------------------------------------
-- Parallel concatMap  
---------------------------------------------------------------------------

pConcatMap f = pConcat . fmap f
pUnCoalesceMap f = pUnCoalesce . fmap f
pConcatMapJoin f = pConcat . fmap (runPush.f)
pUnCoalesceMapJoin f = pUnCoalesce . fmap (runPush.f)
pCoalesceMap n f = pUnCoalesce . fmap f . coalesce n
pSplitMap n f = pConcat . fmap f . splitUp n

---------------------------------------------------------------------------
-- Step into the Hierarchy by distributing a
-- Thread program parameterized on a threadId over the threads
-- at a specific level in the Hierarchy. 
---------------------------------------------------------------------------

-- | A way to enter into the hierarchy
-- A bunch of Thread computations, spread across the threads of either
-- a Warp, block or grid. (or performed sequentially in a single thread) 
tConcat :: ASize l
           => Pull l (SPush Thread b)
           -> Push t l b  
tConcat arr =
  mkPush (n * fromIntegral s) $ \wf -> do
    forAll (sizeConv n) $ \tid -> 
       let wf' a ix = wf a (tid * sizeConv s + ix)
           p = arr ! tid -- f tid 
       in p <: wf'
  where
    n = len arr
    s  = len (arr ! 0) --(f (variable "tid")) -- arr

-- | Variant of @tConcat@. 
tDistribute :: ASize l
               => l
               -> (EWord32 -> SPush Thread b)
               -> Push t l b
tDistribute n f = tConcat (mkPull n f) 

      
-- | Distribute work across the parallel resources at a given level of the GPU hiearchy
pConcat :: ASize l => Pull l (SPush t a) -> Push (Step t) l a
pConcat arr =
  mkPush (n * fromIntegral rn) $ \wf ->
    distrPar (sizeConv n) $ \bix ->
      let p = arr ! bix 
          wf' a ix = wf a (bix * sizeConv rn + ix) 
          
      in p <: wf'
  where
    n  = len arr
    rn = len (arr ! 0) -- All arrays are same length

-- | Distribute work across the parallel resources at a given level of the GPU hierarchy
pDistribute :: ASize l => l -> (EWord32 -> SPush t a) -> Push (Step t) l a
pDistribute n f = pConcat (mkPull n f) 

-- | Sequential concatenation of a Pull of Push.
sConcat :: ASize l => Pull l (SPush t a) -> Push t l a
sConcat arr =
  mkPush (n * fromIntegral rn) $ \wf ->
  do
    seqFor (sizeConv n) $ \bix ->
      let p = arr ! bix -- (Push _ p) = arr ! bix
          wf' a ix = wf a (bix * sizeConv rn + ix)              
      in p <: wf'
  where 
    n  = len arr
    rn = len $ arr ! 0

-- | Variant of sConcat.
sDistribute :: ASize l => l -> (EWord32 -> SPush t a) -> Push t l a
sDistribute n f = sConcat (mkPull n f) 

-- pUnCoalesce adapted from Niklas branch.
-- | Combines work that was distributed in a Coalesced way.
-- | Applies a permutation on stores.
pUnCoalesce :: ASize l => Pull l (SPush t a) -> Push (Step t) l a
pUnCoalesce arr =
  mkPush (n * fromIntegral rn) $ \wf ->
  distrPar (sizeConv n) $ \bix ->
    let p = arr ! bix
        wf' a ix = wf a (bix * sizeConv rn + ix)
    in p <: (g wf')
  where
    n  = len arr
    rn = len $ arr ! 0
    s  = sizeConv rn 
    g wf a i = wf a (i `div` s + (i`mod`s)*(sizeConv n))

---------------------------------------------------------------------------
-- RunPush 
---------------------------------------------------------------------------

-- | Fuses the program that computes a Push array into the Push array. 
runPush :: Program t (Push t s a) -> Push t s a
runPush prg =
  mkPush n $ \wf -> do
    parr <- prg
    parr <: wf
    -- It is a bit scary that I need to "evaluate" programs here. 
  where n = len $ fst $ runPrg 0 prg

-- | Lifts @runPush@ to one input functions.
runPush1 :: (a -> Program t (Push t s b)) -> a -> Push t s b
runPush1 f a = runPush (f a)

-- | Lifts @runPush@ to two input functions.
runPush2 :: (a -> b -> Program t (Push t s c)) -> a -> b -> Push t s c
runPush2 f a b = runPush (f a b)  

-- | Converts a program computing a pull Array to a Push array
runPull :: (Pushable t, ASize s) => Program t (Pull s a) -> Push t s a
runPull = runPush . liftM push 

-- | Lifts @runPull@ to one input functions.
runPull1 :: (Pushable t, ASize s) => (a -> Program t (Pull s b)) -> a -> Push t s b
runPull1 f a = runPull (f a)

-- | Lifts @runPull@ to two input functions.
runPull2 :: (Pushable t, ASize s) => (a -> b -> Program t (Pull s c)) -> a -> b -> Push t s c
runPull2 f a b = runPull (f a b)

---------------------------------------------------------------------------
-- 
---------------------------------------------------------------------------
pushPrg :: Program t a -> SPush t a
pushPrg = singletonPush


---------------------------------------------------------------------------
-- Singleton push arrays 
---------------------------------------------------------------------------

-- Danger! use only with Scalar a's 
-- -- | Create a singleton Push array.
--singletonPush :: a -> SPush t a
--singletonPush = singletonPushP . return 

-- | Monadic version of @singleton@.
singletonPush :: Program t a -> SPush t a
singletonPush prg =
  mkPush 1 $ \wf -> do
    a <- prg
    forAll 1 $ \ix -> 
      wf a 0