{-# LANGUAGE CPP #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Control.Parallel.Eden.DivConq
-- Copyright   :  (c) Philipps Universitaet Marburg 2009-2014
-- License     :  BSD-style (see the file LICENSE)
-- 
-- Maintainer  :  eden@mathematik.uni-marburg.de
-- Stability   :  beta
-- Portability :  not portable
--
-- This Haskell module defines divide-and-conquer skeletons for 
-- the parallel functional language Eden.
--
-- All divide-and-conquer algorithms are parameterised with control functions
-- which decide if a problem is trivial, how to solve a trivial problem, 
-- how to split a non-trivial problem into smaller problems and how to combine solutions 
-- of subproblems into a solution of the problem.
--
-- Depends on GHC. Using standard GHC, you will get a threaded simulation of Eden. 
-- Use the forked GHC-Eden compiler from http:\/\/www.mathematik.uni-marburg.de/~eden 
-- for a parallel build.
--
-- Eden Group ( http:\/\/www.mathematik.uni-marburg.de/~eden )
--

module Control.Parallel.Eden.DivConq (
  -- * Divide-and-conquer scheme
  DivideConquer, DivideConquerSimple,
  -- * Sequential divide-and-conquer
  dc, 
  -- * Distributed expansion divide-and-conquer skeletons
  -- ** Straightforward implementation for arbitrary DC problems 
  parDC, 
  -- ** Divide-and-conquer skeleton for regular n-ary trees 
  disDC, offline_disDC, disDCdepth, disDCn, 
  -- * Flat expansion divide-and-conquer skeletons
  flatDC,                                      

  -- * Deprecated divide-and-conquer skeletons (legacy code)
  -- ** Deprecated sequential divide-and-conquer
  divConSeq, divConSeq_c,
  -- **  Deprecated straightforward implementations for arbitrary DC problems 
  divCon, divCon_c, divConD, divConD_c,
  -- ** Deprecated divide-and-conquer skeleton for regular n-ary trees 
  dcN, dcN_c, dcN', dcN_c', dcNTickets, dcNTickets_c,
   -- ** Deprecated flat expansion divide-and-conquer skeletons
  divConFlat, divConFlat_c                  
  ) where
#if defined( __PARALLEL_HASKELL__ )
import Control.Parallel.Eden
#else
import Control.Parallel.Eden.EdenConcHs
#endif
import Control.Parallel.Eden.Map
import Control.Parallel.Eden.Auxiliary



--   Divide-and-conquer scheme
type DivideConquer a b
  = (a -> Bool)        -- ^ trivial? 
    -> (a -> b)        -- ^ solve
    -> (a -> [a])      -- ^ split
    -> (a -> [b] -> b) -- ^ combine
    -> a               -- ^ input 
    -> b               -- ^ result

-- | Sequential Version.
dc :: DivideConquer a b
dc trivial solve split combine = rec_dc
  where
    rec_dc x = if trivial x then solve x
               else combine x (map rec_dc (split x))


-- | Simple parMap parallelisation with depth control but
-- no placement control. This variant allows to
-- give an additional depth parameter for the recursion, proceeding in a
-- sequential manner when @depth=0@. The process scheme unfolds the call 
-- tree on processors chosen by the runtime environment. Round-Robin 
-- distribution is unfavourable for this skeleton, better use RTS option 
-- @+RTS -qrnd@ when using it.
parDC :: (Trans a, Trans b) 
         => Int            -- ^ parallel depth
         -> DivideConquer a b
parDC lv trivial solve split combine
 = pdc lv
 where
  pdc lv x
   | lv == 0 = dc trivial solve split combine x
   | lv >  0 = if trivial x then solve x
               else combine x (parMap (pdc (lv-1)) (split x))


-- | Distributed-expansion divide-and-conquer skeleton
--  (tutorial version, similar to dcNTickets).
disDC :: (Trans a, Trans b) 
         => Int               -- ^ branching degree
         -> Places            -- ^ tickets
         -> DivideConquer a b
disDC k tickets trivial solve split combine x
  = if null tickets then seqDC x
    else recDC tickets x
  where
    seqDC = dc trivial solve split combine
    recDC tickets x =
      if trivial x then solve x
      else childRes           `pseq`  -- explicit demand
           rdeepseq myRes     `pseq`  -- control
           combine x ( myRes:childRes ++ localRess )
      where
        -- child process generation
        childRes   = spawnAt childTickets childProcs procIns
        childProcs = map (process . recDC) theirTs
        -- ticket distribution
        (childTickets, restTickets) = splitAt (k-1) tickets
        (myTs: theirTs) = unshuffle k restTickets
        -- input splitting
        (myIn:theirIn)  = split x
        (procIns, localIns)
                   = splitAt (length childTickets) theirIn
        -- local computations
        myRes      = recDC myTs myIn
        localRess  = map seqDC localIns

-- | offline distributed-expansion divide-and-conquer skeleton.
offline_disDC :: Trans b 
                 => Int                -- ^ branching degree
                 -> [Int]              -- ^ tickets
                 -> DivideConquer a b
offline_disDC k ts triv solve split combine x
  = snd (disDC k ts newtriv newsolve newsplit newcombine 0)
  where
    seqDC      = dc triv solve split combine
    newsplit   = successors k
    newtriv  n = length ts <= k^(length (path k n))
    newsolve n = (flag, seqDC localx)
      where (flag, localx) = select triv split x (path k n)
    newcombine n bs@((flag,bs1):_)
      = if flag then (True, combine localx (map snd bs))
                else (lab,  bs1)
      where (lab, localx) = select triv split x (path k n)


-- local selection function for offline distributed-expansion divide-and-conquer skeleton
select :: (a -> Bool) -> (a -> [a])  -- ^ trivial / split
          -> a -> [Int] -> (Bool,a)
select trivial split x ys = go x ys
  where go x []     = (True, x)
        go x (y:ys) = if trivial x then (False, x)
                      else go (split x !! y) ys

-- auxiliary functions for offline distributed-expansion divide-and-conquer skeleton
successors :: Int -> Int -> [Int]
successors k n = [nk + i | let nk = n*k, i <- [1..k]]

path :: Int -> Int -> [Int]
path k n | n == 0    = []
         | otherwise = reverse (factors k n)

factors :: Int -> Int -> [Int]
factors k n
  | n <= 0    = []
  | otherwise = (n+k-1) `mod` k : factors k ((n-1) `div` k)
  
-- | DC skeleton with fixed branching degree, parallel depth control and explicit process 
-- placement (tree-shaped process creation, one task in each recursive step stays local).
disDCdepth :: (Trans a, Trans b) 
       => Int             -- ^ branching degree
       -> Int             -- ^ parallel depth
       -> DivideConquer a b
disDCdepth k depth trivial solve split combine x 
  | depth < 1 = dc trivial solve split combine x
  | trivial x = solve x 
  | otherwise = childRs `seq` -- early demand on children list
                combine x (myR : childRs)
	    where myself = disDCdepth k (depth - 1) trivial solve split combine
	          (mine:rest) = split x
		  myR = myself mine
		  childRs = parMapAt places myself rest
			      `using` seqList r0 -- ???
		  -- placement with stride for next children, round-robin
		  places = map ((+1) . (`mod` noPe) . (+(-1))) shifts
                  shifts = map (selfPe +) [shift,2*shift..]
                  shift  = k ^ (depth -1)

-- | Like 'disDCdepth', but controls parallelism by limiting the number of processes instead of the parallel depth.
disDCn :: (Trans a, Trans b) 
          => Int             -- ^ branching degree 
          -> Int             -- ^ number of processes
          -> DivideConquer a b
disDCn k n = disDCdepth n depth
  where depth = logN k n

-- rounding-up log approximation
logN n 1 = 0
logN n  k | k > 0 = 1 + logN n ((k + n-1) `div` n) -- round up
          | otherwise = error "logN"
                        
-------------------------------Flat Expansion----------------------------------
-- | DC Skeleton with flat expansion of upper DC-tree levels, takes custom map 
-- skeletons to solve expanded tasks (a sequential map skeleton leads to a 
-- sequential DC-skeleton).
flatDC :: (Trans a,Trans b) => 
                ((a->b)->[a]->[b]) -- ^custom map implementation
                -> Int             -- ^depth
                -> DivideConquer a b
flatDC myMap depth trivial solve split combine x
  = combineTopMaster combine levels results
  where (tasks,levels) = generateTasks depth trivial split x
        results        = myMap(divConSeq_c trivial solve split combine) tasks



combineTopMaster :: (NFData b) =>
                    (a->[b]->b) -> (Tree a) -> [b] -> b
combineTopMaster c t bs = fst (combineTopRnf c t bs)


combineTopRnf :: (NFData b) => 
                 (a->[b]->b) -> (Tree a) -> [b] -> (b,[b])
combineTopRnf _ (Leaf a) (b:bs) = (b,bs)
combineTopRnf combine (Tree a ts) bs 
 = (rnf res `pseq` combine a res, bs')
 where (bs',res)      = foldl f (bs,[]) ts
       f (olds,news) t = (remaining,news++[b]) 
         where (b,remaining) =  combineTopRnf combine t olds
               

generateTasks :: Int -> (a->Bool) -> (a->[a]) -> a -> ([a],Tree a)
generateTasks 0 _ _ a = ([a],Leaf a)
generateTasks n trivial split a
 | trivial a = ([a],Leaf a)
 | otherwise = (concat ass,Tree a ts)
 where assts = map (generateTasks (n-1) trivial split) (split a)
       (ass,ts) = unzip assts


data Tree a = Tree a [Tree a] | Leaf a  deriving Show
instance NFData a => NFData (Tree a)
 where rnf (Tree a ls) = rnf a `seq` rnf ls 
       rnf (Leaf a)    = rnf a 

-----------------------------------DEPRECATED--------------------------------
-- | The simple interface (deprecated): combine function without input
type DivideConquerSimple a b
  = (a -> Bool)    -- ^ trivial? 
    -> (a -> b)    -- ^ solve
    -> (a -> [a])  -- ^ split
    -> ([b] -> b)  -- ^ combine (only uses sub-results, not the input)
    -> a           -- ^ input 
    -> b           -- ^ result


-- | Like 'dc' but uses simple DC Interface.
{-# DEPRECATED divConSeq, divConSeq_c "better use dc instead" #-}
divConSeq :: (Trans a, Trans b) => DivideConquerSimple a b
divConSeq trivial solve split combine x 
   = dc trivial solve split (\_ parts -> combine parts) x

-- | Tutorial version, same as 'dc'

divConSeq_c :: (Trans a, Trans b) => DivideConquer a b
divConSeq_c = dc


-- | Straightforward implementation.
--
-- The straightforward method to parallelise divide-and-conquer
-- algorithms is to unfold the call tree onto different
-- processors. The process scheme unfolds the call tree on processors chosen by the
-- runtime environment. Round-Robin distribution is unfavourable for this
-- skeleton, better use runtime option @-qrnd@ when using it.
{-# DEPRECATED divCon, divCon_c, divConD, divConD_c "better use parDC instead" #-}
divCon :: (Trans a, Trans b) => DivideConquerSimple a b
divCon trivial solve split combine x 
   = divCon_c trivial solve split (\_ parts -> combine parts) x

-- | Like 'divCon' but with different combine signature (takes the original problem as additional input).
divCon_c :: (Trans a, Trans b) => DivideConquer a b
divCon_c trivial solve split combine x
  | trivial x = solve x
  | otherwise = combine x children
    where children = parMap (divCon_c trivial solve split combine) (split x)  


-- | Like 'parDC' but uses simple DC Interface.
divConD :: (Trans a, Trans b) 
           => Int                -- ^parallel depth
           -> DivideConquerSimple a b
divConD depth trivial solve split combine x 
   = parDC depth trivial solve split (\_ parts -> combine parts) x

-- | Tutorial version, same as 'parDC'.
divConD_c :: (Trans a, Trans b) 
             => Int               -- ^parallel depth
             -> DivideConquer a b
divConD_c = parDC

-- | Like 'disDCdepth' but uses simple DC Interface.
{-# DEPRECATED dcN, dcN_c "better use disDCdepth instead" #-}
dcN :: (Trans a, Trans b) 
       => Int                 -- ^ branching degree
       -> Int                 -- ^ parallel depth
       -> DivideConquerSimple a b
dcN n depth trivial solve split combine x 
   = disDCdepth n depth trivial solve split (\_ parts -> combine parts) x

-- | Tutorial version, same as 'disDCdepth'
dcN_c :: (Trans a, Trans b) 
         => Int                -- ^ branching degree
         -> Int                -- ^ parallel depth
         -> DivideConquer a b
dcN_c = disDCdepth

-- | Like 'disDCn' but uses simple DC Interface.
{-# DEPRECATED dcN', dcN_c' "better use disDCn instead" #-}
dcN' :: (Trans a, Trans b) 
        => Int                     -- ^ branching degree
        -> Int                     -- ^ number of processes
        -> DivideConquerSimple a b
dcN' n pes = dcN n depth
  where depth = logN n pes

-- | Tutorial version, same as 'disDCn'.
dcN_c' :: (Trans a, Trans b) 
          => Int                -- ^ branching degree
          -> Int                -- ^ number of processes
          -> DivideConquer a b
dcN_c' = disDCn

---------------------------------------------------------------
-- | Like 'disDC', but differs in demand control and uses simple DC Interface.
{-# DEPRECATED dcNTickets, dcNTickets_c "better use disDC instead" #-}
dcNTickets :: (Trans a, Trans b) => 
        Int                       -- ^ branching degree
        -> Places                 -- ^ tickets (places to use)
        -> DivideConquerSimple a b
dcNTickets k ts trivial solve split combine x 
 = dcNTickets_c k ts trivial solve split (\_ parts -> combine parts) x

-- | Like 'disDC', but differs in demand control.
dcNTickets_c :: (Trans a, Trans b) 
                => Int                -- ^ branching degree
                -> Places             -- ^ Tickets (places to use)
                -> DivideConquer a b
dcNTickets_c k [] trivial solve split combine x 
  = divConSeq_c trivial solve split combine x
dcNTickets_c k tickets trivial solve split combine x  
   = if trivial x then solve x 
                  else childRes `pseq` -- early demand on children list
		       rnf myRes `pseq` rnf localRess `pseq` 
                       combine x (myRes:childRes ++ localRess )
        where
          -- splitting computation into processes
          (childTickets,restTickets) = splitAt (k-1) tickets --position of (children,further ancestors)
          (myTs:theirTs)= unshuffle k restTickets
          ticketF ts = dcNTickets_c k ts trivial solve split combine
          insts = length childTickets
          (procIns, localIns) = splitAt insts theirIn 
          childProcs = map (process . ticketF) theirTs 
          childRes  = spawnAt childTickets childProcs procIns

          -- local computation:
          myRes = ticketF myTs myIn
          (myIn:theirIn) = split x
          localRess = map (divConSeq_c trivial solve split combine) localIns


    
-- | Like as 'flatDC' but uses simple DC Interface.
{-# DEPRECATED divConFlat, divConFlat_c "better use flatDC instead" #-}
divConFlat :: (Trans a,Trans b) => 
              ((a->b)->[a]->[b]) -- ^custom map implementation
              -> Int             -- ^depth
              -> DivideConquerSimple a b
divConFlat myMap depth trivial solve split combine x 
   = divConFlat_c myMap depth trivial solve split (\_ parts -> combine parts) x
     
-- | Tutorial version, same as 'flatDC'.
divConFlat_c :: (Trans a,Trans b) 
                => ((a->b)->[a]->[b]) -- ^ custom map implementation
                -> Int                -- ^ parallel depth
                -> DivideConquer a b
divConFlat_c = flatDC