| Copyright | (c) Adam Conner-Sax 2019 |
|---|---|
| License | BSD-3-Clause |
| Maintainer | adam_conner_sax@yahoo.com |
| Stability | experimental |
| Safe Haskell | None |
| Language | Haskell2010 |
Control.MapReduce.Engines.Streamly
Description
map-reduce engine (fold builder) using Streamly streams as its intermediate and return type.
Notes:
1. These are polymorphic in the return stream type. Thought the streams do have to be serial when groupBy is called
So you have to specify the stream type in the call or it has to be inferrable from the use of the result.
- There is a concurrent engine here, one that uses Streamly's concurrency features to map over the stream. I've not been able to verify that this is faster on an appropriate task with appropriate runtime settings.
Synopsis
- streamlyEngine :: (Foldable g, Functor g) => (forall z. SerialT Identity (k, z) -> SerialT Identity (k, g z)) -> MapReduceFold y k c (SerialT Identity) x d
- streamlyEngineM :: (IsStream t, Monad m, MonadAsync m, Traversable g) => (forall z. SerialT m (k, z) -> SerialT m (k, g z)) -> MapReduceFoldM m y k c (t m) x d
- concurrentStreamlyEngine :: forall tIn tOut m g y k c x d. (IsStream tIn, IsStream tOut, MonadAsync m, Foldable g, Functor g) => (forall z. SerialT m (k, z) -> SerialT m (k, g z)) -> MapReduceFold y k c (tOut m) x d
- toStreamlyFold :: Monad m => Fold a b -> Fold m a b
- toStreamlyFoldM :: FoldM m a b -> Fold m a b
- resultToList :: (Monad m, IsStream t) => t m a -> m [a]
- concatStream :: (Monad m, Monoid a) => SerialT m a -> m a
- concatStreamFold :: Monoid b => Fold a (SerialT Identity b) -> Fold a b
- concatStreamFoldM :: (Monad m, Monoid b, IsStream t) => FoldM m a (t m b) -> FoldM m a b
- concatConcurrentStreamFold :: (Monad m, Monoid b, IsStream t) => Fold a (t m b) -> FoldM m a b
- groupByHashableKey :: (Monad m, Hashable k, Eq k) => SerialT m (k, c) -> SerialT m (k, Seq c)
- groupByOrderedKey :: (Monad m, Ord k) => SerialT m (k, c) -> SerialT m (k, Seq c)
- groupByHashableKeyST :: (Monad m, Hashable k, Eq k) => SerialT m (k, c) -> SerialT m (k, Seq c)
- groupByDiscriminatedKey :: (Monad m, Grouping k) => SerialT m (k, c) -> SerialT m (k, Seq c)
- data SerialT (m :: Type -> Type) a
- data WSerialT (m :: Type -> Type) a
- data AheadT (m :: Type -> Type) a
- data AsyncT (m :: Type -> Type) a
- data WAsyncT (m :: Type -> Type) a
- data ParallelT (m :: Type -> Type) a
- type MonadAsync (m :: Type -> Type) = (MonadIO m, MonadBaseControl IO m, MonadThrow m)
- class (forall (m :: Type -> Type) a. MonadAsync m => Semigroup (t m a), forall (m :: Type -> Type) a. MonadAsync m => Monoid (t m a), forall (m :: Type -> Type). Monad m => Functor (t m), forall (m :: Type -> Type). MonadAsync m => Applicative (t m)) => IsStream (t :: (Type -> Type) -> Type -> Type)
Engines
streamlyEngine :: (Foldable g, Functor g) => (forall z. SerialT Identity (k, z) -> SerialT Identity (k, g z)) -> MapReduceFold y k c (SerialT Identity) x d Source #
map-reduce-fold builder returning a SerialT Identity d result
streamlyEngineM :: (IsStream t, Monad m, MonadAsync m, Traversable g) => (forall z. SerialT m (k, z) -> SerialT m (k, g z)) -> MapReduceFoldM m y k c (t m) x d Source #
effectful map-reduce-fold engine returning a (Istream t => t m d) result The MonadAsync constraint here more or less requires us to run in IO, or something IO like.
concurrentStreamlyEngine :: forall tIn tOut m g y k c x d. (IsStream tIn, IsStream tOut, MonadAsync m, Foldable g, Functor g) => (forall z. SerialT m (k, z) -> SerialT m (k, g z)) -> MapReduceFold y k c (tOut m) x d Source #
possibly (depending on chosen stream types) concurrent map-reduce-fold builder returning an (Istream t, MonadAsync m) => t m d result
Streamly Combinators
toStreamlyFold :: Monad m => Fold a b -> Fold m a b Source #
convert a Control.Foldl Fold into a Streamly.Data.Fold fold
toStreamlyFoldM :: FoldM m a b -> Fold m a b Source #
convert a Control.Foldl FoldM into a Streamly.Data.Fold fold
Result Extraction
resultToList :: (Monad m, IsStream t) => t m a -> m [a] Source #
make a stream into an (effectful) []
concatStreamFold :: Monoid b => Fold a (SerialT Identity b) -> Fold a b Source #
mappend everything in a pure Streamly fold
concatStreamFoldM :: (Monad m, Monoid b, IsStream t) => FoldM m a (t m b) -> FoldM m a b Source #
mappend everything in an effectful Streamly fold.
concatConcurrentStreamFold :: (Monad m, Monoid b, IsStream t) => Fold a (t m b) -> FoldM m a b Source #
mappend everything in a concurrent Streamly fold.
groupBy Functions
groupByHashableKey :: (Monad m, Hashable k, Eq k) => SerialT m (k, c) -> SerialT m (k, Seq c) Source #
Group streamly stream of (k,c) by hashable key.
NB: this function uses the fact that SerialT m is a monad
groupByOrderedKey :: (Monad m, Ord k) => SerialT m (k, c) -> SerialT m (k, Seq c) Source #
Group streamly stream of (k,c) by ordered key.
NB: this function uses the fact that SerialT m is a monad
groupByHashableKeyST :: (Monad m, Hashable k, Eq k) => SerialT m (k, c) -> SerialT m (k, Seq c) Source #
Group streamly stream of (k,c) by hashable key. Uses mutable hashtables running in the ST monad.
NB: this function uses the fact that SerialT m is a monad
groupByDiscriminatedKey :: (Monad m, Grouping k) => SerialT m (k, c) -> SerialT m (k, Seq c) Source #
Group streamly stream of (k,c) by key with instance of Grouping from http://hackage.haskell.org/package/discrimination.
NB: this function uses the fact that SerialT m is a monad
Re-Exports
data SerialT (m :: Type -> Type) a #
The Semigroup operation for SerialT behaves like a regular append
operation. Therefore, when a <> b is evaluated, stream a is evaluated
first until it exhausts and then stream b is evaluated. In other words,
the elements of stream b are appended to the elements of stream a. This
operation can be used to fold an infinite lazy container of streams.
import Streamly
import qualified Streamly.Prelude as S
main = (S.toList . serially $ (S.fromList [1,2]) <> (S.fromList [3,4])) >>= print
[1,2,3,4]
The Monad instance runs the monadic continuation for each
element of the stream, serially.
main = S.drain . serially $ do
x <- return 1 <> return 2
S.yieldM $ print x
1 2
SerialT nests streams serially in a depth first manner.
main = S.drain . serially $ do
x <- return 1 <> return 2
y <- return 3 <> return 4
S.yieldM $ print (x, y)
(1,3) (1,4) (2,3) (2,4)
We call the monadic code being run for each element of the stream a monadic
continuation. In imperative paradigm we can think of this composition as
nested for loops and the monadic continuation is the body of the loop. The
loop iterates for all elements of the stream.
Note that the behavior and semantics of SerialT, including Semigroup
and Monad instances are exactly like Haskell lists except that SerialT
can contain effectful actions while lists are pure.
In the code above, the serially combinator can be omitted as the default
stream type is SerialT.
Since: streamly-0.2.0
Instances
data WSerialT (m :: Type -> Type) a #
The Semigroup operation for WSerialT interleaves the elements from the
two streams. Therefore, when a <> b is evaluated, stream a is evaluated
first to produce the first element of the combined stream and then stream
b is evaluated to produce the next element of the combined stream, and
then we go back to evaluating stream a and so on. In other words, the
elements of stream a are interleaved with the elements of stream b.
Note that when multiple actions are combined like a <> b <> c ... <> z we
interleave them in a binary fashion i.e. a and b are interleaved with
each other and the result is interleaved with c and so on. This will not
act as a true round-robin scheduling across all the streams. Note that this
operation cannot be used to fold a container of infinite streams as the
state that it needs to maintain is proportional to the number of streams.
import Streamly
import qualified Streamly.Prelude as S
main = (S.toList . wSerially $ (S.fromList [1,2]) <> (S.fromList [3,4])) >>= print
[1,3,2,4]
Similarly, the Monad instance interleaves the iterations of the
inner and the outer loop, nesting loops in a breadth first manner.
main = S.drain . wSerially $ do
x <- return 1 <> return 2
y <- return 3 <> return 4
S.yieldM $ print (x, y)
(1,3) (2,3) (1,4) (2,4)
Since: streamly-0.2.0
Instances
data AheadT (m :: Type -> Type) a #
The Semigroup operation for AheadT appends two streams. The combined
stream behaves like a single stream with the actions from the second stream
appended to the first stream. The combined stream is evaluated in the
speculative style. This operation can be used to fold an infinite lazy
container of streams.
import Streamly import qualified Streamly.Prelude as S import Control.Concurrent main = do xs <- S.toList.aheadly$ (p 1 |: p 2 |: nil) <> (p 3 |: p 4 |: nil) print xs where p n = threadDelay 1000000 >> return n
[1,2,3,4]
Any exceptions generated by a constituent stream are propagated to the output stream.
The monad instance of AheadT may run each monadic continuation (bind)
concurrently in a speculative manner, performing side effects in a partially
ordered manner but producing the outputs in an ordered manner like
SerialT.
main = S.drain . aheadly $ do
n <- return 3 <> return 2 <> return 1
S.yieldM $ do
threadDelay (n * 1000000)
myThreadId >>= \tid -> putStrLn (show tid ++ ": Delay " ++ show n)
ThreadId 40: Delay 1 ThreadId 39: Delay 2 ThreadId 38: Delay 3
Since: streamly-0.3.0
Instances
| MonadTrans AheadT | |
Defined in Streamly.Streams.Ahead | |
| IsStream AheadT | |
Defined in Streamly.Streams.Ahead Methods toStream :: AheadT m a -> Stream m a fromStream :: Stream m a -> AheadT m a consM :: MonadAsync m => m a -> AheadT m a -> AheadT m a # (|:) :: MonadAsync m => m a -> AheadT m a -> AheadT m a # | |
| (MonadBase b m, Monad m, MonadAsync m) => MonadBase b (AheadT m) | |
Defined in Streamly.Streams.Ahead | |
| (MonadState s m, MonadAsync m) => MonadState s (AheadT m) | |
| (MonadReader r m, MonadAsync m) => MonadReader r (AheadT m) | |
| MonadAsync m => Monad (AheadT m) | |
| Monad m => Functor (AheadT m) | |
| (Monad m, MonadAsync m) => Applicative (AheadT m) | |
| (MonadIO m, MonadAsync m) => MonadIO (AheadT m) | |
Defined in Streamly.Streams.Ahead | |
| (MonadThrow m, MonadAsync m) => MonadThrow (AheadT m) | |
Defined in Streamly.Streams.Ahead | |
| MonadAsync m => Semigroup (AheadT m a) | |
| MonadAsync m => Monoid (AheadT m a) | |
data AsyncT (m :: Type -> Type) a #
The Semigroup operation for AsyncT appends two streams. The combined
stream behaves like a single stream with the actions from the second stream
appended to the first stream. The combined stream is evaluated in the
asynchronous style. This operation can be used to fold an infinite lazy
container of streams.
import Streamly
import qualified Streamly.Prelude as S
import Control.Concurrent
main = (S.toList . asyncly $ (S.fromList [1,2]) <> (S.fromList [3,4])) >>= print
[1,2,3,4]
Any exceptions generated by a constituent stream are propagated to the output stream. The output and exceptions from a single stream are guaranteed to arrive in the same order in the resulting stream as they were generated in the input stream. However, the relative ordering of elements from different streams in the resulting stream can vary depending on scheduling and generation delays.
Similarly, the monad instance of AsyncT may run each iteration
concurrently based on demand. More concurrent iterations are started only
if the previous iterations are not able to produce enough output for the
consumer.
main =drain.asyncly$ do n <- return 3 <> return 2 <> return 1 S.yieldM $ do threadDelay (n * 1000000) myThreadId >>= \tid -> putStrLn (show tid ++ ": Delay " ++ show n)
ThreadId 40: Delay 1 ThreadId 39: Delay 2 ThreadId 38: Delay 3
Since: streamly-0.1.0
Instances
| MonadTrans AsyncT | |
Defined in Streamly.Streams.Async | |
| IsStream AsyncT | |
Defined in Streamly.Streams.Async Methods toStream :: AsyncT m a -> Stream m a fromStream :: Stream m a -> AsyncT m a consM :: MonadAsync m => m a -> AsyncT m a -> AsyncT m a # (|:) :: MonadAsync m => m a -> AsyncT m a -> AsyncT m a # | |
| (MonadBase b m, Monad m, MonadAsync m) => MonadBase b (AsyncT m) | |
Defined in Streamly.Streams.Async | |
| (MonadState s m, MonadAsync m) => MonadState s (AsyncT m) | |
| (MonadReader r m, MonadAsync m) => MonadReader r (AsyncT m) | |
| MonadAsync m => Monad (AsyncT m) | |
| Monad m => Functor (AsyncT m) | |
| (Monad m, MonadAsync m) => Applicative (AsyncT m) | |
| (MonadIO m, MonadAsync m) => MonadIO (AsyncT m) | |
Defined in Streamly.Streams.Async | |
| (MonadThrow m, MonadAsync m) => MonadThrow (AsyncT m) | |
Defined in Streamly.Streams.Async | |
| MonadAsync m => Semigroup (AsyncT m a) | |
| MonadAsync m => Monoid (AsyncT m a) | |
data WAsyncT (m :: Type -> Type) a #
The Semigroup operation for WAsyncT interleaves the elements from the
two streams. Therefore, when a <> b is evaluated, one action is picked
from stream a for evaluation and then the next action is picked from
stream b and then the next action is again picked from stream a, going
around in a round-robin fashion. Many such actions are executed concurrently
depending on maxThreads and maxBuffer limits. Results are served to the
consumer in the order completion of the actions.
Note that when multiple actions are combined like a <> b <> c ... <> z we
go in a round-robin fasion across all of them picking one action from each
up to z and then come back to a. Note that this operation cannot be
used to fold a container of infinite streams as the state that it needs to
maintain is proportional to the number of streams.
import Streamly
import qualified Streamly.Prelude as S
import Control.Concurrent
main = (S.toList . wAsyncly $ (S.fromList [1,2]) <> (S.fromList [3,4])) >>= print
[1,3,2,4]
Any exceptions generated by a constituent stream are propagated to the output stream. The output and exceptions from a single stream are guaranteed to arrive in the same order in the resulting stream as they were generated in the input stream. However, the relative ordering of elements from different streams in the resulting stream can vary depending on scheduling and generation delays.
Similarly, the Monad instance of WAsyncT runs all iterations fairly
concurrently using a round robin scheduling.
main =drain.wAsyncly$ do n <- return 3 <> return 2 <> return 1 S.yieldM $ do threadDelay (n * 1000000) myThreadId >>= \tid -> putStrLn (show tid ++ ": Delay " ++ show n)
ThreadId 40: Delay 1 ThreadId 39: Delay 2 ThreadId 38: Delay 3
Since: streamly-0.2.0
Instances
| MonadTrans WAsyncT | |
Defined in Streamly.Streams.Async | |
| IsStream WAsyncT | |
Defined in Streamly.Streams.Async Methods toStream :: WAsyncT m a -> Stream m a fromStream :: Stream m a -> WAsyncT m a consM :: MonadAsync m => m a -> WAsyncT m a -> WAsyncT m a # (|:) :: MonadAsync m => m a -> WAsyncT m a -> WAsyncT m a # | |
| (MonadBase b m, Monad m, MonadAsync m) => MonadBase b (WAsyncT m) | |
Defined in Streamly.Streams.Async | |
| (MonadState s m, MonadAsync m) => MonadState s (WAsyncT m) | |
| (MonadReader r m, MonadAsync m) => MonadReader r (WAsyncT m) | |
| MonadAsync m => Monad (WAsyncT m) | |
| Monad m => Functor (WAsyncT m) | |
| (Monad m, MonadAsync m) => Applicative (WAsyncT m) | |
| (MonadIO m, MonadAsync m) => MonadIO (WAsyncT m) | |
Defined in Streamly.Streams.Async | |
| (MonadThrow m, MonadAsync m) => MonadThrow (WAsyncT m) | |
Defined in Streamly.Streams.Async | |
| MonadAsync m => Semigroup (WAsyncT m a) | |
| MonadAsync m => Monoid (WAsyncT m a) | |
data ParallelT (m :: Type -> Type) a #
Async composition with strict concurrent execution of all streams.
The Semigroup instance of ParallelT executes both the streams
concurrently without any delay or without waiting for the consumer demand
and merges the results as they arrive. If the consumer does not consume
the results, they are buffered upto a configured maximum, controlled by the
maxBuffer primitive. If the buffer becomes full the concurrent tasks will
block until there is space in the buffer.
Both WAsyncT and ParallelT, evaluate the constituent streams fairly in a
round robin fashion. The key difference is that WAsyncT might wait for the
consumer demand before it executes the tasks whereas ParallelT starts
executing all the tasks immediately without waiting for the consumer demand.
For WAsyncT the maxThreads limit applies whereas for ParallelT it does
not apply. In other words, WAsyncT can be lazy whereas ParallelT is
strict.
ParallelT is useful for cases when the streams are required to be
evaluated simultaneously irrespective of how the consumer consumes them e.g.
when we want to race two tasks and want to start both strictly at the same
time or if we have timers in the parallel tasks and our results depend on
the timers being started at the same time. If we do not have such
requirements then AsyncT or AheadT are recommended as they can be more
efficient than ParallelT.
main = (toList.parallely$ (fromFoldable [1,2]) <> (fromFoldable [3,4])) >>= print
[1,3,2,4]
When streams with more than one element are merged, it yields whichever
stream yields first without any bias, unlike the Async style streams.
Any exceptions generated by a constituent stream are propagated to the output stream. The output and exceptions from a single stream are guaranteed to arrive in the same order in the resulting stream as they were generated in the input stream. However, the relative ordering of elements from different streams in the resulting stream can vary depending on scheduling and generation delays.
Similarly, the Monad instance of ParallelT runs all iterations
of the loop concurrently.
import Streamly import qualified Streamly.Prelude as S import Control.Concurrent main =drain.parallely$ do n <- return 3 <> return 2 <> return 1 S.yieldM $ do threadDelay (n * 1000000) myThreadId >>= \tid -> putStrLn (show tid ++ ": Delay " ++ show n)
ThreadId 40: Delay 1 ThreadId 39: Delay 2 ThreadId 38: Delay 3
Note that parallel composition can only combine a finite number of streams as it needs to retain state for each unfinished stream.
Since: 0.7.0 (maxBuffer applies to ParallelT streams)
Since: 0.1.0
Instances
| MonadTrans ParallelT | |
Defined in Streamly.Streams.Parallel | |
| IsStream ParallelT | |
Defined in Streamly.Streams.Parallel Methods toStream :: ParallelT m a -> Stream m a fromStream :: Stream m a -> ParallelT m a consM :: MonadAsync m => m a -> ParallelT m a -> ParallelT m a # (|:) :: MonadAsync m => m a -> ParallelT m a -> ParallelT m a # | |
| (MonadBase b m, Monad m, MonadAsync m) => MonadBase b (ParallelT m) | |
Defined in Streamly.Streams.Parallel | |
| (MonadState s m, MonadAsync m) => MonadState s (ParallelT m) | |
| (MonadReader r m, MonadAsync m) => MonadReader r (ParallelT m) | |
| MonadAsync m => Monad (ParallelT m) | |
| Monad m => Functor (ParallelT m) | |
| (Monad m, MonadAsync m) => Applicative (ParallelT m) | |
Defined in Streamly.Streams.Parallel | |
| (MonadIO m, MonadAsync m) => MonadIO (ParallelT m) | |
Defined in Streamly.Streams.Parallel | |
| (MonadThrow m, MonadAsync m) => MonadThrow (ParallelT m) | |
Defined in Streamly.Streams.Parallel | |
| MonadAsync m => Semigroup (ParallelT m a) | |
| MonadAsync m => Monoid (ParallelT m a) | |
type MonadAsync (m :: Type -> Type) = (MonadIO m, MonadBaseControl IO m, MonadThrow m) #
A monad that can perform concurrent or parallel IO operations. Streams
that can be composed concurrently require the underlying monad to be
MonadAsync.
Since: streamly-0.1.0
class (forall (m :: Type -> Type) a. MonadAsync m => Semigroup (t m a), forall (m :: Type -> Type) a. MonadAsync m => Monoid (t m a), forall (m :: Type -> Type). Monad m => Functor (t m), forall (m :: Type -> Type). MonadAsync m => Applicative (t m)) => IsStream (t :: (Type -> Type) -> Type -> Type) #
Class of types that can represent a stream of elements of some type a in
some monad m.
Since: streamly-0.2.0
Instances
| IsStream Stream | |
Defined in Streamly.Streams.StreamK.Type Methods toStream :: Stream m a -> Stream m a fromStream :: Stream m a -> Stream m a consM :: MonadAsync m => m a -> Stream m a -> Stream m a # (|:) :: MonadAsync m => m a -> Stream m a -> Stream m a # | |
| IsStream ZipSerialM | |
Defined in Streamly.Streams.Zip Methods toStream :: ZipSerialM m a -> Stream m a fromStream :: Stream m a -> ZipSerialM m a consM :: MonadAsync m => m a -> ZipSerialM m a -> ZipSerialM m a # (|:) :: MonadAsync m => m a -> ZipSerialM m a -> ZipSerialM m a # | |
| IsStream ZipAsyncM | |
Defined in Streamly.Streams.Zip Methods toStream :: ZipAsyncM m a -> Stream m a fromStream :: Stream m a -> ZipAsyncM m a consM :: MonadAsync m => m a -> ZipAsyncM m a -> ZipAsyncM m a # (|:) :: MonadAsync m => m a -> ZipAsyncM m a -> ZipAsyncM m a # | |
| IsStream AheadT | |
Defined in Streamly.Streams.Ahead Methods toStream :: AheadT m a -> Stream m a fromStream :: Stream m a -> AheadT m a consM :: MonadAsync m => m a -> AheadT m a -> AheadT m a # (|:) :: MonadAsync m => m a -> AheadT m a -> AheadT m a # | |
| IsStream AsyncT | |
Defined in Streamly.Streams.Async Methods toStream :: AsyncT m a -> Stream m a fromStream :: Stream m a -> AsyncT m a consM :: MonadAsync m => m a -> AsyncT m a -> AsyncT m a # (|:) :: MonadAsync m => m a -> AsyncT m a -> AsyncT m a # | |
| IsStream WAsyncT | |
Defined in Streamly.Streams.Async Methods toStream :: WAsyncT m a -> Stream m a fromStream :: Stream m a -> WAsyncT m a consM :: MonadAsync m => m a -> WAsyncT m a -> WAsyncT m a # (|:) :: MonadAsync m => m a -> WAsyncT m a -> WAsyncT m a # | |
| IsStream ParallelT | |
Defined in Streamly.Streams.Parallel Methods toStream :: ParallelT m a -> Stream m a fromStream :: Stream m a -> ParallelT m a consM :: MonadAsync m => m a -> ParallelT m a -> ParallelT m a # (|:) :: MonadAsync m => m a -> ParallelT m a -> ParallelT m a # | |
| IsStream SerialT | |
Defined in Streamly.Streams.Serial Methods toStream :: SerialT m a -> Stream m a fromStream :: Stream m a -> SerialT m a consM :: MonadAsync m => m a -> SerialT m a -> SerialT m a # (|:) :: MonadAsync m => m a -> SerialT m a -> SerialT m a # | |
| IsStream WSerialT | |
Defined in Streamly.Streams.Serial Methods toStream :: WSerialT m a -> Stream m a fromStream :: Stream m a -> WSerialT m a consM :: MonadAsync m => m a -> WSerialT m a -> WSerialT m a # (|:) :: MonadAsync m => m a -> WSerialT m a -> WSerialT m a # | |