{-# LANGUAGE DataKinds #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedLabels #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} -- The same example than example1 in 'porcupine-core', but with s3 access -- enabled by 'runPipelineTask'. Don't forget to map locations to s3 urls in the -- 'porcupine.yaml' generated by calling 'exampleS3 write-config-template', or -- else it will act exactly like example1. -- -- Don't forget to enable OverloadedLabels and import Data.Locations.Accessors.AWS import Data.Aeson import Data.DocRecord import qualified Data.HashMap.Strict as HM import qualified Data.Text as T import GHC.Generics import Porcupine import Data.Locations.Accessors.AWS data User = User { userName :: T.Text , userSurname :: T.Text , userAge :: Int } deriving (Generic) instance FromJSON User newtype Analysis = Analysis { numLetters :: HM.HashMap Char Int } deriving (Generic) instance ToJSON Analysis -- | How to load users userFile :: DataSource User userFile = dataSource ["Inputs", "User"] (somePureDeserial JSONSerial) -- | How to write analysis analysisFile :: DataSink Analysis analysisFile = dataSink ["Outputs", "Analysis"] (somePureSerial JSONSerial) -- | The simple computation we want to perform computeAnalysis :: User -> Analysis computeAnalysis (User name surname _) = Analysis $ HM.fromListWith (+) $ [(c,1) | c <- T.unpack name] ++ [(c,1) | c <- T.unpack surname] -- | The task combining the three previous operations. -- -- This task may look very opaque from the outside, having no parameters and no -- return value. But we will be able to reuse it over different users without -- having to change it at all. analyseOneUser :: (LogThrow m) => PTask m () () analyseOneUser = loadData userFile >>> arr computeAnalysis >>> writeData analysisFile mainTask :: (LogThrow m) => PTask m () () mainTask = -- First we get the ids of the users that we want to analyse. We need only one -- field that will contain a range of values, see IndexRange. By default, this -- range contains just one value, zero. getOption ["Settings"] (docField @"users" (oneIndex (0::Int)) "The user ids to load") -- We turn the range we read into a full lazy list: >>> arr enumTRIndices -- Then we just map over these ids and call analyseOneUser each time: >>> parMapTask_ "userId" analyseOneUser main :: IO () main = runPipelineTask (FullConfig "exampleS3" "porcupine.yaml" "porcupine-core/examples/data" ()) ( #aws <-- useAWS Discover -- We just add #aws on top of the -- baseContexts. Credentials will be discovered. :& baseContexts "") mainTask ()