Safe Haskell	None
Language	Haskell2010

Numeric.Datasets

Contents

Parsing datasets
Defining datasets
Dataset source URLs

Description

The datasets package defines three different kinds of datasets:

Tiny datasets (up to a few tens of rows) are embedded as part of the library source code, as lists of values.
Small datasets are embedded indirectly (via file-embed) in the package as pure values and can be used without IO or a network download (i.e. the data is loaded and parsed at compile time).
Larger datasets need to be fetched over the network and are cached in a local temporary directory for subsequent use.

This module defines the getDataset function for fetching datasets, as well as utilities for defining new datasets and modifying their options. It is only necessary to import this module when using fetched datasets. Embedded datasets can be used directly.

Please refer to the dataset modules for examples.

Synopsis

getDataset :: (MonadThrow io, MonadIO io) => Dataset a -> io [a]
data Dataset a = Dataset {
- source :: Source
- temporaryDirectory :: Maybe FilePath
- preProcess :: Maybe (ByteString -> ByteString)
- readAs :: ReadAs a
}
data Source
- = URL (Url h)
- | File FilePath
- | ImgFolder FilePath (NonEmpty String)
getDatavec :: (MonadThrow io, MonadIO io, Vector v a) => Dataset a -> io (v a)
defaultTempDir :: Maybe FilePath -> IO FilePath
getFileFromSource :: FilePath -> Source -> IO (NonEmpty ByteString)
readDataset :: ReadAs a -> ByteString -> [a]
safeReadDataset :: (Vector v a, MonadThrow m) => ReadAs a -> NonEmpty ByteString -> m (v a)
data ReadAs a where
- JSON :: FromJSON a => ReadAs a
- CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
- CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
- Parsable :: Parser a -> ReadAs a
- ImageFolder :: NonEmpty String -> ReadAs (String, FilePath)
csvRecord :: FromRecord a => ReadAs a
csvDataset :: FromRecord a => Source -> Dataset a
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
csvDatasetSkipHdr :: FromRecord a => Source -> Dataset a
jsonDataset :: FromJSON a => Source -> Dataset a
withPreprocess :: (ByteString -> ByteString) -> Dataset a -> Dataset a
withTempDir :: FilePath -> Dataset a -> Dataset a
dropLines :: Int -> ByteString -> ByteString
fixedWidthToCSV :: ByteString -> ByteString
removeEscQuotes :: ByteString -> ByteString
fixAmericanDecimals :: ByteString -> ByteString
parseReadField :: Read a => Field -> Parser a
parseDashToCamelField :: Read a => Field -> Parser a
yearToUTCTime :: Double -> UTCTime
umassMLDB :: Url Http
uciMLDB :: Url Https

Documentation

getDataset :: (MonadThrow io, MonadIO io) => Dataset a -> io [a] Source #

Load a dataset into memory

data Dataset a Source #

A Dataset contains metadata for loading, caching, preprocessing and parsing data.

Constructors

Dataset
Fields

source :: Source	Dataset source
temporaryDirectory :: Maybe FilePath	Temporary directory (optional)
preProcess :: Maybe (ByteString -> ByteString)	Dataset preprocessing function (optional)
readAs :: ReadAs a

data Source Source #

A Dataset source can be a URL (for remotely-hosted datasets), the filepath of a local file, or a local folder of images (ImgFolder).

Constructors

URL (Url h)
File FilePath
ImgFolder FilePath (NonEmpty String)

getDatavec :: (MonadThrow io, MonadIO io, Vector v a) => Dataset a -> io (v a) Source #

Load a dataset into memory as a vector

defaultTempDir :: Maybe FilePath -> IO FilePath Source #

Reify an optional temporary directory

getFileFromSource Source #

Arguments

:: FilePath	Cache directory
-> Source
-> IO (NonEmpty ByteString)

Get the contents of the specified Source as a non-empty list of ByteStrings

Parsing datasets

readDataset Source #

Arguments

:: ReadAs a	How to parse the raw data string
-> ByteString	The raw data string
-> [a]

Parse a ByteString into a list of Haskell values

safeReadDataset :: (Vector v a, MonadThrow m) => ReadAs a -> NonEmpty ByteString -> m (v a) Source #

Parse a non-empty list of ByteStrings into a vector of Haskell values

data ReadAs a where Source #

ReadAs is a datatype to describe data formats that hold data sets

Constructors

JSON :: FromJSON a => ReadAs a
CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
Parsable :: Parser a -> ReadAs a
ImageFolder :: NonEmpty String -> ReadAs (String, FilePath)