Safe Haskell | None |
---|---|
Language | Haskell98 |
The datasets package defines two different kinds of datasets:
- small data sets which are directly (or indirectly with `file-embed`) embedded in the package as pure values and do not require network or IO to download the data set.
- other data sets which need to be fetched over the network with `getDataset` and are cached in a local temporary directory.
This module defines the `getDataset` function for fetching datasets
and utilities for defining new data sets. It is only necessary to import
this module when using fetched data sets. Embedded data sets can be
imported directly.
- getDataset :: Dataset a -> IO [a]
- type Dataset a = FilePath -> IO [a]
- data Source = URL String
- csvDatasetPreprocess :: FromRecord a => (ByteString -> ByteString) -> Source -> Dataset a
- csvDataset :: FromRecord a => Source -> Dataset a
- csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
- csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
- jsonDataset :: FromJSON a => Source -> Dataset a
- getFileFromSource :: FilePath -> Source -> IO ByteString
- parseCSV :: FromRecord a => (ByteString -> ByteString) -> ByteString -> [a]
- parseCSVHdr :: FromNamedRecord a => ByteString -> [a]
- parseCSVHdrSep :: FromNamedRecord a => Char -> ByteString -> [a]
- parseJSON :: FromJSON a => ByteString -> [a]
- dashToCamelCase :: String -> String
- parseDashToCamelField :: Read a => Field -> Parser a
- parseReadField :: Read a => Field -> Parser a
- dropLines :: Int -> ByteString -> ByteString
- fixAmericanDecimals :: ByteString -> ByteString
- fixedWidthToCSV :: ByteString -> ByteString
- yearToUTCTime :: Double -> UTCTime
Using datasets
getDataset :: Dataset a -> IO [a] Source #
Load a dataset, using the system temporary directory as a cache
A dataset is defined as a function from the caching directory to the IO action that loads the data
Defining datasets
csvDatasetPreprocess :: FromRecord a => (ByteString -> ByteString) -> Source -> Dataset a Source #
Define a dataset from a pre-processing function and a source for a CSV file
csvDataset :: FromRecord a => Source -> Dataset a Source #
Define a dataset from a source for a CSV file
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a Source #
Define a dataset from a source for a CSV file with a known header
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a Source #
Define a dataset from a source for a CSV file with a known header and separator
jsonDataset :: FromJSON a => Source -> Dataset a Source #
Define a dataset from a source for a JSON file. The data file must be accessible over HTTP, not HTTPS.
getFileFromSource :: FilePath -> Source -> IO ByteString Source #
Get a ByteString from the specified Source
parseCSV :: FromRecord a => (ByteString -> ByteString) -> ByteString -> [a] Source #
Parse CSV file
parseCSVHdr :: FromNamedRecord a => ByteString -> [a] Source #
Parse CSV file with known header
parseCSVHdrSep :: FromNamedRecord a => Char -> ByteString -> [a] Source #
Parse CSV file with known header and separator
parseJSON :: FromJSON a => ByteString -> [a] Source #
Parse JSON file
Helper functions for parsing
dashToCamelCase :: String -> String Source #
Turn dashes to CamelCase
parseDashToCamelField :: Read a => Field -> Parser a Source #
Parse a field, first turning dashes to CamelCase
dropLines :: Int -> ByteString -> ByteString Source #
Drop lines from a bytestring
fixAmericanDecimals :: ByteString -> ByteString Source #
Turn US-style decimals starting with a period (e.g. .2) into something Haskell can parse (e.g. 0.2)
fixedWidthToCSV :: ByteString -> ByteString Source #
Convert a Fixed-width format to a CSV
Helper functions for data analysis
yearToUTCTime :: Double -> UTCTime Source #
Convert a fractional year to UTCTime. Precision is limited to the level of seconds because leap seconds are not taken into account.