module Numeric.Datasets where
import Data.Csv
import System.FilePath
import System.Directory
import Data.Hashable
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import qualified Data.Vector as V
import qualified Data.Aeson as JSON
import Control.Applicative
import Data.Time
import Data.Char (ord)
import qualified Network.Wreq as Wreq
import Lens.Micro ((^.))
import Data.Char (toUpper)
import Text.Read (readMaybe)
import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString.Lazy.Search (replace)
getDataset :: Dataset a -> IO [a]
getDataset ds = do
dir <- getTemporaryDirectory
ds $ dir </> "haskds"
type Dataset a = FilePath
-> IO [a]
data Source = URL String
csvDatasetPreprocess :: FromRecord a => (BL.ByteString -> BL.ByteString) -> Source -> Dataset a
csvDatasetPreprocess preF src cacheDir = do
parseCSV preF <$> getFileFromSource cacheDir src
csvDataset :: FromRecord a => Source -> Dataset a
csvDataset = csvDatasetPreprocess id
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
csvHdrDataset src cacheDir = do
parseCSVHdr <$> getFileFromSource cacheDir src
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
csvHdrDatasetSep sepc src cacheDir = do
parseCSVHdrSep sepc <$> getFileFromSource cacheDir src
jsonDataset :: JSON.FromJSON a => Source -> Dataset a
jsonDataset src cacheDir = do
bs <- getFileFromSource cacheDir src
return $ parseJSON bs
getFileFromSource :: FilePath -> Source -> IO (BL.ByteString)
getFileFromSource cacheDir (URL url) = do
createDirectoryIfMissing True cacheDir
let fnm = cacheDir </> "ds" <> show (hash url)
ex <- doesFileExist fnm
if ex
then BL.readFile fnm
else do
rsp <- Wreq.get url
let bs = rsp ^. Wreq.responseBody
BL.writeFile fnm bs
return bs
parseCSV :: FromRecord a => (BL.ByteString -> BL.ByteString) -> BL.ByteString -> [a]
parseCSV preF contents =
case decode NoHeader (preF contents) of
Right theData -> V.toList theData
Left err -> error err
parseCSVHdr :: FromNamedRecord a => BL.ByteString -> [a]
parseCSVHdr contents =
case decodeByName contents of
Right (_,theData) -> V.toList theData
Left err -> error err
parseCSVHdrSep :: FromNamedRecord a => Char -> BL.ByteString -> [a]
parseCSVHdrSep sepc contents =
let opts = defaultDecodeOptions { decDelimiter = fromIntegral (ord sepc)} in
case decodeByNameWith opts contents of
Right (_,theData) -> V.toList theData
Left err -> error err
parseJSON :: JSON.FromJSON a => BL.ByteString -> [a]
parseJSON bs = case JSON.decode bs of
Just theData -> theData
Nothing -> error "failed to parse json"
dashToCamelCase :: String -> String
dashToCamelCase ('-':c:cs) = toUpper c : dashToCamelCase cs
dashToCamelCase (c:cs) = c : dashToCamelCase cs
dashToCamelCase [] = []
parseDashToCamelField :: Read a => Field -> Parser a
parseDashToCamelField s =
case readMaybe (dashToCamelCase $ unpack s) of
Just wc -> pure wc
Nothing -> fail "unknown"
parseReadField :: Read a => Field -> Parser a
parseReadField s =
case readMaybe (unpack s) of
Just wc -> pure wc
Nothing -> fail "unknown"
dropLines :: Int -> BL.ByteString -> BL.ByteString
dropLines 0 s = s
dropLines n s = dropLines (n1) $ BL.tail $ BL8.dropWhile (/='\n') s
fixAmericanDecimals :: BL.ByteString -> BL.ByteString
fixAmericanDecimals = replace ",." (",0."::BL.ByteString)
fixedWidthToCSV :: BL.ByteString -> BL.ByteString
fixedWidthToCSV = BL8.pack . fnl . BL8.unpack where
f [] = []
f (' ':cs) = ',':f (chomp cs)
f ('\n':cs) = '\n':fnl cs
f (c:cs) = c:f cs
fnl cs = f (chomp cs) --newline
chomp (' ':cs) = chomp cs
chomp (c:cs) = c:cs
chomp [] = []
yearToUTCTime :: Double -> UTCTime
yearToUTCTime yearDbl =
let (yearn,yearFrac) = properFraction yearDbl
dayYearBegin = fromGregorian yearn 1 1
(dayn, dayFrac) = properFraction $ yearFrac * (if isLeapYear yearn then 366 else 365)
day = addDays dayn dayYearBegin
dt = secondsToDiffTime $ round $ dayFrac * 86400
in UTCTime day dt