module Numeric.Datasets where
import Network.HTTP
import Data.Csv
import System.FilePath
import System.Directory
import Data.Hashable
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import qualified Data.Vector as V
import qualified Data.Aeson as JSON
import Control.Applicative
import Data.Char (toUpper)
import Text.Read (readMaybe)
import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString.Lazy.Search (replace)
-- | Run a dataset loader, handing it the @haskds@ subdirectory of the
-- system temporary directory as the directory to work in.
getDataset :: Dataset a -> IO [a]
getDataset ds =
  getTemporaryDirectory >>= \tmpRoot -> ds (tmpRoot </> "haskds")
-- | A dataset loader: given a directory path, an 'IO' action producing
-- the list of records. 'getDataset' supplies the directory.
type Dataset a = FilePath -> IO [a]
-- | Where the raw bytes of a dataset come from.
data Source
  = URL String -- ^ Location given as a URL string.
-- | Build a CSV dataset loader whose raw contents are passed through
-- @preF@ before being decoded with 'parseCSV'.
--
-- The contents are obtained with 'getFileFromSource' using the directory
-- the resulting 'Dataset' is applied to.
csvDatasetPreprocess :: FromRecord a => (BL.ByteString -> BL.ByteString) -> Source -> Dataset a
csvDatasetPreprocess preF src cacheDir = do
  rawContents <- getFileFromSource cacheDir src
  return (parseCSV preF rawContents)
-- | Build a CSV dataset loader that decodes the fetched contents as-is,
-- i.e. 'csvDatasetPreprocess' with the identity preprocessing step.
csvDataset :: FromRecord a => Source -> Dataset a
csvDataset src = csvDatasetPreprocess id src
-- | Build a JSON dataset loader: obtain the contents with
-- 'getFileFromSource' and decode them with 'parseJSON'.
jsonDataset :: JSON.FromJSON a => Source -> Dataset a
jsonDataset src cacheDir =
  parseJSON <$> getFileFromSource cacheDir src
-- | Fetch the contents of a 'Source', caching them on disk.
--
-- The cache file lives in @cacheDir@ (created if missing) and is named
-- @ds@ followed by the hash of the URL. If that file already exists its
-- contents are returned without touching the network; otherwise the URL
-- is downloaded, written to the cache file and returned.
--
-- Fails with an 'IOError' if the connection fails or if the server
-- answers with a non-2xx status. In the latter case nothing is written
-- to the cache, so an error page is never stored and replayed on later
-- calls.
getFileFromSource :: FilePath -> Source -> IO (BL.ByteString)
getFileFromSource cacheDir (URL url) = do
  createDirectoryIfMissing True cacheDir
  let fnm = cacheDir </> "ds" <> show (hash url)
      -- Rebuild the String-bodied GET request with an empty lazy
      -- ByteString body so the request is typed as 'Request BL.ByteString'.
      castRequest :: Request String -> Request BL.ByteString
      castRequest r = Request (rqURI r) (rqMethod r) (rqHeaders r) BL.empty
  ex <- doesFileExist fnm
  if ex
    then BL.readFile fnm
    else do
      rsp <- simpleHTTP (castRequest $ getRequest url)
      code <- getResponseCode rsp
      case code of
        (2, _, _) -> do
          bs <- getResponseBody rsp
          BL.writeFile fnm bs
          return bs
        (a, b, c) ->
          -- Do not cache the body of a failed request.
          ioError . userError $
            "getFileFromSource: HTTP status "
              ++ concatMap show [a, b, c]
              ++ " while fetching "
              ++ url
-- | Decode header-less CSV into a list of records.
--
-- @preF@ is applied to the raw contents before decoding. A decoding
-- failure is raised via 'error' with the decoder's message.
parseCSV :: FromRecord a => (BL.ByteString -> BL.ByteString) -> BL.ByteString -> [a]
parseCSV preF contents =
  either error V.toList decoded
  where
    decoded = decode NoHeader (preF contents)
-- | Decode a JSON document into a list of values.
--
-- Calls 'error' with a fixed message when the document cannot be decoded.
parseJSON :: JSON.FromJSON a => BL.ByteString -> [a]
parseJSON bs =
  maybe (error "failed to parse json") id (JSON.decode bs)
-- | Turn a dash-separated name into camel case: each dash that is
-- followed by another character is dropped and that character is
-- upper-cased. A dash at the very end of the input is kept as-is.
--
-- >>> dashToCamelCase "foo-bar-baz"
-- "fooBarBaz"
dashToCamelCase :: String -> String
dashToCamelCase str = case str of
  '-' : next : rest -> toUpper next : dashToCamelCase rest
  ch : rest -> ch : dashToCamelCase rest
  [] -> []
-- | Parse a CSV field by converting its text with 'dashToCamelCase' and
-- then reading the result with the target type's 'Read' instance.
--
-- Fails in the 'Parser' with the message @"unknown"@ if the text cannot
-- be read.
parseDashToCamelField :: Read a => Field -> Parser a
parseDashToCamelField s =
  maybe (fail "unknown") pure parsed
  where
    parsed = readMaybe (dashToCamelCase (unpack s))
-- | Parse a CSV field by reading its text with the target type's 'Read'
-- instance.
--
-- Fails in the 'Parser' with the message @"unknown"@ if the text cannot
-- be read.
parseReadField :: Read a => Field -> Parser a
parseReadField s =
  maybe (fail "unknown") pure parsed
  where
    parsed = readMaybe (unpack s)
-- | Drop the first @n@ lines (each terminated by @'\n'@) from the input.
--
-- A non-positive @n@ leaves the input unchanged. If the input has fewer
-- than @n@ lines the result is empty rather than an exception.
dropLines :: Int -> BL.ByteString -> BL.ByteString
dropLines n s
  | n <= 0 || BL.null s = s
  | otherwise =
      -- Skip up to the next newline, then the newline itself; 'BL.drop'
      -- is used instead of 'BL.tail' so a missing final newline is safe.
      dropLines (n - 1) . BL.drop 1 $ BL8.dropWhile (/= '\n') s
-- | Replace every occurrence of @",."@ with @",0."@, so that a field
-- starting with a bare decimal point right after a comma gains a
-- leading zero.
fixAmericanDecimals :: BL.ByteString -> BL.ByteString
fixAmericanDecimals input = replace bareDecimal paddedDecimal input
  where
    bareDecimal = ",."
    paddedDecimal = ",0." :: BL.ByteString
-- | Convert space-separated, fixed-width text to comma-separated text.
--
-- Spaces at the start of each line are removed, and every other run of
-- one or more spaces becomes a single comma. Newlines are kept.
fixedWidthToCSV :: BL.ByteString -> BL.ByteString
fixedWidthToCSV = BL8.pack . startOfLine . BL8.unpack
  where
    -- At the beginning of a line: discard leading spaces first.
    startOfLine = withinLine . skipSpaces

    -- Inside a line: collapse each run of spaces into one comma.
    withinLine str = case str of
      [] -> []
      ' ' : rest -> ',' : withinLine (skipSpaces rest)
      '\n' : rest -> '\n' : startOfLine rest
      ch : rest -> ch : withinLine rest

    skipSpaces = dropWhile (== ' ')