-- | A minimal dataset abstraction: a collection of known size whose
-- elements are fetched by index through an 'IO' action.
--
-- A 'Dataset' can be backed either by an in-memory vector ('withVect')
-- or by one file per element in a temporary directory ('withDisk');
-- 'withData' picks between the two based on a flag.  'loadData' and
-- 'sample' read elements back out of a dataset.
module Numeric.SGD.Dataset
(
Dataset (..)
, loadData
, sample
, withVect
, withDisk
, withData
) where
import Control.Monad (forM_)
import Data.Binary (Binary, encodeFile, decode)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import System.IO.Unsafe (unsafeInterleaveIO)
import System.IO.Temp (withTempDirectory)
import System.FilePath ((</>))
import qualified System.Random as R
import qualified Data.Vector as V
import qualified Control.Monad.State.Strict as S
-- | A collection of elements of type @a@, described by its size and by
-- an 'IO' action that fetches the element stored at a given index.
data Dataset a = Dataset
    { size   :: Int
      -- ^ The number of elements in the dataset.
    , elemAt :: Int -> IO a
      -- ^ Fetch the element at the given index.  The datasets built in
      -- this module use indices @0 .. size - 1@.
    }
-- | Load the whole dataset as a list, in index order.
--
-- Elements are fetched with 'elemAt' for the indices @0 .. size - 1@,
-- so an empty dataset yields an empty list.  Fetching goes through
-- 'lazyMapM': only the first element is read eagerly, the remaining
-- ones are read when the corresponding part of the list is demanded.
loadData :: Dataset a -> IO [a]
loadData dataset = lazyMapM (elemAt dataset) [0 .. size dataset - 1]
-- | Draw @n@ elements from the dataset at random, with replacement.
--
-- Returns the drawn elements together with the updated generator.
-- When @n@ is not positive, or when the dataset is empty (there is
-- nothing to draw from), the result is an empty list and the generator
-- is returned unchanged.
--
-- NOTE(review): 'R.next' is kept so that seeded runs produce the same
-- sequence as before; newer versions of the @random@ package mark it
-- as deprecated -- confirm against the version in use.
sample :: R.RandomGen g => g -> Int -> Dataset a -> IO ([a], g)
sample g n dataset
    | n <= 0 || size dataset <= 0 = return ([], g)
    | otherwise = do
        -- The recursive call performs the first @n - 1@ draws; the
        -- element drawn here is prepended to them.
        (xs, g') <- sample g (n - 1) dataset
        let (i, g'') = R.next g'
        -- 'mod' with a positive divisor always lands in @[0, size)@,
        -- even if the generator yields a negative number.
        x <- dataset `elemAt` (i `mod` size dataset)
        return (x : xs, g'')
-- | A lazy variant of 'mapM' for 'IO'.
--
-- The action for the head of the list is run immediately; the actions
-- for the rest of the list are deferred with 'unsafeInterleaveIO' and
-- are only run once the tail of the resulting list is demanded.
lazyMapM :: (a -> IO b) -> [a] -> IO [b]
lazyMapM action = go
  where
    go []            = return []
    go (item : rest) = do
        first  <- action item
        -- Postpone the remaining actions until the tail is inspected.
        others <- unsafeInterleaveIO (go rest)
        return (first : others)
-- | Run the handler on an in-memory 'Dataset' built from the given list.
--
-- The list is converted to a vector; 'size' is the vector's length and
-- 'elemAt' indexes into it with 'V.!', which performs the only bounds
-- checking there is.
withVect :: [a] -> (Dataset a -> IO b) -> IO b
withVect xs handler =
    let items    = V.fromList xs
        inMemory = Dataset
            { size   = V.length items
            , elemAt = return . (items V.!)
            }
    in  handler inMemory
-- | Run the handler on a disk-backed 'Dataset' built from the given list.
--
-- Every element is written with 'encodeFile' to its own file, named
-- after the element's position in the list, inside a temporary
-- directory created under the current directory.  'elemAt' reads the
-- corresponding file back and decodes it.  The directory is scoped to
-- this call by 'withTempDirectory', so the dataset should not be used
-- after the handler has returned.
withDisk :: Binary a => [a] -> (Dataset a -> IO b) -> IO b
withDisk xs handler = withTempDirectory "." ".sgd" $ \tmpDir -> do
    let pathOf :: Int -> FilePath
        pathOf ix = tmpDir </> show ix
        -- Store one element under the current counter value, then
        -- advance the counter; the final counter is the dataset size.
        store x = do
            ix <- S.get
            S.lift $ encodeFile (pathOf ix) x
            S.put (ix + 1)
        -- The file is read strictly before being handed to 'decode'.
        fetch ix = decode . BL.fromChunks . (: []) <$> B.readFile (pathOf ix)
    total <- S.execStateT (forM_ xs store) 0
    handler $ Dataset { size = total, elemAt = fetch }
-- | Build a 'Dataset' from a list and run the handler on it, choosing
-- the backend by the flag: 'True' stores the elements on disk
-- ('withDisk'), 'False' keeps them in memory ('withVect').
withData :: Binary a => Bool -> [a] -> (Dataset a -> IO b) -> IO b
withData True  = withDisk
withData False = withVect