module Numeric.Datasets.Netflix where
import Prelude hiding (takeWhile)
import Numeric.Datasets
import Data.FileEmbed
import Data.ByteString hiding (map, takeWhile)
import Data.Time (Day, fromGregorian)
import Data.Time.Calendar (fromGregorianValid)
import Control.Applicative
import Data.Monoid (mconcat)
import Data.Traversable (traverse)
import qualified Data.Attoparsec.Internal.Types as PT (Parser)
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 hiding (takeWhile, inClass)
-- | Files found under @datafiles\/netflix\/training\/@, embedded into the
-- program at compile time by the 'embedDir' Template Haskell splice.
-- Each element pairs a file path (as reported by 'embedDir') with that
-- file's raw bytes.
trainingSet :: [(FilePath, ByteString)]
trainingSet = $(embedDir "datafiles/netflix/training/")
-- | Files found under @datafiles\/netflix\/test\/@, embedded into the
-- program at compile time by the 'embedDir' Template Haskell splice.
-- Each element pairs a file path (as reported by 'embedDir') with that
-- file's raw bytes.
testSet :: [(FilePath, ByteString)]
testSet = $(embedDir "datafiles/netflix/test/")
-- | Files found under @datafiles\/netflix\/movies\/@, embedded into the
-- program at compile time by the 'embedDir' Template Haskell splice.
-- Each element pairs a file path (as reported by 'embedDir') with that
-- file's raw bytes.
movies :: [(FilePath, ByteString)]
movies = $(embedDir "datafiles/netflix/movies/")
-- | A user together with a calendar day. Shared by 'Train' rows (which
-- add a rating value) and 'Test' rows (which carry nothing else).
data RatingDate = RatingDate
  { userId     :: UserId  -- ^ the user this record belongs to
  , ratingDate :: Day     -- ^ the day attached to this record
  } deriving (Eq, Show)
-- | Numeric user identifier, wrapped so it cannot be confused with other
-- 'Int' values such as a 'MovieId' or a rating.
newtype UserId = UserId
  { unUserId :: Int  -- ^ unwrap to the underlying number
  } deriving (Eq)

-- | Shown as the bare number, without the constructor name.
instance Show UserId where
  show (UserId n) = show n
-- | One parsed row of a training file: who rated, when, and the value.
data Train = Train
  { trainRating :: RatingDate  -- ^ user and day of this row
  , rating      :: Int         -- ^ rating value exactly as parsed
  } deriving (Eq, Show)
-- | Numeric movie identifier, wrapped so it cannot be confused with other
-- 'Int' values such as a 'UserId' or a rating.
newtype MovieId = MovieId
  { unMovieId :: Int  -- ^ unwrap to the underlying number
  } deriving (Eq)

-- | Shown as the bare number, without the constructor name.
instance Show MovieId where
  show (MovieId n) = show n
-- | One entry of the movie listing.
data Movie = Movie
  { movieId     :: MovieId     -- ^ identifier of the movie
  , releaseYear :: Day         -- ^ release year, represented as a 'Day'
  , movieTitle  :: ByteString  -- ^ title, kept as raw bytes
  } deriving (Eq, Show)
-- | One parsed row of a test file: a user and a day, with no rating value.
newtype Test = Test
  { testRating :: RatingDate  -- ^ user and day of this row
  } deriving (Eq, Show)
-- | All training rows that belong to one movie.
data TrainCol = TrainC
  { tcMovieId  :: MovieId  -- ^ the movie these rows refer to
  , tcTrainSet :: [Train]  -- ^ the rows, in the order they were parsed
  } deriving (Eq, Show)
-- | A rating value of caller-chosen numeric type paired with its day.
data RD a = RD
  { rdRating :: a    -- ^ the rating value
  , rdDate   :: Day  -- ^ the day attached to the rating
  } deriving (Eq, Show)
-- | Flatten one movie's training rows into @(user, movie, rating)@
-- triples. Every triple carries the same 'MovieId'; the 'Int' rating is
-- converted to the requested numeric type with 'fromIntegral'.
toCoordsCol :: Num a => TrainCol -> [(UserId, MovieId, RD a)]
toCoordsCol tc =
  [ (userId who, movie, RD (fromIntegral (rating row)) (ratingDate who))
  | row <- tcTrainSet tc
  , let who = trainRating row
  ]
  where
    movie = tcMovieId tc
-- | Parse every embedded training file and concatenate the per-file
-- results into a single list of @(user, movie, rating)@ triples.
-- Returns 'Left' with the parser's message if any file fails to parse.
parseTrainingSet :: Num a => Either String [(UserId, MovieId, RD a)]
parseTrainingSet = fmap mconcat parseTrainingSet'
-- | Parse every embedded training file, keeping one list of triples per
-- file. 'traverse' in 'Either' stops at the first file that fails, so the
-- result is either all files or the first error message.
parseTrainingSet' :: Num a => Either String [[(UserId, MovieId, RD a)]]
parseTrainingSet' =
  fmap (map toCoordsCol) (traverse parseFile trainingSet)
  where
    -- Only the file contents matter; the embedded path is ignored.
    parseFile entry = parseOnly trainingSetParser (snd entry)
-- | Parse one training file: a single movie stanza whose rows are
-- 'trainRow's, packaged as a 'TrainCol'.
trainingSetParser :: PT.Parser ByteString TrainCol
trainingSetParser = uncurry TrainC <$> stanza trainRow
-- | Parse a test file: one or more movie stanzas whose rows are
-- 'testRow's, each returned with the 'MovieId' from its header.
testSetParser :: PT.Parser ByteString [(MovieId, [Test])]
testSetParser = many1 movieBlock
  where
    movieBlock = stanza testRow
-- | Parse the movie listing: one or more 'moviesRow's, each terminated by
-- an end-of-line.
moviesParser :: PT.Parser ByteString [Movie]
moviesParser = many1 (moviesRow <* endOfLine)
-- | Parse one training row of the form @<user id>,<rating>,<date>@.
-- The line terminator is left for the caller to consume.
trainRow :: PT.Parser ByteString Train
trainRow = mkTrain <$> decc <*> decc <*> date
  where
    -- Fields arrive in file order (user, rating, day) and are regrouped
    -- into the record shape.
    mkTrain uid rate day = Train (RatingDate (UserId uid) day) rate
-- | Parse one test row of the form @<user id>,<date>@.
-- The line terminator is left for the caller to consume.
testRow :: PT.Parser ByteString Test
testRow = mkTest <$> decc <*> date
  where
    mkTest uid day = Test (RatingDate (UserId uid) day)
-- | Parse one row of the movie listing: @<movie id>,<year>,<title>@.
--
-- The title is everything after the second comma up to, but not
-- including, the line terminator, so it may contain any byte (commas,
-- apostrophes, other punctuation, non-ASCII). The year is stored as
-- 1 January of that year because 'Movie' keeps it as a 'Day'.
-- The line terminator itself is left for the caller to consume.
moviesRow :: PT.Parser ByteString Movie
moviesRow = do
  mo <- decc
  ye <- decc
  -- Take the rest of the line verbatim. A fixed whitelist of characters
  -- would stop at the first unlisted byte; the 'endOfLine' that callers
  -- such as 'parseRows' run next would then fail, and 'many1' would
  -- silently drop this row and every row after it.
  title <- takeWhile (not . isEndOfLine)
  return $ Movie (MovieId mo) (fromGregorian (fromIntegral ye) 1 1) title
-- | Run a row parser one or more times, requiring an end-of-line after
-- every row (including the last one).
parseRows :: PT.Parser ByteString a -> PT.Parser ByteString [a]
parseRows = many1 . (<* endOfLine)
-- | Parse one movie stanza: a header line @<movie id>:@ followed by one
-- or more rows, each on its own line. Returns the header's id together
-- with the parsed rows.
stanza :: PT.Parser ByteString a -> PT.Parser ByteString (MovieId, [a])
stanza row = (,) <$> movieHeader <*> parseRows row
  where
    -- 'ident' yields an 'Integer'; narrow it to the 'Int' held by 'MovieId'.
    movieHeader = MovieId . fromIntegral <$> (ident <* endOfLine)
-- | Parse a calendar date written as @<year>-<month>-<day>@ in decimal.
--
-- Exactly three dash-separated fields are consumed; anything after the
-- day is left in the input for the caller. The parser fails, rather than
-- silently clipping as 'fromGregorian' would, when the month or day is
-- out of range for the given year.
date :: PT.Parser ByteString Day
date = do
  yy <- decimal
  _ <- dash
  mm <- decimal
  _ <- dash
  dd <- decimal
  -- 'fromGregorianValid' yields Nothing for impossible dates such as
  -- 2005-02-30; report that as an ordinary parse failure.
  maybe (fail "date: not a valid calendar date") pure
        (fromGregorianValid yy mm dd)
-- | Match a single literal comma and return it.
comma :: Parser Char
comma = char ','

-- | Match a single literal dash (@-@) and return it.
dash :: Parser Char
dash = char '-'
-- | Parse an unsigned decimal number followed by a comma; the comma is
-- consumed and discarded.
decc :: PT.Parser ByteString Int
decc = decimal <* comma
-- | Parse an unsigned decimal number followed by a colon; the colon is
-- consumed and discarded.
ident :: PT.Parser ByteString Integer
ident = decimal <* char ':'