algebraic-graphs-io-0.5.0.1: I/O utilities and datasets for algebraic-graphs
Safe HaskellSafe-Inferred
LanguageHaskell2010

Algebra.Graph.IO.Datasets.LINQS

Contents

Description

Link-based datasets from https://linqs.soe.ucsc.edu/data

Synopsis

Documentation

restoreContent Source #

Arguments

:: Binary c 
=> FilePath

directory where the data files are saved

-> IO (Map String (Int16, Seq Int16, c)) 

Load the graph node data from local storage

data CitesRow a Source #

Who cites whom

Constructors

CitesRow 

Fields

Instances

Instances details
Generic (CitesRow a) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Associated Types

type Rep (CitesRow a) :: Type -> Type #

Methods

from :: CitesRow a -> Rep (CitesRow a) x #

to :: Rep (CitesRow a) x -> CitesRow a #

Show a => Show (CitesRow a) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

showsPrec :: Int -> CitesRow a -> ShowS #

show :: CitesRow a -> String #

showList :: [CitesRow a] -> ShowS #

Binary a => Binary (CitesRow a) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

put :: CitesRow a -> Put #

get :: Get (CitesRow a) #

putList :: [CitesRow a] -> Put #

Eq a => Eq (CitesRow a) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

(==) :: CitesRow a -> CitesRow a -> Bool #

(/=) :: CitesRow a -> CitesRow a -> Bool #

type Rep (CitesRow a) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

type Rep (CitesRow a) = D1 ('MetaData "CitesRow" "Algebra.Graph.IO.Datasets.LINQS" "algebraic-graphs-io-0.5.0.1-DMAyteJuhT8EAUES5OdwH7" 'False) (C1 ('MetaCons "CitesRow" 'PrefixI 'True) (S1 ('MetaSel ('Just "cirTo") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 a) :*: S1 ('MetaSel ('Just "cirFrom") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 a)))

data ContentRow i c Source #

Dataset row of the .content file

The .content file contains descriptions of the papers in the following format:

<paper_id> <word_attributes> <class_label>

The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper.

Constructors

CRow 

Fields

Instances

Instances details
Generic (ContentRow i c) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Associated Types

type Rep (ContentRow i c) :: Type -> Type #

Methods

from :: ContentRow i c -> Rep (ContentRow i c) x #

to :: Rep (ContentRow i c) x -> ContentRow i c #

(Show i, Show c) => Show (ContentRow i c) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

showsPrec :: Int -> ContentRow i c -> ShowS #

show :: ContentRow i c -> String #

showList :: [ContentRow i c] -> ShowS #

(Binary i, Binary c) => Binary (ContentRow i c) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

put :: ContentRow i c -> Put #

get :: Get (ContentRow i c) #

putList :: [ContentRow i c] -> Put #

(Eq i, Eq c) => Eq (ContentRow i c) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

(==) :: ContentRow i c -> ContentRow i c -> Bool #

(/=) :: ContentRow i c -> ContentRow i c -> Bool #

(Ord i, Ord c) => Ord (ContentRow i c) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

Methods

compare :: ContentRow i c -> ContentRow i c -> Ordering #

(<) :: ContentRow i c -> ContentRow i c -> Bool #

(<=) :: ContentRow i c -> ContentRow i c -> Bool #

(>) :: ContentRow i c -> ContentRow i c -> Bool #

(>=) :: ContentRow i c -> ContentRow i c -> Bool #

max :: ContentRow i c -> ContentRow i c -> ContentRow i c #

min :: ContentRow i c -> ContentRow i c -> ContentRow i c #

type Rep (ContentRow i c) Source # 
Instance details

Defined in Algebra.Graph.IO.Datasets.LINQS

type Rep (ContentRow i c) = D1 ('MetaData "ContentRow" "Algebra.Graph.IO.Datasets.LINQS" "algebraic-graphs-io-0.5.0.1-DMAyteJuhT8EAUES5OdwH7" 'False) (C1 ('MetaCons "CRow" 'PrefixI 'True) ((S1 ('MetaSel ('Just "crId") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 i) :*: S1 ('MetaSel ('Just "crIdStr") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 String)) :*: (S1 ('MetaSel ('Just "crFeatures") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Seq Int16)) :*: S1 ('MetaSel ('Just "crClass") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 c))))

Internal

stash Source #

Arguments

:: Binary c 
=> FilePath

directory where the data files will be saved

-> String

URI of .tar.gz file

-> Int

dictionary size

-> Parser c

document class

-> IO () 

Download, decompress, parse, serialize and save the dataset to local storage

sourceGraphEdges Source #

Arguments

:: (MonadResource m, MonadThrow m) 
=> FilePath

directory of data files

-> Map String (Int16, Seq Int16, c)

content data

-> ConduitT i (Maybe (Graph (ContentRow Int16 c))) m () 

Stream out the edges of the citation graph, in which the nodes are decorated with the document metadata.

The full citation graph can be reconstructed by folding over this stream and overlaying the graph edges as they arrive.

This way the graph can be partitioned in training , test and validation subsets at the usage site

loadGraph Source #

Arguments

:: Binary c 
=> FilePath

directory where the data files were saved

-> IO (Graph (ContentRow Int16 c)) 

Reconstruct the citation graph

NB : relies on the user having stashed the dataset to local disk first.