module Swish.RDF.Parser.Turtle
( ParseResult
, parseTurtle
, parseTurtlefromText
)
where
import Swish.GraphClass (arc)
import Swish.Namespace (Namespace, ScopedName)
import Swish.Namespace (makeNamespace, getNamespaceTuple
, getScopeNamespace, getScopedNameURI
, getScopeNamespace, makeURIScopedName, makeNSScopedName)
import Swish.QName (newLName, emptyLName)
import Swish.RDF.Graph
( RDFGraph, RDFLabel(..)
, ToRDFLabel(..)
, NamespaceMap
, addArc
, setNamespaces
, emptyRDFGraph
)
import Swish.RDF.Vocabulary
( LanguageTag
, toLangTag
, rdfType
, rdfFirst, rdfRest, rdfNil
, xsdBoolean, xsdInteger, xsdDecimal, xsdDouble
, defaultBase
)
import Swish.RDF.Datatype (makeDatatypedLiteral)
import Swish.RDF.Parser.Utils
( ParseResult
, runParserWithError
, ignore
, noneOf
, char
, ichar
, string
, stringT
, symbol
, isymbol
, lexeme
, whiteSpace
, hex4
, hex8
, appendURIs
)
import Control.Applicative
import Control.Monad (foldM)
import Data.Char (chr, ord, isAsciiLower, isAsciiUpper, isDigit)
import Data.Maybe (fromMaybe, fromJust)
import Data.Word (Word32)
import Network.URI (URI(..), parseURIReference)
import Network.URI.Ord ()
import Text.ParserCombinators.Poly.StateText
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
data TurtleState = TurtleState
{ graphState :: RDFGraph
, prefixUris :: NamespaceMap
, baseUri :: URI
, nodeGen :: Word32
} deriving Show
setPrefix :: Maybe T.Text -> URI -> TurtleState -> TurtleState
setPrefix pre uri st = st { prefixUris=p' }
where
p' = M.insert pre uri (prefixUris st)
setBase :: URI -> TurtleState -> TurtleState
setBase buri st = st { baseUri = buri }
getDefaultPrefix :: TurtleParser Namespace
getDefaultPrefix = do
s <- stGet
case getPrefixURI s Nothing of
Just uri -> return $ makeNamespace Nothing uri
_ -> failBad "No default prefix defined; how unexpected (probably a programming error)!"
getPrefixURI :: TurtleState -> Maybe T.Text -> Maybe URI
getPrefixURI st pre = M.lookup pre (prefixUris st)
findPrefixNamespace :: Maybe L.Text -> TurtleParser Namespace
findPrefixNamespace (Just p) = findPrefix (L.toStrict p)
findPrefixNamespace Nothing = getDefaultPrefix
updateGraph :: (RDFGraph -> RDFGraph) -> TurtleState -> TurtleState
updateGraph f s = s { graphState = f (graphState s) }
type TurtleParser a = Parser TurtleState a
parseTurtlefromText ::
L.Text
-> ParseResult
parseTurtlefromText = flip parseTurtle Nothing
parseTurtle ::
L.Text
-> Maybe URI
-> ParseResult
parseTurtle txt mbase = parseAnyfromText turtleDoc mbase txt
hashURI :: URI
hashURI = fromJust $ parseURIReference "#"
emptyState ::
Maybe URI
-> TurtleState
emptyState mbase =
let pmap = M.singleton Nothing hashURI
buri = fromMaybe (getScopedNameURI defaultBase) mbase
in TurtleState
{ graphState = emptyRDFGraph
, prefixUris = pmap
, baseUri = buri
, nodeGen = 0
}
parseAnyfromText ::
TurtleParser a
-> Maybe URI
-> L.Text
-> Either String a
parseAnyfromText parser mbase = runParserWithError parser (emptyState mbase)
newBlankNode :: TurtleParser RDFLabel
newBlankNode = do
n <- stQuery (succ . nodeGen)
stUpdate $ \s -> s { nodeGen = n }
return $ Blank (show n)
comma, semiColon , fullStop :: TurtleParser ()
comma = isymbol ","
semiColon = isymbol ";"
fullStop = isymbol "."
sQuot, dQuot, sQuot3, dQuot3 :: TurtleParser ()
sQuot = ichar '\''
dQuot = ichar '"'
sQuot3 = ignore $ string "'''"
dQuot3 = ignore $ string "\"\"\""
match :: (Ord a) => a -> [(a,a)] -> Bool
match v = any (\(l,h) -> v >= l && v <= h)
br :: String -> String -> TurtleParser a -> TurtleParser a
br lsym rsym = bracket (symbol lsym) (symbol rsym)
atWord :: T.Text -> TurtleParser ()
atWord s = char '@' *> lexeme (stringT s) *> pure ()
addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> TurtleParser ()
addStatement s p o@(TypedLit _ dtype) | dtype `elem` [xsdBoolean, xsdInteger, xsdDecimal, xsdDouble] = do
ost <- stGet
let stmt = arc s p o
oldp = prefixUris ost
ogs = graphState ost
(nspre, nsuri) = getNamespaceTuple $ getScopeNamespace dtype
newp = M.insert nspre nsuri oldp
stUpdate $ \st -> st { prefixUris = newp, graphState = addArc stmt ogs }
addStatement s p o = stUpdate (updateGraph (addArc (arc s p o) ))
isaz, isAZ, isaZ, is09, isaZ09 :: Char -> Bool
isaz = isAsciiLower
isAZ = isAsciiUpper
isaZ c = isaz c || isAZ c
is09 = isDigit
isaZ09 c = isaZ c || is09 c
d2s :: L.Text -> RDFLabel
d2s =
let conv :: String -> Double
conv = read
in toRDFLabel . conv . L.unpack
operatorLabel :: ScopedName -> TurtleParser RDFLabel
operatorLabel snam = do
st <- stGet
let (pkey, pval) = getNamespaceTuple $ getScopeNamespace snam
opmap = prefixUris st
rval = Res snam
case M.lookup pkey opmap of
Just val | val == pval -> return rval
| otherwise -> do
stUpdate $ \s -> s { prefixUris = M.insert pkey pval opmap }
return rval
_ -> do
stUpdate $ \s -> s { prefixUris = M.insert pkey pval opmap }
return rval
findPrefix :: T.Text -> TurtleParser Namespace
findPrefix pre = do
st <- stGet
case M.lookup (Just pre) (prefixUris st) of
Just uri -> return $ makeNamespace (Just pre) uri
Nothing -> failBad $ "Undefined prefix '" ++ T.unpack pre ++ ":'."
turtleDoc :: TurtleParser RDFGraph
turtleDoc = mkGr <$> (whiteSpace *> many statement *> eof *> stGet)
where
mkGr s = setNamespaces (prefixUris s) (graphState s)
statement :: TurtleParser ()
statement = (directive <|> triples) *> fullStop
directive :: TurtleParser ()
directive = lexeme (prefixID <|> base)
prefixID :: TurtleParser ()
prefixID = do
_prefix
p <- lexeme _pnameNS
u <- _iriRef
stUpdate (setPrefix (fmap L.toStrict p) u)
base :: TurtleParser ()
base = _base >> _iriRef >>= stUpdate . setBase
triples :: TurtleParser ()
triples = subject >>= predicateObjectList
predicateObjectList :: RDFLabel -> TurtleParser ()
predicateObjectList subj =
let term = verb >>= objectList subj
in sepBy1 term semiColon *> ignore (optional semiColon)
objectList :: RDFLabel -> RDFLabel -> TurtleParser ()
objectList subj prd = sepBy1 object comma >>= mapM_ (addStatement subj prd)
verb :: TurtleParser RDFLabel
verb = predicate <|> (lexeme (char 'a') *> operatorLabel rdfType)
subject :: TurtleParser RDFLabel
subject = (Res <$> iriref) <|> blank
predicate :: TurtleParser RDFLabel
predicate = Res <$> iriref
object :: TurtleParser RDFLabel
object = (Res <$> iriref) <|> blank <|> literal
literal :: TurtleParser RDFLabel
literal = lexeme $ rdfLiteral <|> numericLiteral <|> booleanLiteral
blank :: TurtleParser RDFLabel
blank = lexeme (_blankNodeLabel
<|>
bracket (char '[') (char ']') handleBlankNode
<|>
collection
)
handleBlankNode :: TurtleParser RDFLabel
handleBlankNode = do
bNode <- newBlankNode
_manyws
ignore $ optional $ predicateObjectList bNode
_manyws
return bNode
collection :: TurtleParser RDFLabel
collection = do
os <- br "(" ")" (many object)
eNode <- operatorLabel rdfNil
case os of
[] -> return eNode
(x:xs) -> do
sNode <- newBlankNode
first <- operatorLabel rdfFirst
addStatement sNode first x
lNode <- foldM addElem sNode xs
rest <- operatorLabel rdfRest
addStatement lNode rest eNode
return sNode
where
addElem prevNode curElem = do
bNode <- newBlankNode
first <- operatorLabel rdfFirst
rest <- operatorLabel rdfRest
addStatement prevNode rest bNode
addStatement bNode first curElem
return bNode
_base :: TurtleParser ()
_base = atWord "base"
_prefix :: TurtleParser ()
_prefix = atWord "prefix"
_uchar :: TurtleParser Char
_uchar = char '\\' *> _uchar'
_uchar' :: TurtleParser Char
_uchar' = (char 'u' *> hex4) <|> (char 'U' *> hex8)
rdfLiteral :: TurtleParser RDFLabel
rdfLiteral = do
lbl <- L.toStrict <$> turtleString
opt <- optional ((Left <$> _langTag) <|> (string "^^" *> (Right <$> iriref)))
return $ case opt of
Just (Left lcode) -> LangLit lbl lcode
Just (Right dtype) -> TypedLit lbl dtype
_ -> Lit lbl
numericLiteral :: TurtleParser RDFLabel
numericLiteral = numericLiteralNegative <|> numericLiteralPositive <|> numericLiteralUnsigned
numericLiteralUnsigned :: TurtleParser RDFLabel
numericLiteralUnsigned =
d2s <$> _double
<|>
(makeDatatypedLiteral xsdDecimal . L.toStrict <$> _decimal)
<|>
(makeDatatypedLiteral xsdInteger . L.toStrict <$> _integer)
numericLiteralPositive :: TurtleParser RDFLabel
numericLiteralPositive =
d2s <$> _doublePositive
<|>
(makeDatatypedLiteral xsdDecimal . L.toStrict <$> _decimalPositive)
<|>
(makeDatatypedLiteral xsdInteger . L.toStrict <$> _integerPositive)
numericLiteralNegative :: TurtleParser RDFLabel
numericLiteralNegative =
d2s <$> _doubleNegative
<|>
(makeDatatypedLiteral xsdDecimal . L.toStrict <$> _decimalNegative)
<|>
(makeDatatypedLiteral xsdInteger . L.toStrict <$> _integerNegative)
booleanLiteral :: TurtleParser RDFLabel
booleanLiteral = makeDatatypedLiteral xsdBoolean . T.pack <$> (string "true" <|> string "false")
turtleString :: TurtleParser L.Text
turtleString =
lexeme (
_stringLiteralLong1 <|> _stringLiteral1 <|>
_stringLiteralLong2 <|> _stringLiteral2)
iriref :: TurtleParser ScopedName
iriref = lexeme ((makeURIScopedName <$> _iriRef) <|> prefixedName)
prefixedName :: TurtleParser ScopedName
prefixedName =
_pnameLN <|>
flip makeNSScopedName emptyLName <$> (_pnameNS >>= findPrefixNamespace)
_iriRef :: TurtleParser URI
_iriRef = do
ignore $ char '<'
ustr <- manyFinally' iriRefChar (char '>')
case parseURIReference ustr of
Nothing -> failBad $ "Invalid URI: <" ++ ustr ++ ">"
Just uref -> do
s <- stGet
either fail return $ appendURIs (baseUri s) uref
iriRefChar :: TurtleParser Char
iriRefChar = satisfy notIRIChar <|> _uchar
notIRIChar :: Char -> Bool
notIRIChar c = c >= chr 0x20
&&
c `notElem` "^<>\"{}|^`\\"
_pnameNS :: TurtleParser (Maybe L.Text)
_pnameNS = optional _pnPrefix <* char ':'
_pnameLN :: TurtleParser ScopedName
_pnameLN = do
ns <- _pnameNS >>= findPrefixNamespace
l <- fmap L.toStrict _pnLocal
case newLName l of
Just lname -> return $ makeNSScopedName ns lname
_ -> fail $ "Invalid local name: '" ++ T.unpack l ++ "'"
_blankNodeLabel :: TurtleParser RDFLabel
_blankNodeLabel = (Blank . L.unpack) <$> (string "_:" *> _pnLocal)
_langTag :: TurtleParser LanguageTag
_langTag = do
ichar '@'
h <- many1Satisfy isaZ
mt <- optional (L.cons <$> char '-' <*> many1Satisfy isaZ09)
let lbl = L.toStrict $ L.append h $ fromMaybe L.empty mt
case toLangTag lbl of
Just lt -> return lt
_ -> fail ("Invalid language tag: " ++ T.unpack lbl)
_integer :: TurtleParser L.Text
_integer = many1Satisfy is09
_decimal :: TurtleParser L.Text
_decimal =
let dpart = L.cons <$> char '.' <*> (fromMaybe "0" <$> optional _integer)
in
(L.append <$> _integer <*> dpart)
<|>
(L.append "0." <$> (char '.' *> _integer))
_double :: TurtleParser L.Text
_double =
(L.append <$> _decimal <*> _exponent)
<|>
(L.append <$> _integer <*> _exponent)
_integerPositive, _decimalPositive, _doublePositive :: TurtleParser L.Text
_integerPositive = char '+' *> _integer
_decimalPositive = char '+' *> _decimal
_doublePositive = char '+' *> _double
_integerNegative, _decimalNegative, _doubleNegative :: TurtleParser L.Text
_integerNegative = L.cons <$> char '-' <*> _integer
_decimalNegative = L.cons <$> char '-' <*> _decimal
_doubleNegative = L.cons <$> char '-' <*> _double
_exponent :: TurtleParser L.Text
_exponent = do
ignore $ satisfy (`elem` "eE")
ms <- optional (satisfy (`elem` "+-"))
e <- _integer
case ms of
Just '-' -> return $ L.append "E-" e
_ -> return $ L.cons 'E' e
_stringLiteral1, _stringLiteral2 :: TurtleParser L.Text
_stringLiteral1 = _stringIt sQuot (_tChars "'\\\n\r")
_stringLiteral2 = _stringIt dQuot (_tChars "\"\\\n\r")
_stringLiteralLong1, _stringLiteralLong2 :: TurtleParser L.Text
_stringLiteralLong1 = _stringItLong sQuot3 (_tCharsLong '\'' "'\\")
_stringLiteralLong2 = _stringItLong dQuot3 (_tCharsLong '"' "\"\\")
_stringIt :: TurtleParser a -> TurtleParser Char -> TurtleParser L.Text
_stringIt sep chars = L.pack <$> bracket sep sep (many chars)
_stringItLong :: TurtleParser a -> TurtleParser L.Text -> TurtleParser L.Text
_stringItLong sep chars = L.concat <$> bracket sep sep (many chars)
_tChars :: String -> TurtleParser Char
_tChars excl = (char '\\' *> (_echar' <|> _uchar'))
<|> noneOf excl
_tCharsLong :: Char -> String -> TurtleParser L.Text
_tCharsLong c excl = do
mq <- optional $ oneOrTwo c
r <- _tChars excl
return $ L.append (fromMaybe L.empty mq) (L.singleton r)
oneOrTwo :: Char -> TurtleParser L.Text
oneOrTwo c = do
a <- char c
mb <- optional (char c)
case mb of
Just b -> return $ L.pack [a,b]
_ -> return $ L.singleton a
_echar :: TurtleParser Char
_echar = char '\\' *> _echar'
_echar' :: TurtleParser Char
_echar' =
(char 't' *> pure '\t') <|>
(char 'b' *> pure '\b') <|>
(char 'n' *> pure '\n') <|>
(char 'r' *> pure '\r') <|>
(char '\\' *> pure '\\') <|>
(char '"' *> pure '"') <|>
(char '\'' *> pure '\'')
_manyws :: TurtleParser ()
_manyws = ignore $ manySatisfy (`elem` " \t\r\n")
_pnCharsBase :: TurtleParser Char
_pnCharsBase =
let f c = let i = ord c
in isaZ c ||
match i [(0xc0, 0xd6), (0xd8, 0xf6), (0xf8, 0x2ff),
(0x370, 0x37d), (0x37f, 0x1fff), (0x200c, 0x200d),
(0x2070, 0x218f), (0x2c00, 0x2fef), (0x3001, 0xd7ff),
(0xf900, 0xfdcf), (0xfdf0, 0xfffd), (0x10000, 0xeffff)]
in satisfy f <|> _uchar
_pnCharsU :: TurtleParser Char
_pnCharsU = _pnCharsBase <|> char '_'
_pnChars :: TurtleParser Char
_pnChars =
_pnCharsU
<|>
satisfy (\c -> let i = ord c
in c == '-' || isDigit c || i == 0xb7 ||
match i [(0x0300, 0x036f), (0x203f, 0x2040)])
_pnPrefix :: TurtleParser L.Text
_pnPrefix = L.cons <$> _pnCharsBase <*> _pnRest
_pnLocal :: TurtleParser L.Text
_pnLocal = L.cons <$> (_pnCharsU <|> satisfy is09)
<*> _pnRest
_pnRest :: TurtleParser L.Text
_pnRest = do
lbl <- many (_pnChars <|> char '.')
let (nret, lclean) = clean lbl
edl = id
snocdl x xs = xs . (x:)
appenddl = (.)
replicatedl n x = (replicate n x ++)
clean :: String -> (Int, String)
clean = go 0 edl
where
go n acc [] = (n, acc [])
go n acc ('.':xs) = go (n+1) acc xs
go 0 acc (x:xs) = go 0 (snocdl x acc) xs
go n acc (x:xs) = go 0 (appenddl acc (snocdl x (replicatedl n '.'))) xs
reparse $ L.replicate (fromIntegral nret) (L.singleton '.')
return $ L.pack lclean