{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
module Swish.RDF.Parser.Turtle
( ParseResult
, parseTurtle
, parseTurtlefromText
)
where
import Swish.GraphClass (arc)
import Swish.Namespace (Namespace, ScopedName)
import Swish.Namespace (makeNamespace, getNamespaceTuple
, getScopeNamespace, getScopedNameURI
, getScopeNamespace, makeURIScopedName, makeNSScopedName)
import Swish.QName (newLName, emptyLName)
import Swish.RDF.Graph
( RDFGraph, RDFLabel(..)
, NamespaceMap
, addArc
, setNamespaces
, emptyRDFGraph
)
import Swish.RDF.Vocabulary
( LanguageTag
, toLangTag
, rdfType
, rdfFirst, rdfRest, rdfNil
, xsdBoolean, xsdInteger, xsdDecimal, xsdDouble
, defaultBase
)
import Swish.RDF.Datatype (makeDatatypedLiteral)
import Swish.RDF.Parser.Utils
( ParseResult
, runParserWithError
, ignore
, noneOf
, char
, ichar
, string
, stringT
, sepEndBy1
, isymbol
, lexeme
, whiteSpace
, hex4
, hex8
, appendURIs
)
import Control.Applicative
import Control.Monad (foldM)
import Data.Char (chr, isAsciiLower, isAsciiUpper, isDigit, isHexDigit, ord, toLower)
#if MIN_VERSION_base(4, 7, 0)
import Data.Functor (($>))
#endif
import Data.Maybe (fromMaybe)
import Data.Word (Word32)
import Network.URI (URI(..), parseURIReference)
import Text.ParserCombinators.Poly.StateText
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
#if !MIN_VERSION_base(4, 7, 0)
($>) :: Functor f => f a -> b -> f b
($>) = flip (<$)
#endif
data TurtleState = TurtleState
{ graphState :: RDFGraph
, prefixUris :: NamespaceMap
, baseUri :: URI
, nodeGen :: Word32
} deriving Show
setPrefix :: Maybe T.Text -> URI -> TurtleState -> TurtleState
setPrefix pre uri st = st { prefixUris=p' }
where
p' = M.insert pre uri (prefixUris st)
setBase :: URI -> TurtleState -> TurtleState
setBase buri st = st { baseUri = buri }
getDefaultPrefix :: TurtleParser Namespace
getDefaultPrefix = do
s <- stGet
case getPrefixURI s Nothing of
Just uri -> return $ makeNamespace Nothing uri
_ -> failBad "No default prefix defined; how unexpected (probably a programming error)!"
getPrefixURI :: TurtleState -> Maybe T.Text -> Maybe URI
getPrefixURI st pre = M.lookup pre (prefixUris st)
findPrefixNamespace :: Maybe L.Text -> TurtleParser Namespace
findPrefixNamespace (Just p) = findPrefix (L.toStrict p)
findPrefixNamespace Nothing = getDefaultPrefix
updateGraph :: (RDFGraph -> RDFGraph) -> TurtleState -> TurtleState
updateGraph f s = s { graphState = f (graphState s) }
type TurtleParser a = Parser TurtleState a
parseTurtlefromText ::
L.Text
-> ParseResult
parseTurtlefromText = flip parseTurtle Nothing
parseTurtle ::
L.Text
-> Maybe URI
-> ParseResult
parseTurtle txt mbase = parseAnyfromText turtleDoc mbase txt
emptyState ::
Maybe URI
-> TurtleState
emptyState mbase =
let pmap = M.empty
buri = fromMaybe (getScopedNameURI defaultBase) mbase
in TurtleState
{ graphState = emptyRDFGraph
, prefixUris = pmap
, baseUri = buri
, nodeGen = 0
}
parseAnyfromText ::
TurtleParser a
-> Maybe URI
-> L.Text
-> Either String a
parseAnyfromText parser mbase = runParserWithError parser (emptyState mbase)
newBlankNode :: TurtleParser RDFLabel
newBlankNode = do
n <- stQuery (succ . nodeGen)
stUpdate $ \s -> s { nodeGen = n }
return $ Blank (show n)
comma, semiColon , fullStop :: TurtleParser ()
comma = isymbol ","
semiColon = isymbol ";"
fullStop = isymbol "."
sQuot, dQuot, sQuot3, dQuot3 :: TurtleParser ()
sQuot = ichar '\''
dQuot = ichar '"'
sQuot3 = ignore $ string "'''"
dQuot3 = ignore $ string "\"\"\""
match :: (Ord a) => a -> [(a,a)] -> Bool
match v = any (\(l,h) -> v >= l && v <= h)
br :: Char -> Char -> TurtleParser a -> TurtleParser a
br lsym rsym =
let f = lexeme . char
in bracket (f lsym) (f rsym)
atWord :: T.Text -> TurtleParser ()
atWord s = (char '@' *> lexeme (stringT s)) $> ()
charI ::
Char
-> TurtleParser Char
charI c = satisfy (`elem` c : [ toLower c ])
stringI ::
String
-> TurtleParser String
stringI = mapM charI
addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> TurtleParser ()
addStatement s p o@(TypedLit _ dtype) | dtype `elem` [xsdBoolean, xsdInteger, xsdDecimal, xsdDouble] = do
ost <- stGet
let stmt = arc s p o
oldp = prefixUris ost
ogs = graphState ost
(nspre, nsuri) = getNamespaceTuple $ getScopeNamespace dtype
newp = M.insert nspre nsuri oldp
stUpdate $ \st -> st { prefixUris = newp, graphState = addArc stmt ogs }
addStatement s p o = stUpdate (updateGraph (addArc (arc s p o) ))
isaz, isAZ, isaZ, is09, isaZ09 :: Char -> Bool
isaz = isAsciiLower
isAZ = isAsciiUpper
isaZ c = isaz c || isAZ c
is09 = isDigit
isaZ09 c = isaZ c || is09 c
operatorLabel :: ScopedName -> TurtleParser RDFLabel
operatorLabel snam = do
st <- stGet
let (pkey, pval) = getNamespaceTuple $ getScopeNamespace snam
opmap = prefixUris st
rval = Res snam
case M.lookup pkey opmap of
Just val | val == pval -> return rval
| otherwise -> do
stUpdate $ \s -> s { prefixUris = M.insert pkey pval opmap }
return rval
_ -> do
stUpdate $ \s -> s { prefixUris = M.insert pkey pval opmap }
return rval
findPrefix :: T.Text -> TurtleParser Namespace
findPrefix pre = do
st <- stGet
case M.lookup (Just pre) (prefixUris st) of
Just uri -> return $ makeNamespace (Just pre) uri
Nothing -> failBad $ "Undefined prefix '" ++ T.unpack pre ++ ":'."
(<?) ::
Parser s a
-> String
-> Parser s a
(<?) p m = adjustErr p ((m++"\n")++)
infixl 4 <?
turtleDoc :: TurtleParser RDFGraph
turtleDoc = mkGr <$> (whiteSpace *> many statement *> eof *> stGet)
where
mkGr s = setNamespaces (prefixUris s) (graphState s)
statement :: TurtleParser ()
statement = directive <|> (triples *> commit fullStop <? "Missing '.' after a statement.")
directive :: TurtleParser ()
directive =
lexeme
(prefixID <? "Unable to parse @prefix statement."
<|> base <? "Unable to parse @base statement."
<|> sparqlPrefix <? "Unable to parse Sparql PREFIX statement."
<|> sparqlBase <? "Unable to parse Sparql BASE statement.")
prefixID :: TurtleParser ()
prefixID = do
atWord "prefix"
p <- commit $ lexeme _pnameNS
u <- lexeme _iriRef
fullStop
stUpdate $ setPrefix (fmap L.toStrict p) u
base :: TurtleParser ()
base = do
atWord "base"
b <- commit $ lexeme _iriRef
fullStop
stUpdate $ setBase b
sparqlBase :: TurtleParser ()
sparqlBase = lexeme (stringI "BASE") >> commit _iriRef >>= stUpdate . setBase
sparqlPrefix :: TurtleParser ()
sparqlPrefix = do
ignore $ lexeme $ stringI "PREFIX"
p <- commit $ lexeme _pnameNS
u <- lexeme _iriRef
stUpdate $ setPrefix (fmap L.toStrict p) u
triples :: TurtleParser ()
triples =
(subject >>= predicateObjectList)
<|>
(blankNodePropertyList >>= ignore . optional . predicateObjectList)
predicateObjectList :: RDFLabel -> TurtleParser ()
predicateObjectList subj =
let term = verb >>= objectList subj
in ignore $ sepEndBy1 term (many1 semiColon)
objectList :: RDFLabel -> RDFLabel -> TurtleParser ()
objectList subj prd = sepBy1 object comma >>= mapM_ (addStatement subj prd)
verb :: TurtleParser RDFLabel
verb = predicate <|> (lexeme (char 'a') *> operatorLabel rdfType)
subject :: TurtleParser RDFLabel
subject = (Res <$> iri) <|> blankNode <|> collection
predicate :: TurtleParser RDFLabel
predicate = Res <$> iri
object :: TurtleParser RDFLabel
object = (Res <$> iri) <|> blankNode <|> collection <|>
blankNodePropertyList <|> literal
literal :: TurtleParser RDFLabel
literal = lexeme $ rdfLiteral <|> numericLiteral <|> booleanLiteral
blankNodePropertyList :: TurtleParser RDFLabel
blankNodePropertyList = do
bNode <- newBlankNode
br '[' ']' $ lexeme (predicateObjectList bNode)
return bNode
collection :: TurtleParser RDFLabel
collection = do
os <- br '(' ')' $ many object
eNode <- operatorLabel rdfNil
case os of
[] -> return eNode
(x:xs) -> do
sNode <- newBlankNode
first <- operatorLabel rdfFirst
addStatement sNode first x
lNode <- foldM addElem sNode xs
rest <- operatorLabel rdfRest
addStatement lNode rest eNode
return sNode
where
addElem prevNode curElem = do
bNode <- newBlankNode
first <- operatorLabel rdfFirst
rest <- operatorLabel rdfRest
addStatement prevNode rest bNode
addStatement bNode first curElem
return bNode
numericLiteral :: TurtleParser RDFLabel
numericLiteral =
let f t v = makeDatatypedLiteral t (L.toStrict v)
in (f xsdDouble <$> _double)
<|>
(f xsdDecimal <$> _decimal)
<|>
(f xsdInteger <$> _integer)
rdfLiteral :: TurtleParser RDFLabel
rdfLiteral = do
lbl <- L.toStrict <$> turtleString
opt <- optional ((Left <$> (_langTag <? "Unable to parse the language tag"))
<|>
(string "^^" *> (Right <$> (commit iri <? "Unable to parse the datatype of the literal"))))
ignore $ optional whiteSpace
return $ case opt of
Just (Left lcode) -> LangLit lbl lcode
Just (Right dtype) -> TypedLit lbl dtype
_ -> Lit lbl
booleanLiteral :: TurtleParser RDFLabel
booleanLiteral = makeDatatypedLiteral xsdBoolean . T.pack <$> lexeme (string "true" <|> string "false")
turtleString :: TurtleParser L.Text
turtleString =
lexeme (
_stringLiteralLongQuote <|> _stringLiteralQuote <|>
_stringLiteralLongSingleQuote <|> _stringLiteralSingleQuote
) <? "Unable to parse a string literal"
iri :: TurtleParser ScopedName
iri = lexeme (
(makeURIScopedName <$> _iriRef)
<|>
prefixedName)
prefixedName :: TurtleParser ScopedName
prefixedName =
_pnameLN <|>
flip makeNSScopedName emptyLName <$> (_pnameNS >>= findPrefixNamespace)
blankNode :: TurtleParser RDFLabel
blankNode = lexeme (_blankNodeLabel <|> _anon)
_iriRef :: TurtleParser URI
_iriRef = do
ustr <- bracket (char '<') (commit (char '>')) (many iriRefChar)
case parseURIReference ustr of
Nothing -> failBad $ "Invalid URI: <" ++ ustr ++ ">"
Just uref -> do
s <- stGet
either fail return $ appendURIs (baseUri s) uref
iriRefChar :: TurtleParser Char
iriRefChar = satisfy isIRIChar <|> _uchar
isIRIChar :: Char -> Bool
isIRIChar c =
c > chr 0x20
&&
c `notElem` ("<>\"{}|^`\\"::String)
_pnameNS :: TurtleParser (Maybe L.Text)
_pnameNS = optional _pnPrefix <* char ':'
_pnameLN :: TurtleParser ScopedName
_pnameLN = do
ns <- _pnameNS >>= findPrefixNamespace
l <- fmap L.toStrict _pnLocal
case newLName l of
Just lname -> return $ makeNSScopedName ns lname
_ -> fail $ "Invalid local name: '" ++ T.unpack l ++ "'"
_blankNodeLabel :: TurtleParser RDFLabel
_blankNodeLabel = do
ignore $ string "_:"
fChar <- _pnCharsU <|> satisfy is09
rest <- _pnRest
return $ Blank $ fChar : L.unpack rest
_pnRest :: TurtleParser L.Text
_pnRest = noTrailingDot _pnChars
noTrailing ::
TurtleParser a
-> ([a] -> String)
-> TurtleParser a
-> TurtleParser L.Text
noTrailing dotParser conv parser = do
lbl <- many (parser <|> dotParser)
let (nret, lclean) = clean $ conv lbl
edl = id
snocdl x xs = xs . (x:)
appenddl = (.)
replicatedl n x = (replicate n x ++)
clean :: String -> (Int, String)
clean = go 0 edl
where
go n acc [] = (n, acc [])
go n acc ('.':xs) = go (n+1) acc xs
go 0 acc (x:xs) = go 0 (snocdl x acc) xs
go n acc (x:xs) = go 0 (appenddl acc (snocdl x (replicatedl n '.'))) xs
reparse $ L.replicate (fromIntegral nret) "."
return $ L.pack lclean
noTrailingDot ::
TurtleParser Char
-> TurtleParser L.Text
noTrailingDot = noTrailing (char '.') id
noTrailingDotM ::
TurtleParser L.Text
-> TurtleParser L.Text
noTrailingDotM = noTrailing (char '.' $> ".") (L.unpack . L.concat)
_langTag :: TurtleParser LanguageTag
_langTag = do
ichar '@'
h <- commit $ many1Satisfy isaZ
mt <- optional (L.cons <$> char '-' <*> many1Satisfy isaZ09)
let lbl = L.toStrict $ L.append h $ fromMaybe L.empty mt
case toLangTag lbl of
Just lt -> return lt
_ -> fail ("Invalid language tag: " ++ T.unpack lbl)
_leadingSign :: TurtleParser (Maybe Bool)
_leadingSign = do
ms <- optional (satisfy (`elem` ("+-"::String)))
return $ (=='+') `fmap` ms
addSign :: Maybe Bool -> L.Text -> L.Text
addSign (Just True) t = L.cons '+' t
addSign (Just _) t = L.cons '-' t
addSign _ t = t
_integer :: TurtleParser L.Text
_integer = do
ms <- _leadingSign
rest <- many1Satisfy is09
return $ addSign ms rest
_decimal :: TurtleParser L.Text
_decimal = do
ms <- _leadingSign
leading <- manySatisfy is09
ichar '.'
trailing <- many1Satisfy is09
let ans2 = L.cons '.' trailing
ans = if L.null leading
then ans2
else L.append leading ans2
return $ addSign ms ans
_d1 :: TurtleParser L.Text
_d1 = do
a <- many1Satisfy is09
ichar '.'
b <- manySatisfy is09
return $ a `L.append` ('.' `L.cons` b)
_d2 :: TurtleParser L.Text
_d2 = do
ichar '.'
b <- many1Satisfy is09
return $ '.' `L.cons` b
_d3 :: TurtleParser L.Text
_d3 = many1Satisfy is09
_double :: TurtleParser L.Text
_double = do
ms <- _leadingSign
leading <- _d1 <|> _d2 <|> _d3
e <- _exponent
return $ addSign ms $ leading `L.append` e
_exponent :: TurtleParser L.Text
_exponent = do
e <- char 'e' <|> char 'E'
ms <- _leadingSign
ep <- _integer
return $ L.cons e $ addSign ms ep
_protChar :: TurtleParser Char
_protChar = char '\\' *> (_echar' <|> _uchar')
_exclSLQ, _exclSLSQ :: String
_exclSLQ = map chr [0x22, 0x5c, 0x0a, 0x0d]
_exclSLSQ = map chr [0x27, 0x5c, 0x0a, 0x0d]
_stringLiteralQuote, _stringLiteralSingleQuote :: TurtleParser L.Text
_stringLiteralQuote = _stringIt dQuot (_tChars _exclSLQ)
_stringLiteralSingleQuote = _stringIt sQuot (_tChars _exclSLSQ)
_stringLiteralLongQuote, _stringLiteralLongSingleQuote :: TurtleParser L.Text
_stringLiteralLongQuote = _stringItLong dQuot3 (_tCharsLong '"')
_stringLiteralLongSingleQuote = _stringItLong sQuot3 (_tCharsLong '\'')
_stringIt :: TurtleParser a -> TurtleParser Char -> TurtleParser L.Text
_stringIt sep chars = L.pack <$> bracket sep sep (many chars)
_stringItLong :: TurtleParser a -> TurtleParser L.Text -> TurtleParser L.Text
_stringItLong sep chars = L.concat <$> bracket sep sep (many chars)
_tChars :: String -> TurtleParser Char
_tChars excl = _protChar <|> noneOf excl
oneOrTwo :: Char -> TurtleParser L.Text
oneOrTwo c = do
ignore $ char c
mb <- optional (char c)
case mb of
Just _ -> return $ L.pack [c,c]
_ -> return $ L.singleton c
_multiQuote :: Char -> TurtleParser L.Text
_multiQuote c = do
mq <- optional (oneOrTwo c)
r <- noneOf (c : "\\")
return $ fromMaybe L.empty mq `L.snoc` r
_tCharsLong :: Char -> TurtleParser L.Text
_tCharsLong c =
L.singleton <$> _protChar
<|> _multiQuote c
_uchar :: TurtleParser Char
_uchar = char '\\' >> _uchar'
_uchar' :: TurtleParser Char
_uchar' =
(char 'u' *> (commit hex4 <? "Expected 4 hex characters after \\u"))
<|>
(char 'U' *> (commit hex8 <? "Expected 8 hex characters after \\U"))
_echar' :: TurtleParser Char
_echar' =
(char 't' $> '\t') <|>
(char 'b' $> '\b') <|>
(char 'n' $> '\n') <|>
(char 'r' $> '\r') <|>
(char 'f' $> '\f') <|>
(char '\\' $> '\\') <|>
(char '"' $> '"') <|>
(char '\'' $> '\'')
_ws :: TurtleParser ()
_ws = ignore $ satisfy (`elem` _wsChars)
_wsChars :: String
_wsChars = map chr [0x20, 0x09, 0x0d, 0x0a]
_anon :: TurtleParser RDFLabel
_anon =
br '[' ']' (many _ws) >> newBlankNode
_pnCharsBase :: TurtleParser Char
_pnCharsBase =
let f c = let i = ord c
in isaZ c ||
match i [(0xc0, 0xd6), (0xd8, 0xf6), (0xf8, 0x2ff),
(0x370, 0x37d), (0x37f, 0x1fff), (0x200c, 0x200d),
(0x2070, 0x218f), (0x2c00, 0x2fef), (0x3001, 0xd7ff),
(0xf900, 0xfdcf), (0xfdf0, 0xfffd), (0x10000, 0xeffff)]
in satisfy f
_pnCharsU, _pnChars :: TurtleParser Char
_pnCharsU = _pnCharsBase <|> char '_'
_pnChars =
let f c = let i = ord c
in match i [(0x300, 0x36f), (0x203f, 0x2040)]
in _pnCharsU <|> char '-' <|> satisfy is09 <|>
char (chr 0xb7) <|> satisfy f
_pnPrefix :: TurtleParser L.Text
_pnPrefix = L.cons <$> _pnCharsBase <*> _pnRest
_pnLocal :: TurtleParser L.Text
_pnLocal = do
s <- L.singleton <$> (_pnCharsU <|> char ':' <|> satisfy is09)
<|> _plx
e <- noTrailingDotM (L.singleton <$> (_pnChars <|> char ':') <|> _plx)
return $ s `L.append` e
_plx, _percent :: TurtleParser L.Text
_plx = _percent <|> (L.singleton <$> _pnLocalEsc)
_percent = do
ichar '%'
a <- _hex
b <- _hex
return $ L.cons '%' $ L.cons a $ L.singleton b
_hex, _pnLocalEsc :: TurtleParser Char
_hex = satisfy isHexDigit
_pnLocalEsc = char '\\' *> satisfy (`elem` _pnLocalEscChars)
_pnLocalEscChars :: String
_pnLocalEscChars = "_~.-!$&'()*+,;=/?#@%"