module Swish.RDF.Parser.Turtle
( ParseResult
, parseTurtle
, parseTurtlefromText
)
where
import Swish.GraphClass (arc)
import Swish.Namespace (Namespace, ScopedName)
import Swish.Namespace (makeNamespace, getNamespaceTuple
, getScopeNamespace, getScopedNameURI
, getScopeNamespace, makeURIScopedName, makeNSScopedName)
import Swish.QName (newLName, emptyLName)
import Swish.RDF.Graph
( RDFGraph, RDFLabel(..)
, NamespaceMap
, addArc
, setNamespaces
, emptyRDFGraph
)
import Swish.RDF.Vocabulary
( LanguageTag
, toLangTag
, rdfType
, rdfFirst, rdfRest, rdfNil
, xsdBoolean, xsdInteger, xsdDecimal, xsdDouble
, defaultBase
)
import Swish.RDF.Datatype (makeDatatypedLiteral)
import Swish.RDF.Parser.Utils
( ParseResult
, runParserWithError
, ignore
, noneOf
, char
, ichar
, string
, stringT
, sepEndBy1
, isymbol
, lexeme
, whiteSpace
, hex4
, hex8
, appendURIs
)
import Control.Applicative
import Control.Monad (foldM)
import Data.Char (chr, isAsciiLower, isAsciiUpper, isDigit, isHexDigit, ord, toLower)
import Data.Maybe (fromMaybe)
import Data.Word (Word32)
import Network.URI (URI(..), parseURIReference)
import Network.URI.Ord ()
import Text.ParserCombinators.Poly.StateText
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
data TurtleState = TurtleState
{ graphState :: RDFGraph
, prefixUris :: NamespaceMap
, baseUri :: URI
, nodeGen :: Word32
} deriving Show
setPrefix :: Maybe T.Text -> URI -> TurtleState -> TurtleState
setPrefix pre uri st = st { prefixUris=p' }
where
p' = M.insert pre uri (prefixUris st)
setBase :: URI -> TurtleState -> TurtleState
setBase buri st = st { baseUri = buri }
getDefaultPrefix :: TurtleParser Namespace
getDefaultPrefix = do
s <- stGet
case getPrefixURI s Nothing of
Just uri -> return $ makeNamespace Nothing uri
_ -> failBad "No default prefix defined; how unexpected (probably a programming error)!"
getPrefixURI :: TurtleState -> Maybe T.Text -> Maybe URI
getPrefixURI st pre = M.lookup pre (prefixUris st)
findPrefixNamespace :: Maybe L.Text -> TurtleParser Namespace
findPrefixNamespace (Just p) = findPrefix (L.toStrict p)
findPrefixNamespace Nothing = getDefaultPrefix
updateGraph :: (RDFGraph -> RDFGraph) -> TurtleState -> TurtleState
updateGraph f s = s { graphState = f (graphState s) }
type TurtleParser a = Parser TurtleState a
parseTurtlefromText ::
L.Text
-> ParseResult
parseTurtlefromText = flip parseTurtle Nothing
parseTurtle ::
L.Text
-> Maybe URI
-> ParseResult
parseTurtle txt mbase = parseAnyfromText turtleDoc mbase txt
emptyState ::
Maybe URI
-> TurtleState
emptyState mbase =
let pmap = M.empty
buri = fromMaybe (getScopedNameURI defaultBase) mbase
in TurtleState
{ graphState = emptyRDFGraph
, prefixUris = pmap
, baseUri = buri
, nodeGen = 0
}
parseAnyfromText ::
TurtleParser a
-> Maybe URI
-> L.Text
-> Either String a
parseAnyfromText parser mbase = runParserWithError parser (emptyState mbase)
newBlankNode :: TurtleParser RDFLabel
newBlankNode = do
n <- stQuery (succ . nodeGen)
stUpdate $ \s -> s { nodeGen = n }
return $ Blank (show n)
comma, semiColon , fullStop :: TurtleParser ()
comma = isymbol ","
semiColon = isymbol ";"
fullStop = isymbol "."
sQuot, dQuot, sQuot3, dQuot3 :: TurtleParser ()
sQuot = ichar '\''
dQuot = ichar '"'
sQuot3 = ignore $ string "'''"
dQuot3 = ignore $ string "\"\"\""
match :: (Ord a) => a -> [(a,a)] -> Bool
match v = any (\(l,h) -> v >= l && v <= h)
br :: Char -> Char -> TurtleParser a -> TurtleParser a
br lsym rsym =
let f = lexeme . char
in bracket (f lsym) (f rsym)
atWord :: T.Text -> TurtleParser ()
atWord s = char '@' *> lexeme (stringT s) *> pure ()
charI ::
Char
-> TurtleParser Char
charI c = satisfy (`elem` c : [ toLower c ])
stringI ::
String
-> TurtleParser String
stringI = mapM charI
addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> TurtleParser ()
addStatement s p o@(TypedLit _ dtype) | dtype `elem` [xsdBoolean, xsdInteger, xsdDecimal, xsdDouble] = do
ost <- stGet
let stmt = arc s p o
oldp = prefixUris ost
ogs = graphState ost
(nspre, nsuri) = getNamespaceTuple $ getScopeNamespace dtype
newp = M.insert nspre nsuri oldp
stUpdate $ \st -> st { prefixUris = newp, graphState = addArc stmt ogs }
addStatement s p o = stUpdate (updateGraph (addArc (arc s p o) ))
isaz, isAZ, isaZ, is09, isaZ09 :: Char -> Bool
isaz = isAsciiLower
isAZ = isAsciiUpper
isaZ c = isaz c || isAZ c
is09 = isDigit
isaZ09 c = isaZ c || is09 c
operatorLabel :: ScopedName -> TurtleParser RDFLabel
operatorLabel snam = do
st <- stGet
let (pkey, pval) = getNamespaceTuple $ getScopeNamespace snam
opmap = prefixUris st
rval = Res snam
case M.lookup pkey opmap of
Just val | val == pval -> return rval
| otherwise -> do
stUpdate $ \s -> s { prefixUris = M.insert pkey pval opmap }
return rval
_ -> do
stUpdate $ \s -> s { prefixUris = M.insert pkey pval opmap }
return rval
findPrefix :: T.Text -> TurtleParser Namespace
findPrefix pre = do
st <- stGet
case M.lookup (Just pre) (prefixUris st) of
Just uri -> return $ makeNamespace (Just pre) uri
Nothing -> failBad $ "Undefined prefix '" ++ T.unpack pre ++ ":'."
(<?) ::
Parser s a
-> String
-> Parser s a
(<?) p m = adjustErr p ((m++"\n")++)
infixl 4 <?
turtleDoc :: TurtleParser RDFGraph
turtleDoc = mkGr <$> (whiteSpace *> many statement *> eof *> stGet)
where
mkGr s = setNamespaces (prefixUris s) (graphState s)
statement :: TurtleParser ()
statement = directive <|> (triples *> commit fullStop <? "Missing '.' after a statement.")
directive :: TurtleParser ()
directive =
lexeme
(prefixID <? "Unable to parse @prefix statement."
<|> base <? "Unable to parse @base statement."
<|> sparqlPrefix <? "Unable to parse Sparql PREFIX statement."
<|> sparqlBase <? "Unable to parse Sparql BASE statement.")
prefixID :: TurtleParser ()
prefixID = do
atWord "prefix"
p <- commit $ lexeme _pnameNS
u <- lexeme _iriRef
fullStop
stUpdate $ setPrefix (fmap L.toStrict p) u
base :: TurtleParser ()
base = do
atWord "base"
b <- commit $ lexeme _iriRef
fullStop
stUpdate $ setBase b
sparqlBase :: TurtleParser ()
sparqlBase = lexeme (stringI "BASE") >> commit _iriRef >>= stUpdate . setBase
sparqlPrefix :: TurtleParser ()
sparqlPrefix = do
ignore $ lexeme $ stringI "PREFIX"
p <- commit $ lexeme _pnameNS
u <- lexeme _iriRef
stUpdate $ setPrefix (fmap L.toStrict p) u
triples :: TurtleParser ()
triples =
(subject >>= predicateObjectList)
<|>
(blankNodePropertyList >>= ignore . optional . predicateObjectList)
predicateObjectList :: RDFLabel -> TurtleParser ()
predicateObjectList subj =
let term = verb >>= objectList subj
in ignore $ sepEndBy1 term (many1 semiColon)
objectList :: RDFLabel -> RDFLabel -> TurtleParser ()
objectList subj prd = sepBy1 object comma >>= mapM_ (addStatement subj prd)
verb :: TurtleParser RDFLabel
verb = predicate <|> (lexeme (char 'a') *> operatorLabel rdfType)
subject :: TurtleParser RDFLabel
subject = (Res <$> iri) <|> blankNode <|> collection
predicate :: TurtleParser RDFLabel
predicate = Res <$> iri
object :: TurtleParser RDFLabel
object = (Res <$> iri) <|> blankNode <|> collection <|>
blankNodePropertyList <|> literal
literal :: TurtleParser RDFLabel
literal = lexeme $ rdfLiteral <|> numericLiteral <|> booleanLiteral
blankNodePropertyList :: TurtleParser RDFLabel
blankNodePropertyList = do
bNode <- newBlankNode
br '[' ']' $ lexeme (predicateObjectList bNode)
return bNode
collection :: TurtleParser RDFLabel
collection = do
os <- br '(' ')' $ many object
eNode <- operatorLabel rdfNil
case os of
[] -> return eNode
(x:xs) -> do
sNode <- newBlankNode
first <- operatorLabel rdfFirst
addStatement sNode first x
lNode <- foldM addElem sNode xs
rest <- operatorLabel rdfRest
addStatement lNode rest eNode
return sNode
where
addElem prevNode curElem = do
bNode <- newBlankNode
first <- operatorLabel rdfFirst
rest <- operatorLabel rdfRest
addStatement prevNode rest bNode
addStatement bNode first curElem
return bNode
numericLiteral :: TurtleParser RDFLabel
numericLiteral =
let f t v = makeDatatypedLiteral t (L.toStrict v)
in (f xsdDouble <$> _double)
<|>
(f xsdDecimal <$> _decimal)
<|>
(f xsdInteger <$> _integer)
rdfLiteral :: TurtleParser RDFLabel
rdfLiteral = do
lbl <- L.toStrict <$> turtleString
opt <- optional ((Left <$> (_langTag <? "Unable to parse the language tag"))
<|>
(string "^^" *> (Right <$> (commit iri <? "Unable to parse the datatype of the literal"))))
ignore $ optional whiteSpace
return $ case opt of
Just (Left lcode) -> LangLit lbl lcode
Just (Right dtype) -> TypedLit lbl dtype
_ -> Lit lbl
booleanLiteral :: TurtleParser RDFLabel
booleanLiteral = makeDatatypedLiteral xsdBoolean . T.pack <$> lexeme (string "true" <|> string "false")
turtleString :: TurtleParser L.Text
turtleString =
lexeme (
_stringLiteralLongQuote <|> _stringLiteralQuote <|>
_stringLiteralLongSingleQuote <|> _stringLiteralSingleQuote
) <? "Unable to parse a string literal"
iri :: TurtleParser ScopedName
iri = lexeme (
(makeURIScopedName <$> _iriRef)
<|>
prefixedName)
prefixedName :: TurtleParser ScopedName
prefixedName =
_pnameLN <|>
flip makeNSScopedName emptyLName <$> (_pnameNS >>= findPrefixNamespace)
blankNode :: TurtleParser RDFLabel
blankNode = lexeme (_blankNodeLabel <|> _anon)
_iriRef :: TurtleParser URI
_iriRef = do
ustr <- bracket (char '<') (commit (char '>')) (many iriRefChar)
case parseURIReference ustr of
Nothing -> failBad $ "Invalid URI: <" ++ ustr ++ ">"
Just uref -> do
s <- stGet
either fail return $ appendURIs (baseUri s) uref
iriRefChar :: TurtleParser Char
iriRefChar = satisfy isIRIChar <|> _uchar
isIRIChar :: Char -> Bool
isIRIChar c =
c > chr 0x20
&&
c `notElem` "<>\"{}|^`\\"
_pnameNS :: TurtleParser (Maybe L.Text)
_pnameNS = optional _pnPrefix <* char ':'
_pnameLN :: TurtleParser ScopedName
_pnameLN = do
ns <- _pnameNS >>= findPrefixNamespace
l <- fmap L.toStrict _pnLocal
case newLName l of
Just lname -> return $ makeNSScopedName ns lname
_ -> fail $ "Invalid local name: '" ++ T.unpack l ++ "'"
_blankNodeLabel :: TurtleParser RDFLabel
_blankNodeLabel = do
ignore $ string "_:"
fChar <- _pnCharsU <|> satisfy is09
rest <- _pnRest
return $ Blank $ fChar : L.unpack rest
_pnRest :: TurtleParser L.Text
_pnRest = noTrailingDot _pnChars
noTrailing ::
TurtleParser a
-> ([a] -> String)
-> TurtleParser a
-> TurtleParser L.Text
noTrailing dotParser conv parser = do
lbl <- many (parser <|> dotParser)
let (nret, lclean) = clean $ conv lbl
edl = id
snocdl x xs = xs . (x:)
appenddl = (.)
replicatedl n x = (replicate n x ++)
clean :: String -> (Int, String)
clean = go 0 edl
where
go n acc [] = (n, acc [])
go n acc ('.':xs) = go (n+1) acc xs
go 0 acc (x:xs) = go 0 (snocdl x acc) xs
go n acc (x:xs) = go 0 (appenddl acc (snocdl x (replicatedl n '.'))) xs
reparse $ L.replicate (fromIntegral nret) "."
return $ L.pack lclean
noTrailingDot ::
TurtleParser Char
-> TurtleParser L.Text
noTrailingDot = noTrailing (char '.') id
noTrailingDotM ::
TurtleParser L.Text
-> TurtleParser L.Text
noTrailingDotM = noTrailing (char '.' *> pure ".") (L.unpack . L.concat)
_langTag :: TurtleParser LanguageTag
_langTag = do
ichar '@'
h <- commit $ many1Satisfy isaZ
mt <- optional (L.cons <$> char '-' <*> many1Satisfy isaZ09)
let lbl = L.toStrict $ L.append h $ fromMaybe L.empty mt
case toLangTag lbl of
Just lt -> return lt
_ -> fail ("Invalid language tag: " ++ T.unpack lbl)
_leadingSign :: TurtleParser (Maybe Bool)
_leadingSign = do
ms <- optional (satisfy (`elem` "+-"))
return $ (=='+') `fmap` ms
addSign :: Maybe Bool -> L.Text -> L.Text
addSign (Just True) t = L.cons '+' t
addSign (Just _) t = L.cons '-' t
addSign _ t = t
_integer :: TurtleParser L.Text
_integer = do
ms <- _leadingSign
rest <- many1Satisfy is09
return $ addSign ms rest
_decimal :: TurtleParser L.Text
_decimal = do
ms <- _leadingSign
leading <- manySatisfy is09
ichar '.'
trailing <- many1Satisfy is09
let ans2 = L.cons '.' trailing
ans = if L.null leading
then ans2
else L.append leading ans2
return $ addSign ms ans
_d1 :: TurtleParser L.Text
_d1 = do
a <- many1Satisfy is09
ichar '.'
b <- manySatisfy is09
return $ a `L.append` ('.' `L.cons` b)
_d2 :: TurtleParser L.Text
_d2 = do
ichar '.'
b <- many1Satisfy is09
return $ '.' `L.cons` b
_d3 :: TurtleParser L.Text
_d3 = many1Satisfy is09
_double :: TurtleParser L.Text
_double = do
ms <- _leadingSign
leading <- _d1 <|> _d2 <|> _d3
e <- _exponent
return $ addSign ms $ leading `L.append` e
_exponent :: TurtleParser L.Text
_exponent = do
e <- char 'e' <|> char 'E'
ms <- _leadingSign
ep <- _integer
return $ L.cons e $ addSign ms ep
_protChar :: TurtleParser Char
_protChar = char '\\' *> (_echar' <|> _uchar')
_exclSLQ, _exclSLSQ :: String
_exclSLQ = map chr [0x22, 0x5c, 0x0a, 0x0d]
_exclSLSQ = map chr [0x27, 0x5c, 0x0a, 0x0d]
_stringLiteralQuote, _stringLiteralSingleQuote :: TurtleParser L.Text
_stringLiteralQuote = _stringIt dQuot (_tChars _exclSLQ)
_stringLiteralSingleQuote = _stringIt sQuot (_tChars _exclSLSQ)
_stringLiteralLongQuote, _stringLiteralLongSingleQuote :: TurtleParser L.Text
_stringLiteralLongQuote = _stringItLong dQuot3 (_tCharsLong '"')
_stringLiteralLongSingleQuote = _stringItLong sQuot3 (_tCharsLong '\'')
_stringIt :: TurtleParser a -> TurtleParser Char -> TurtleParser L.Text
_stringIt sep chars = L.pack <$> bracket sep sep (many chars)
_stringItLong :: TurtleParser a -> TurtleParser L.Text -> TurtleParser L.Text
_stringItLong sep chars = L.concat <$> bracket sep sep (many chars)
_tChars :: String -> TurtleParser Char
_tChars excl = _protChar <|> noneOf excl
oneOrTwo :: Char -> TurtleParser L.Text
oneOrTwo c = do
ignore $ char c
mb <- optional (char c)
case mb of
Just _ -> return $ L.pack [c,c]
_ -> return $ L.singleton c
_multiQuote :: Char -> TurtleParser L.Text
_multiQuote c = do
mq <- optional (oneOrTwo c)
r <- noneOf (c : "\\")
return $ fromMaybe L.empty mq `L.snoc` r
_tCharsLong :: Char -> TurtleParser L.Text
_tCharsLong c =
L.singleton <$> _protChar
<|> _multiQuote c
_uchar :: TurtleParser Char
_uchar = char '\\' >> _uchar'
_uchar' :: TurtleParser Char
_uchar' =
(char 'u' *> (commit hex4 <? "Expected 4 hex characters after \\u"))
<|>
(char 'U' *> (commit hex8 <? "Expected 8 hex characters after \\U"))
_echar' :: TurtleParser Char
_echar' =
(char 't' *> pure '\t') <|>
(char 'b' *> pure '\b') <|>
(char 'n' *> pure '\n') <|>
(char 'r' *> pure '\r') <|>
(char 'f' *> pure '\f') <|>
(char '\\' *> pure '\\') <|>
(char '"' *> pure '"') <|>
(char '\'' *> pure '\'')
_ws :: TurtleParser ()
_ws = ignore $ satisfy (`elem` _wsChars)
_wsChars :: String
_wsChars = map chr [0x20, 0x09, 0x0d, 0x0a]
_anon :: TurtleParser RDFLabel
_anon =
br '[' ']' (many _ws) >> newBlankNode
_pnCharsBase :: TurtleParser Char
_pnCharsBase =
let f c = let i = ord c
in isaZ c ||
match i [(0xc0, 0xd6), (0xd8, 0xf6), (0xf8, 0x2ff),
(0x370, 0x37d), (0x37f, 0x1fff), (0x200c, 0x200d),
(0x2070, 0x218f), (0x2c00, 0x2fef), (0x3001, 0xd7ff),
(0xf900, 0xfdcf), (0xfdf0, 0xfffd), (0x10000, 0xeffff)]
in satisfy f
_pnCharsU, _pnChars :: TurtleParser Char
_pnCharsU = _pnCharsBase <|> char '_'
_pnChars =
let f c = let i = ord c
in match i [(0x300, 0x36f), (0x203f, 0x2040)]
in _pnCharsU <|> char '-' <|> satisfy is09 <|>
char (chr 0xb7) <|> satisfy f
_pnPrefix :: TurtleParser L.Text
_pnPrefix = L.cons <$> _pnCharsBase <*> _pnRest
_pnLocal :: TurtleParser L.Text
_pnLocal = do
s <- L.singleton <$> (_pnCharsU <|> char ':' <|> satisfy is09)
<|> _plx
e <- noTrailingDotM (L.singleton <$> (_pnChars <|> char ':') <|> _plx)
return $ s `L.append` e
_plx, _percent :: TurtleParser L.Text
_plx = _percent <|> (L.singleton <$> _pnLocalEsc)
_percent = do
ichar '%'
a <- _hex
b <- _hex
return $ L.cons '%' $ L.cons a $ L.singleton b
_hex, _pnLocalEsc :: TurtleParser Char
_hex = satisfy isHexDigit
_pnLocalEsc = char '\\' *> satisfy (`elem` _pnLocalEscChars)
_pnLocalEscChars :: String
_pnLocalEscChars = "_~.-!$&'()*+,;=/?#@%"