{-# LANGUAGE CPP #-}

--------------------------------------------------------------------------------
--  See end of this file for licence information.
--------------------------------------------------------------------------------

-- |
--  Module      :  NTriples
--  Copyright   :  (c) 2011, 2012, 2013, 2018, 2021 Douglas Burke
--  License     :  GPL V2
--
--  Maintainer  :  Douglas Burke
--  Stability   :  experimental
--  Portability :  CPP
--
--  This Module implements a NTriples parser, returning a
--  new 'RDFGraph' consisting of triples and namespace information parsed from
--  the supplied NTriples input string, or an error indication.
--
-- REFERENCES:
--
--  - \"RDF Test Cases\",
--    W3C Recommendation 10 February 2004,
--    <http://www.w3.org/TR/rdf-testcases/#ntriples>
--
-- NOTES:
--
--  - If the URI is actually an IRI (Internationalized Resource Identifiers)
--    then the parser will fail since 'Network.URI.parseURI' fails.
--
--  - The case of language tags is retained.
--
--  - Update to the document \"N-Triples. A line-based syntax for an RDF graph\"
--    W3C Working Group Note 09 April 2013,
--    <http://www.w3.org/TR/2013/NOTE-n-triples-20130409/>
--
--------------------------------------------------------------------------------

module Swish.RDF.Parser.NTriples
    ( ParseResult
    , parseNT      
    )
where

import Swish.GraphClass (arc)
import Swish.Namespace (ScopedName, makeURIScopedName)

import Swish.RDF.Graph (RDFGraph, RDFLabel(..), addArc, emptyRDFGraph)
import Swish.RDF.Vocabulary (LanguageTag, toLangTag)

import Swish.RDF.Parser.Utils (ParseResult
    , runParserWithError
    , ignore
    , skipMany
    , noneOf
    , char
    , string
    , eoln
    , fullStop
    , hex4
    , hex8
    )
  
import Control.Applicative

import Network.URI (parseURI)

import qualified Data.Text as T
import qualified Data.Text.Lazy as L

import Data.Char (isAsciiLower, isAsciiUpper, isDigit, ord)

#if MIN_VERSION_base(4, 7, 0)
import Data.Functor (($>))
#endif

import Data.Maybe (fromMaybe)

import Text.ParserCombinators.Poly.StateText

#if !MIN_VERSION_base(4, 7, 0)
($>) :: Functor f => f a -> b -> f b
($>) = flip (<$)
#endif

----------------------------------------------------------------------
-- Define parser state and helper functions
----------------------------------------------------------------------

-- | NT parser state
data NTState = NTState
        { NTState -> RDFGraph
graphState :: RDFGraph            -- Graph under construction
        }

emptyState :: NTState
emptyState :: NTState
emptyState = NTState { graphState :: RDFGraph
graphState = RDFGraph
emptyRDFGraph }
           
--  Return function to update graph in NT parser state,
--  using the supplied function of a graph. This is for use
--  with stUpdate.
--
updateGraph :: (RDFGraph -> RDFGraph) -> NTState -> NTState
updateGraph :: (RDFGraph -> RDFGraph) -> NTState -> NTState
updateGraph RDFGraph -> RDFGraph
f NTState
s = NTState
s { graphState :: RDFGraph
graphState = RDFGraph -> RDFGraph
f (NTState -> RDFGraph
graphState NTState
s) }

----------------------------------------------------------------------
--  Define top-level parser function:
--  accepts a string and returns a graph or error
----------------------------------------------------------------------

-- | Parser that carries around a NTState record.
type NTParser a = Parser NTState a

-- | Parse a string.
-- 
parseNT ::
  L.Text -- ^ input in NTriples format.
  -> ParseResult
parseNT :: Text -> ParseResult
parseNT = forall a. NTParser a -> Text -> Either String a
parsefromText NTParser RDFGraph
ntripleDoc

{-
-- useful for testing
test :: String -> RDFGraph
test = either error id . parseNT
-}

-- | Function to supply initial context and parse supplied term.
--
--  Used for debugging.
parsefromText :: 
    NTParser a      -- ^ parser to apply
    -> L.Text       -- ^ input to be parsed
    -> Either String a
parsefromText :: forall a. NTParser a -> Text -> Either String a
parsefromText NTParser a
parser = forall a b. Parser a b -> a -> Text -> Either String b
runParserWithError NTParser a
parser NTState
emptyState

-- helper routines

{-
lineFeed :: NTParser ()
lineFeed = ignore (char '\r')
-}

-- Add statement to graph in NT parser state

addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> NTParser ()
addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> NTParser ()
addStatement RDFLabel
s RDFLabel
p RDFLabel
o = forall s. (s -> s) -> Parser s ()
stUpdate ((RDFGraph -> RDFGraph) -> NTState -> NTState
updateGraph (forall lb. Label lb => Arc lb -> NSGraph lb -> NSGraph lb
addArc (forall lb. lb -> lb -> lb -> Arc lb
arc RDFLabel
s RDFLabel
p RDFLabel
o) ))

----------------------------------------------------------------------
--  Syntax productions
----------------------------------------------------------------------

{-

EBNF from the specification, using the notation from XML 1.0, second edition,
is included inline below.

We do not force ASCII 7-bit semantics here yet.

space	::=	#x20 /* US-ASCII space - decimal 32 */	
cr	::=	#xD /* US-ASCII carriage return - decimal 13 */	
lf	::=	#xA /* US-ASCII line feed - decimal 10 */	
tab	::=	#x9 /* US-ASCII horizontal tab - decimal 9 */	

The productions are kept as close as possible to the specification
for now.

-}

{-
ntripleDoc	::=	line*	
line	::=	ws* ( comment | triple )? eoln	

We relax the rule that the input must be empty or end with a new line.

ntripleDoc :: NTParser RDFGraph
ntripleDoc = graphState <$> (many line *> eof *> getState)

line :: NTParser ()
line = skipMany ws *> optional (comment <|> triple) *> eoln
-}

ntripleDoc :: NTParser RDFGraph
ntripleDoc :: NTParser RDFGraph
ntripleDoc = NTState -> RDFGraph
graphState forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (forall (p :: * -> *) a sep. PolyParse p => p a -> p sep -> p [a]
sepBy NTParser ()
line forall s. Parser s ()
eoln forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional forall s. Parser s ()
eoln forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> NTParser ()
skipWS forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall s. Parser s ()
eof forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall s. Parser s s
stGet)

line :: NTParser ()
line :: NTParser ()
line = NTParser ()
skipWS forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (f :: * -> *) a. Applicative f => f a -> f ()
ignore (forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional (NTParser ()
comment forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> NTParser ()
triple))

{-
ws	::=	space | tab	

Could use whiteSpace rule here, but that would permit
constructs (e.g. comments) where we do not support them.
-}

isWS :: Char -> Bool
isWS :: Char -> Bool
isWS = (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` (String
" \t" :: String))

{-
ws :: NTParser ()
-- ws = ignore (char ' ' <|> tab)
ws = ignore $ satisfy isWS
-}
           
skipWS :: NTParser ()
skipWS :: NTParser ()
skipWS = forall (f :: * -> *) a. Applicative f => f a -> f ()
ignore forall a b. (a -> b) -> a -> b
$ forall s. (Char -> Bool) -> Parser s Text
manySatisfy Char -> Bool
isWS

skip1WS :: NTParser ()
skip1WS :: NTParser ()
skip1WS = forall (f :: * -> *) a. Applicative f => f a -> f ()
ignore forall a b. (a -> b) -> a -> b
$ forall s. (Char -> Bool) -> Parser s Text
many1Satisfy Char -> Bool
isWS

{-
comment	::=	'#' ( character - ( cr | lf ) )*	
-}

comment :: NTParser ()
comment :: NTParser ()
comment = forall s. Char -> Parser s Char
char Char
'#' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall s a. Parser s a -> Parser s ()
skipMany (forall s. String -> Parser s Char
noneOf String
"\r\n")

{-
eoln	::=	cr | lf | cr lf	
-}

{-
name	::=	[A-Za-z][A-Za-z0-9]*	
-}

isaz, isAZ, is09 :: Char -> Bool
isaz :: Char -> Bool
isaz = Char -> Bool
isAsciiLower
isAZ :: Char -> Bool
isAZ = Char -> Bool
isAsciiUpper
is09 :: Char -> Bool
is09 = Char -> Bool
isDigit

isaZ, isaZ09 :: Char -> Bool
isaZ :: Char -> Bool
isaZ Char
c = Char -> Bool
isaz Char
c Bool -> Bool -> Bool
|| Char -> Bool
isAZ Char
c
isaZ09 :: Char -> Bool
isaZ09 Char
c = Char -> Bool
isaZ Char
c Bool -> Bool -> Bool
|| Char -> Bool
is09 Char
c

isHeadChar, isBodyChar :: Char -> Bool
isHeadChar :: Char -> Bool
isHeadChar = Char -> Bool
isaZ
isBodyChar :: Char -> Bool
isBodyChar = Char -> Bool
isaZ09

name :: NTParser L.Text
name :: NTParser Text
name = Char -> Text -> Text
L.cons forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall s. (Char -> Bool) -> Parser s Char
satisfy Char -> Bool
isHeadChar forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> forall s. (Char -> Bool) -> Parser s Text
manySatisfy Char -> Bool
isBodyChar

nameStr :: NTParser String
nameStr :: NTParser String
nameStr = Text -> String
L.unpack forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> NTParser Text
name

{-
triple	::=	subject ws+ predicate ws+ object ws* '.' ws*	

-}

triple :: NTParser ()
triple :: NTParser ()
triple = 
  do
    RDFLabel
s <- Parser NTState RDFLabel
subject forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* NTParser ()
skip1WS
    RDFLabel
p <- Parser NTState RDFLabel
predicate forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* NTParser ()
skip1WS
    RDFLabel
o <- Parser NTState RDFLabel
object forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* (NTParser ()
skipWS forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall s. Parser s ()
fullStop forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> NTParser ()
skipWS)
    RDFLabel -> RDFLabel -> RDFLabel -> NTParser ()
addStatement RDFLabel
s RDFLabel
p RDFLabel
o

{-
subject	::=	uriref | nodeID	
predicate	::=	uriref	
object	::=	uriref | nodeID | literal	
-}

subject :: NTParser RDFLabel
subject :: Parser NTState RDFLabel
subject = Parser NTState RDFLabel
urirefLbl forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser NTState RDFLabel
nodeID

predicate :: NTParser RDFLabel
predicate :: Parser NTState RDFLabel
predicate = Parser NTState RDFLabel
urirefLbl

object :: NTParser RDFLabel
object :: Parser NTState RDFLabel
object = Parser NTState RDFLabel
urirefLbl forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser NTState RDFLabel
nodeID forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser NTState RDFLabel
literal

{-
uriref	::=	'<' absoluteURI '>'	
absoluteURI	::=	character+ with escapes as defined below (from section 'URI References')	

The absoluteURI production encodes a Unicode string representing an RDF URI references as specified in
[RDF-CONCEPTS]. These are encoded in N-Triples using the escapes described in section Strings.

-}

uriref :: NTParser ScopedName
uriref :: NTParser ScopedName
uriref = do
  forall (f :: * -> *) a. Applicative f => f a -> f ()
ignore forall a b. (a -> b) -> a -> b
$ forall s. Char -> Parser s Char
char Char
'<'
  String
uri <- forall (p :: * -> *) a z.
(PolyParse p, Show a) =>
p a -> p z -> p [a]
manyFinally' NTParser Char
character (forall s. Char -> Parser s Char
char Char
'>')
  forall b a. b -> (a -> b) -> Maybe a -> b
maybe (forall (p :: * -> *) a. PolyParse p => String -> p a
failBad (String
"Invalid URI: <" forall a. [a] -> [a] -> [a]
++ String
uri forall a. [a] -> [a] -> [a]
++ String
">"))
    (forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. URI -> ScopedName
makeURIScopedName)
    (String -> Maybe URI
parseURI String
uri)

urirefLbl :: NTParser RDFLabel
urirefLbl :: Parser NTState RDFLabel
urirefLbl = ScopedName -> RDFLabel
Res forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> NTParser ScopedName
uriref

{-
nodeID	::=	'_:' name	
-}

nodeID :: NTParser RDFLabel
nodeID :: Parser NTState RDFLabel
nodeID = String -> RDFLabel
Blank forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (forall s. String -> Parser s String
string String
"_:" forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> NTParser String
nameStr)

{-  
literal	::=	langString | datatypeString	
langString	::=	'"' string '"' ( '@' language )?	
datatypeString	::=	'"' string '"' '^^' uriref	
language	::=	[a-z]+ ('-' [a-z0-9]+ )*
encoding a language tag.	
string	::=	character* with escapes as defined in section Strings	

-}

literal :: NTParser RDFLabel
literal :: Parser NTState RDFLabel
literal = do
    Text
lit <- String -> Text
T.pack forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> NTParser String
ntstring
    Maybe (Either LanguageTag ScopedName)
opt <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional NTParser (Either LanguageTag ScopedName)
dtlang
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case Maybe (Either LanguageTag ScopedName)
opt of
               Just (Left LanguageTag
lcode)  -> Text -> LanguageTag -> RDFLabel
LangLit Text
lit LanguageTag
lcode
               Just (Right ScopedName
dtype) -> Text -> ScopedName -> RDFLabel
TypedLit Text
lit ScopedName
dtype
               Maybe (Either LanguageTag ScopedName)
_                  -> Text -> RDFLabel
Lit Text
lit

ntstring :: NTParser String
ntstring :: NTParser String
ntstring = forall (p :: * -> *) bra ket a.
PolyParse p =>
p bra -> p ket -> p a -> p a
bracket (forall s. Char -> Parser s Char
char Char
'"') (forall s. Char -> Parser s Char
char Char
'"') (forall (f :: * -> *) a. Alternative f => f a -> f [a]
many NTParser Char
character)

dtlang :: NTParser (Either LanguageTag ScopedName)
dtlang :: NTParser (Either LanguageTag ScopedName)
dtlang = 
    (forall s. Char -> Parser s Char
char Char
'@' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (p :: * -> *) a. Commitment p => p a -> p a
commit (forall a b. a -> Either a b
Left forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> NTParser LanguageTag
language))
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. String -> Parser s String
string String
"^^" forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (p :: * -> *) a. Commitment p => p a -> p a
commit (forall a b. b -> Either a b
Right forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> NTParser ScopedName
uriref))

-- Note that toLangTag may fail since it does some extra
-- validation not done by the parser (mainly on the length of the
-- primary and secondary tags).
--
-- NOTE: This parser does not accept multiple secondary tags which RFC3066
-- does.
--
-- Although the EBNF only lists [a-z] we also support upper case values,
-- since the W3C Turtle test case includes a NTriples file with
-- "...@en-UK" in it.
--
language :: NTParser LanguageTag
language :: NTParser LanguageTag
language = do
    Text
h <- forall s. (Char -> Bool) -> Parser s Text
many1Satisfy Char -> Bool
isaZ
    Maybe Text
mt <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional forall a b. (a -> b) -> a -> b
$ Char -> Text -> Text
L.cons forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall s. Char -> Parser s Char
char Char
'-' forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> forall s. (Char -> Bool) -> Parser s Text
many1Satisfy Char -> Bool
isaZ09
    let lbl :: Text
lbl = Text -> Text
L.toStrict forall a b. (a -> b) -> a -> b
$ Text -> Text -> Text
L.append Text
h forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a -> a
fromMaybe Text
L.empty Maybe Text
mt
    case Text -> Maybe LanguageTag
toLangTag Text
lbl of
        Just LanguageTag
lt -> forall (m :: * -> *) a. Monad m => a -> m a
return LanguageTag
lt
        Maybe LanguageTag
_ -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String
"Invalid language tag: " forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack Text
lbl) -- should this be failBad?

{-
String handling: 

EBNF has:

character	::=	[#x20-#x7E] /* US-ASCII space to decimal 126 */	

Additional information from:

  http://www.w3.org/TR/rdf-testcases/#ntrip_strings

N-Triples strings are sequences of US-ASCII character productions encoding [UNICODE] character strings. The characters outside the US-ASCII range and some other specific characters are made available by \-escape sequences as follows:

 Unicode character
 (with code point u)	N-Triples encoding
 [#x0-#x8]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 #x9	\t
 #xA	\n
 [#xB-#xC]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 #xD	\r
 [#xE-#x1F]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 [#x20-#x21]	the character u
 #x22	\"
 [#x23-#x5B]	the character u
 #x5C	\\
 [#x5D-#x7E]	the character u
 [#x7F-#xFFFF]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 [#10000-#x10FFFF]	\UHHHHHHHH
 8 required hexadecimal digits HHHHHHHH encoding Unicode character u
 where H is a hexadecimal digit: [#x30-#x39],[#x41-#x46] (0-9, uppercase A-F).

This escaping satisfies the [CHARMOD] section Reference Processing Model on making the full Unicode character range U+0 to U+10FFFF available to applications and providing only one way to escape any character.

-}

-- 0x22 is " and 0x5c is \

isAsciiChar :: Char -> Bool
isAsciiChar :: Char -> Bool
isAsciiChar Char
c = let i :: Int
i = Char -> Int
ord Char
c
                in Int
i forall a. Ord a => a -> a -> Bool
>= Int
0x20 Bool -> Bool -> Bool
&& Int
i forall a. Ord a => a -> a -> Bool
<= Int
0x21 Bool -> Bool -> Bool
||
                   Int
i forall a. Ord a => a -> a -> Bool
>= Int
0x23 Bool -> Bool -> Bool
&& Int
i forall a. Ord a => a -> a -> Bool
<= Int
0x5b Bool -> Bool -> Bool
||
                   Int
i forall a. Ord a => a -> a -> Bool
>= Int
0x5d Bool -> Bool -> Bool
&& Int
i forall a. Ord a => a -> a -> Bool
<= Int
0x7e

protectedChar :: NTParser Char
protectedChar :: NTParser Char
protectedChar =
  (forall s. Char -> Parser s Char
char Char
't' forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Char
'\t')
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. Char -> Parser s Char
char Char
'n' forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Char
'\n')
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. Char -> Parser s Char
char Char
'r' forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Char
'\r')
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. Char -> Parser s Char
char Char
'"' forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Char
'"')
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. Char -> Parser s Char
char Char
'\\' forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Char
'\\')
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. Char -> Parser s Char
char Char
'u' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall a. Parser a Char
hex4)
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall s. Char -> Parser s Char
char Char
'U' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall a. Parser a Char
hex8)

character :: NTParser Char
character :: NTParser Char
character = 
  (forall s. Char -> Parser s Char
char Char
'\\' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> NTParser Char
protectedChar)
  forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall s. (Char -> Bool) -> Parser s Char
satisfy Char -> Bool
isAsciiChar

--------------------------------------------------------------------------------
--
--  Copyright (c) 2011, 2012, 2013, 2018 Douglas Burke
--  All rights reserved.
--
--  This file is part of Swish.
--
--  Swish is free software; you can redistribute it and/or modify
--  it under the terms of the GNU General Public License as published by
--  the Free Software Foundation; either version 2 of the License, or
--  (at your option) any later version.
--
--  Swish is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
--  GNU General Public License for more details.
--
--  You should have received a copy of the GNU General Public License
--  along with Swish; if not, write to:
--    The Free Software Foundation, Inc.,
--    59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
--
--------------------------------------------------------------------------------