module NLP.GenI.Parser (
geniTestSuite, geniSemanticInput, geniTestSuiteString,
geniDerivations,
geniMacros, geniTagElems,
geniLexicon, geniMorphInfo,
geniFeats, geniSemantics, geniValue, geniWords,
geniWord, geniLanguageDef, tillEof,
parseFromFile,
module Text.Parsec,
module Text.Parsec.Text,
) where
import Control.Applicative ((*>), (<$>), (<*), (<*>), pure)
import Control.Monad (liftM, when)
import qualified Data.ByteString as B
import Data.Functor.Identity (Identity)
import qualified Data.Map as Map
import Data.Monoid ((<>))
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Tree as T
import Text.Parsec
import Text.Parsec.Text
import Text.Parsec.Token (GenLanguageDef (..),
makeTokenParser)
import qualified Text.Parsec.Token as P
import Data.FullList (FullList, Listable (..))
import NLP.GenI.FeatureStructure (AvPair (..), Flist, sortFlist)
import NLP.GenI.General (isGeniIdentLetter)
import NLP.GenI.GeniShow (GeniShow (..), geniKeyword)
import NLP.GenI.GeniVal (GeniVal, SchemaVal (..), isAnon,
mkGAnon, mkGConst, mkGConstNone,
mkGVar)
import NLP.GenI.Lexicon (LexEntry (..), fromLexSem,
mkFullLexEntry)
import NLP.GenI.Pretty (above)
import NLP.GenI.Semantics (LitConstr, Literal (..), Sem,
SemInput, sortSem)
import NLP.GenI.Tag (TagElem (..), setTidnums)
import NLP.GenI.TestSuite (TestCase (..))
import NLP.GenI.TreeSchema (GNode (..), GType (..), Ptype (..),
AdjunctionConstraint(..),
SchemaTree, Ttree (..))
#define SEMANTICS "semantics"
#define SENTENCE "sentence"
#define OUTPUT "output"
#define TRACE "trace"
#define ANCHOR "anchor"
#define SUBST "subst"
#define FOOT "foot"
#define LEX "lex"
#define TYPE "type"
#define ACONSTR "aconstr"
#define INITIAL "initial"
#define AUXILIARY "auxiliary"
#define IDXCONSTRAINTS "idxconstraints"
#define BEGIN "begin"
#define END "end"
geniLanguageDef :: P.GenLanguageDef Text () Identity
geniLanguageDef = LanguageDef
{ commentLine = "%"
, commentStart = "/*"
, commentEnd = "*/"
, nestedComments = True
, opStart = opLetter geniLanguageDef
, opLetter = oneOf ""
, reservedOpNames = []
, reservedNames =
[ SEMANTICS , SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE
, ANCHOR , SUBST , FOOT , LEX , TYPE , ACONSTR
, INITIAL , AUXILIARY
, BEGIN , END
]
, identLetter = identStuff
, identStart = identStuff
, caseSensitive = True
}
where
identStuff = satisfy isGeniIdentLetter
geniValue :: Parser GeniVal
geniValue = ((try $ anonymous) <?> "_ or ?_")
<|> (constants <?> "a constant or atomic disjunction")
<|> (variable <?> "a variable")
where
question = "?"
disjunction = geniAtomicDisjunction
constants :: Parser GeniVal
constants = mkGConst <$> disjunction
variable :: Parser GeniVal
variable =
do symbol question
v <- identifier
mcs <- option Nothing $ (symbol "/" >> Just `liftM` disjunction)
return (mkGVar v mcs)
anonymous :: Parser GeniVal
anonymous =
do optional $ symbol question
symbol "_"
return mkGAnon
geniAtomicDisjunction :: Parser (FullList Text)
geniAtomicDisjunction = do
(x:xs) <- atom `sepBy1` (symbol "|")
return (x !: xs)
where
atom = looseFlexiIdentifier
geniFancyDisjunction :: Parser SchemaVal
geniFancyDisjunction = SchemaVal <$> geniValue `sepBy1` symbol ";"
class GeniValLike v where
geniValueLike :: Parser v
instance GeniValLike GeniVal where
geniValueLike = geniValue
instance GeniValLike SchemaVal where
geniValueLike = geniFancyDisjunction
geniFeats :: GeniValLike v => Parser (Flist v)
geniFeats = option [] $ squares $ many geniAttVal
geniAttVal :: GeniValLike v => Parser (AvPair v)
geniAttVal = do
att <- identifierR <?> "an attribute"; colon
val <- geniValueLike <?> "a GenI value"
return $ AvPair att val
geniSemantics :: Parser Sem
geniSemantics =
do sem <- many (geniLiteral <?> "a literal")
return (sortSem sem)
geniLiteral :: Parser (Literal GeniVal)
geniLiteral = geniLiteral_ mkGAnon geniValue
geniLiteral_ :: a -> Parser a -> Parser (Literal a)
geniLiteral_ zero gv =
Literal <$> (option zero handleParser <?> "a handle")
<*> (gv <?> "a predicate")
<*> (parens (many gv) <?> "some parameters")
where
handleParser = try $ gv <* char ':'
geniSemanticInput :: Parser (Sem,Flist GeniVal,[LitConstr])
geniSemanticInput =
do keywordSemantics
(sem,litC) <- liftM unzip $ squares $ many literalAndConstraint
idxC <- option [] geniIdxConstraints
let sem2 = createHandles sem
semlitC2 = [ (s,c) | (s,c) <- zip sem2 litC, (not.null) c ]
return (createHandles sem, idxC, semlitC2)
where
createHandles :: Sem -> Sem
createHandles = zipWith setHandle ([1..] :: [Int])
setHandle i (Literal h pred_ par) =
let h2 = if isAnon h
then mkGConstNone ("genihandle" `T.append` T.pack (show i))
else h
in Literal h2 pred_ par
literalAndConstraint :: Parser LitConstr
literalAndConstraint =
do l <- geniLiteral
t <- option [] $ squares $ many identifier
return (l,t)
geniSemanticInputString :: Parser Text
geniSemanticInputString = do
keywordSemantics
s <- squaresString
whiteSpace
xs <- option [] geniIdxConstraints
return (spitBack s xs)
where
spitBack semStr idxC =
geniKeyword SEMANTICS semStr `above` r
where
r | null idxC = ""
| otherwise = geniKeyword IDXCONSTRAINTS (geniShowText idxC)
geniIdxConstraints :: Parser (Flist GeniVal)
geniIdxConstraints = keyword IDXCONSTRAINTS >> geniFeats
squaresString :: Parser Text
squaresString =
between (char '[') (char ']') $ do
xs <- many1 (nonSq <|> squaresString)
return $ "[" <> T.concat xs <> "]"
where
nonSq :: Parser Text
nonSq = T.pack <$> many1 (noneOf "[]")
geniTestSuite :: Parser [TestCase SemInput]
geniTestSuite =
tillEof (many geniTestCase)
geniTestSuiteString :: Parser [Text]
geniTestSuiteString =
tillEof (many geniTestCaseString)
geniDerivations :: Parser [TestCaseOutput]
geniDerivations = tillEof $ many geniOutput
geniTestCase :: Parser (TestCase SemInput)
geniTestCase =
TestCase <$> (option "" (flexiIdentifier <?> "a test case name"))
<*> lookAhead geniSemanticInputString
<*> geniSemanticInput
<*> many geniSentence
<*> pure Nothing
type TestCaseOutput = (Text, Map.Map (Text,Text) [Text])
geniOutput :: Parser TestCaseOutput
geniOutput = do
ws <- keyword OUTPUT >> squares geniWords
ds <- Map.fromList <$> many geniTraces
return (ws, ds)
geniTraces :: Parser ((Text,Text), [Text])
geniTraces = do
keyword TRACE
squares $ do
k1 <- withWhite geniWord
k2 <- withWhite geniWord
whiteSpace >> char '!' >> whiteSpace
traces <- geniWord `sepEndBy1` whiteSpace
return ((k1,k2), traces)
withWhite :: Parser a -> Parser a
withWhite p = p >>= (\a -> whiteSpace >> return a)
geniSentence :: Parser Text
geniSentence = optional (keyword SENTENCE) >> squares geniWords
geniWords :: Parser Text
geniWords =
T.unwords <$> (sepEndBy1 geniWord whiteSpace <?> "a sentence")
geniWord :: Parser Text
geniWord = T.pack <$> many1 (noneOf "[]\v\f\t\r\n ")
geniTestCaseString :: Parser Text
geniTestCaseString = do
option "" (flexiIdentifier <?> "a test case name")
geniSemanticInputString <* (many geniSentence >> many geniOutput)
geniLexicon :: Parser [LexEntry]
geniLexicon = tillEof $ many1 geniLexicalEntry
geniLexicalEntry :: Parser LexEntry
geniLexicalEntry =
do lemmas <- geniAtomicDisjunction <?> "a lemma (or disjunction thereof)"
family <- identifier <?> "a tree family"
(pars, interface) <- option ([],[]) $ parens paramsParser
equations <- option [] $ do keyword "equations"
geniFeats <?> "path equations"
filters <- option [] $ do keyword "filters"
geniFeats
keywordSemantics
(sem, pols) <- fromLexSem <$> squares geniLexSemantics
return (mkFullLexEntry lemmas family pars interface filters equations sem pols)
where
paramsParser :: Parser ([GeniVal], Flist GeniVal)
paramsParser = do
pars <- many geniValue <?> "some parameters"
interface <- option [] $ do symbol "!"
many geniAttVal
return (pars, interface)
geniLexSemantics :: Parser [Literal PolValue]
geniLexSemantics = sortSem <$> many (geniLexLiteral <?> "a literal")
type PolValue = (GeniVal, Int)
geniLexLiteral :: Parser (Literal PolValue)
geniLexLiteral = geniLiteral_ (mkGAnon,0) geniPolValue
geniPolValue :: Parser (GeniVal, Int)
geniPolValue =
do p <- geniPolarity
v <- geniValue
return (v,p)
geniMacros :: Parser [SchemaTree]
geniMacros = tillEof $ many geniTreeDef
initType, auxType :: Parser Ptype
initType = do { reserved INITIAL ; return Initial }
auxType = do { reserved AUXILIARY ; return Auxiliar }
geniTreeDef :: Parser SchemaTree
geniTreeDef =
do sourcePos <- getPosition
family <- identifier
tname <- option "" (colon *> identifier)
(pars,iface) <- geniParams
theTtype <- (initType <|> auxType)
theTree <- geniTree
let treeFail x =
do setPosition sourcePos
fail $ "In tree " ++ T.unpack family ++ ":"
++ T.unpack tname ++ " " ++ show sourcePos ++ ": " ++ x
let theNodes = T.flatten theTree
numFeet = length [ x | x <- theNodes, gtype x == Foot ]
numAnchors = length [ x | x <- theNodes, ganchor x ]
when (not $ any ganchor theNodes) $
treeFail "At least one node in an LTAG tree must be an anchor"
when (numAnchors > 1) $
treeFail "There can be no more than 1 anchor node in a tree"
when (numFeet > 1) $
treeFail "There can be no more than 1 foot node in a tree"
when (theTtype == Initial && numFeet > 0) $
treeFail "Initial trees may not have foot nodes"
psem <- option Nothing $ do { keywordSemantics; liftM Just (squares geniSemantics) }
ptrc <- option [] $ do { keyword TRACE; squares (many identifier) }
return TT{ params = pars
, pfamily = family
, pidname = tname
, pinterface = sortFlist iface
, ptype = theTtype
, tree = theTree
, ptrace = ptrc
, psemantics = psem
}
geniTree :: (Ord v, GeniValLike v) => Parser (T.Tree (GNode v))
geniTree =
do node <- geniNode
kids <- option [] (braces $ many geniTree)
<?> "child nodes"
let noKidsAllowed t c = when (c node && (not.null $ kids)) $
fail $ t ++ " nodes may *not* have any children"
noKidsAllowed "Anchor" $ ganchor
noKidsAllowed "Substitution" $ (== Subs) . gtype
noKidsAllowed "Foot" $ (== Foot) . gtype
return (T.Node node kids)
geniNode :: (Ord v, GeniValLike v) => Parser (GNode v)
geniNode = do
name <- identifier
nodeType <- geniNodeAnnotation
lex_ <- if nodeType == AnnoLexeme
then (flexiIdentifier `sepBy` symbol "|") <?> "some lexemes"
else return []
constr <- case nodeType of
AnnoDefault -> adjConstraintParser
AnnoAnchor -> adjConstraintParser
_ -> return ExplicitNoAdj
(top,bot) <- if nodeType == AnnoLexeme
then option ([],[]) $ try topbotParser
else topbotParser
return $ GN { gnname = name
, gtype = fromAnnotation nodeType
, gup = sortFlist top
, gdown = sortFlist bot
, glexeme = lex_
, ganchor = nodeType == AnnoAnchor
, gaconstr = constr
, gorigin = ""
}
where
adjConstraintParser = option MaybeAdj $
reserved ACONSTR >> char ':' >> symbol "noadj" >> return ExplicitNoAdj
topbotParser = do
top <- geniFeats <?> "top features"
symbol "!"
bot <- geniFeats <?> "bot features"
return (top,bot)
data Annotation = AnnoAnchor
| AnnoLexeme
| AnnoSubst
| AnnoFoot
| AnnoDefault
deriving Eq
fromAnnotation :: Annotation -> GType
fromAnnotation AnnoLexeme = Lex
fromAnnotation AnnoAnchor = Lex
fromAnnotation AnnoSubst = Subs
fromAnnotation AnnoFoot = Foot
fromAnnotation AnnoDefault = Other
geniNodeAnnotation :: Parser Annotation
geniNodeAnnotation =
(keyword TYPE *> ty) <|>
(reserved ANCHOR >> return AnnoAnchor) <|>
return AnnoDefault
where
ty = choice [ try (symbol s) >> return t | (s,t) <- table ]
table =
[ (ANCHOR, AnnoAnchor)
, (FOOT, AnnoFoot)
, (SUBST, AnnoSubst)
, (LEX, AnnoLexeme)
]
geniTagElems :: Parser [TagElem]
geniTagElems = tillEof $ setTidnums `fmap` many geniTagElem
geniTagElem :: Parser TagElem
geniTagElem = do
family <- identifier
tname <- option "" $ (colon *> identifier)
iface <- (snd `liftM` geniParams) <|> geniFeats
theType <- initType <|> auxType
theTree <- geniTree
sem <- do { keywordSemantics; squares geniSemantics }
return $ TE { idname = tname
, ttreename = family
, tinterface = iface
, ttype = theType
, ttree = theTree
, tsemantics = sem
, tidnum = 1
, tpolarities = Map.empty
, tsempols = []
, ttrace = []
}
geniParams :: Parser ([GeniVal], Flist GeniVal)
geniParams = parens $ do
pars <- many geniValue <?> "some parameters"
interface <- option [] $ do { symbol "!"; many geniAttVal }
return (pars, interface)
geniMorphInfo :: Parser [(Text,Flist GeniVal)]
geniMorphInfo = tillEof $ many morphEntry
morphEntry :: Parser (Text,Flist GeniVal)
morphEntry = (,) <$> identifier <*> geniFeats
geniPolarity :: Parser Int
geniPolarity = option 0 (plus <|> minus)
where
plus = do { char '+'; return 1 }
minus = do { char '-'; return (1) }
keyword :: Text -> Parser Text
keyword k =
(try $ do { reserved k; colon; return k }) <?> T.unpack k ++ ":"
keywordSemantics :: Parser Text
keywordSemantics = keyword SEMANTICS
lexer :: P.GenTokenParser Text () Identity
lexer = makeTokenParser geniLanguageDef
whiteSpace :: Parser ()
whiteSpace = P.whiteSpace lexer
identifier :: Parser Text
identifier = decode <$> P.identifier lexer
flexiIdentifier :: Parser Text
flexiIdentifier = stringLiteral <|> identifier
looseIdentifier :: Parser Text
looseIdentifier =
decode <$> do { i <- ident ; whiteSpace; return i }
where
ident = do
{ c <- identStart geniLanguageDef
; cs <- many (identLetter geniLanguageDef)
; return (c:cs) } <?> "identifier"
looseFlexiIdentifier :: Parser Text
looseFlexiIdentifier = looseIdentifier <|> stringLiteral
colon :: Parser Text
colon = decode <$> P.colon lexer
stringLiteral :: Parser Text
stringLiteral = decode <$> P.stringLiteral lexer
squares, braces, parens :: Parser a -> Parser a
squares = P.squares lexer
braces = P.braces lexer
parens = P.parens lexer
reserved :: Text -> Parser Text
reserved s = P.reserved lexer (T.unpack s) >> return s
symbol :: Text -> Parser Text
symbol s = P.symbol lexer (T.unpack s) >> return s
decode :: String -> Text
decode = T.pack
identifierR :: Parser Text
identifierR = decode <$> do
{ c <- P.identStart geniLanguageDef
; cs <- many (P.identLetter geniLanguageDef)
; return (c:cs)
} <?> "identifier or reserved word"
tillEof :: Parser a -> Parser a
tillEof p = whiteSpace *> p <* eof
parseFromFile :: Parser a -> SourceName -> IO (Either ParseError a)
parseFromFile p fname = do
{ input <- T.decodeUtf8 <$> B.readFile fname
; return (parse p fname input)
}