{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE RecordWildCards #-} import Control.Applicative ((<$>)) import Control.Monad (unless, forM_) import Data.Double.Conversion.Text (toShortest) import System.Console.CmdArgs import System.IO (hFlush, stdout) import qualified Network as N import qualified Numeric.SGD as SGD import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.IO as L import qualified Data.Text as T import qualified Data.Text.IO as T import qualified Data.List.Split as Split import qualified Data.Char as Char import Data.Tagset.Positional (parseTagset) import qualified Data.Tagset.Positional as P import Data.Maybe (fromMaybe) import qualified Data.Map as M import qualified NLP.Concraft.Morphosyntax.Accuracy as Acc import qualified NLP.Concraft.Guess as Guess import qualified NLP.Concraft.Croatian as C import NLP.Concraft (tagset) import qualified NLP.Concraft.Croatian.Request as R import qualified NLP.Concraft.Croatian.Server as S import qualified NLP.Concraft.Croatian.Morphosyntax as X import qualified NLP.Concraft.Morphosyntax as X hiding (Word, orth) import Paths_concraft_hr (version, getDataFileName) import Data.Version (showVersion) import qualified NLP.Morphosyntax.Analyzer as A -- | Default port number. portDefault :: Int portDefault = 43982 --------------------------------------- -- Command line options --------------------------------------- -- | A description of the Concraft-hr tool. concraftDesc :: String concraftDesc = "Concraft-hr " ++ showVersion version data Concraft = -- | Training mode. Requires a path to the training file and output file, all -- other arguments are optional. It is recommended that one at least tries to -- find a proper regularization value, otherwise it is possible that the model -- fits the data a lot. On Croatian dataset the fit with the default regularization -- value is close to 100% (not good if you want to tag something other than your -- training set). Train { trainPath :: FilePath , evalPath :: Maybe FilePath , tagsetPath :: Maybe FilePath , noAna :: Bool , inAnalyzer :: Maybe FilePath , gIterNum :: Double , dIterNum :: Double -- Double? , gBatchSize :: Int , dBatchSize :: Int , gRegVar :: Double , dRegVar :: Double , gGain0 :: Double , dGain0 :: Double , gTau :: Double , dTau :: Double , disk :: Bool , outModel :: FilePath , guessNum :: Int , r0 :: Guess.R0T } -- | Tagging mode. This one requires a model and all other parameters are optional. -- It is recommended to precompute the analysis (set of possible tags) for all words -- if testing/evaluating the models, otherwise the analyzer is necessary for the -- tagging to work. | Tag { inModel :: FilePath , inAnalyzer :: Maybe FilePath , noAna :: Bool , marginals :: Bool } -- , guessNum :: Int } -- | Server mode starts the server and requires a path to the model and analyzer. -- It is recommended to provide a path to the analyzer as it might -- happen that 'Client' started requires additional analysis. If not the analyzer -- won't be used but currently it's necessary to provide a path. | Server { inModel :: FilePath , inAnalyzer :: Maybe FilePath , port :: Int } -- | Client mode starts a client which communicates with the 'Server' in a type-safe -- manner. The results are produced to standard output on the client side. | Client { noAna :: Bool , tagsetPath :: Maybe FilePath , marginals :: Bool , host :: String , port :: Int } -- | Compare mode can be used to produce accuracy results for the two provided -- tagged sentences in files. | Compare { tagsetPath :: Maybe FilePath , refPath :: FilePath , otherPath :: FilePath } -- | Prune mode given a path to the model, output model, and threshold value removes -- the features from the tagging model which have weights below the threshold (in log-domain). | Prune { inModel :: FilePath , outModel :: FilePath , threshold :: Double } -- | ReAna -- { format :: Format } deriving (Data, Typeable, Show) trainMode :: Concraft trainMode = Train { trainPath = def &= argPos 1 &= typ "TRAIN-FILE" , evalPath = def &= typFile &= help "Evaluation file" , tagsetPath = def &= typFile &= help "Tagset definition file" , noAna = False &= help "Do not perform reanalysis - (False)" , inAnalyzer= def &= typFile &= help "Morphosyntactic analyzer - needed if noana is false" -- , discardHidden = False &= help "Discard hidden features" , gIterNum = 20 &= help "Number of SGD iterations of guessing model - (20)" , dIterNum = 20 &= help "Number of SGD iterations of disambiguation model - (20)" , gBatchSize = 50 &= help "Batch size for SGD of guessing model - (50)" , dBatchSize = 50 &= help "Batch size for SGD of disambiguation model - (50)" , gRegVar = 10.0 &= help "Regularization variance of guessing model - (10.0)" , dRegVar = 10.0 &= help "Regularization variance of disambiguation model - (10.0)" , gGain0 = 1.0 &= help "Initial gain parameter of guessing model - (1.0)" , dGain0 = 1.0 &= help "Initial gain parameter of disambiguation model - (1.0)" , gTau = 5.0 &= help "Initial tau parameter of guessing model - (5.0)" , dTau = 5.0 &= help "Initial tau parameter for disambiguation model - (5.0)" , disk = False &= help "Store SGD dataset on disk - (False)" , outModel = def &= typFile &= help "Output Model file" , guessNum = 10 &= help "Number of guessed tags for each unknown word - (10)" , r0 = Guess.OovChosen &= help "R0 construction method (AnyInterps, AnyChosen, OovChosen) - (OovChosen)" } &= help "Training mode - needs training data - outputs a CRF model to a given filepath.\ \ Values or modes shown between the brackets are default values." tagMode :: Concraft tagMode = Tag { inModel = def &= argPos 0 &= typ "MODEL-FILE" , inAnalyzer= def &= typFile &= help "Morphosyntactic analyzer - needed if noana is false" , noAna = False &= help "Do not analyse input text - (False)" , marginals = False &= help "Tag with marginal probabilities - (False)" } -- , guessNum = 10 &= help "Number of guessed tags for each unknown word" } &= help "Given tokenized input data it outputs the same tokenized form with tags next to each word.\ \ Values shown between the brackets are default values." serverMode :: Concraft serverMode = Server { inModel = def &= argPos 0 &= typ "MODEL-FILE" &= help "Concraft model file" , inAnalyzer= def &= typFile &= help "Morphosyntactic analyzer file" , port = portDefault &= help "Port number" } &= help "Starts a server which can process requests of tokenized sentences - tagging and analysis." clientMode :: Concraft clientMode = Client { noAna = False &= help "Do not perform reanalysis" , tagsetPath = def &= typFile &= help "Tagset definition file" , port = portDefault &= help "Port number - (43982)" , host = "localhost" &= help "Server host name" , marginals = False &= help "Tag with marginal probabilities" } &= help "Starts a client which sends requests of tokenized sentences, received from stdin, to server." compareMode :: Concraft compareMode = Compare { refPath = def &= argPos 1 &= typ "REFERENCE-FILE" , otherPath = def &= argPos 2 &= typ "OTHER-FILE" , tagsetPath = def &= typFile &= help "Tagset definition file" } &= help "Compares the two given files -- meant to be used for accuracy checking of the model." pruneMode :: Concraft pruneMode = Prune { inModel = def &= argPos 0 &= typ "INPUT-MODEL" , outModel = def &= argPos 1 &= typ "OUTPUT-MODEL" , threshold = 0.05 &= help "Remove disambiguation features below the threshold - (0.05)" } &= help "Prunes (removes) the features of the model (in log-domain) with weights below the given threshold." argModes :: Mode (CmdArgs Concraft) argModes = cmdArgsMode $ modes [trainMode, tagMode, serverMode, clientMode, compareMode, pruneMode] &= summary concraftDesc &= program "concraft-hr" --------------------------------------- -- Main --------------------------------------- main :: IO () main = exec =<< cmdArgsRun argModes exec :: Concraft -> IO () exec Train{..} = do tagsetPath' <- case tagsetPath of Nothing -> getDataFileName "config/tagset.cfg" Just x -> return x tset <- parseTagset tagsetPath' <$> readFile tagsetPath' let train0 = parseTrain tset trainPath let eval0 = case evalPath of Nothing -> return [] Just x -> parseTrain tset x let alyzer = fmap A.load inAnalyzer let ean = A.create tset A.emptyConf [] -- dirty `undefined` bugfix analyzer <- fromMaybe (if noAna then return ean else error errMsg) alyzer let af = analyze $ A.getTags analyzer -- analysis function let train1 = if noAna then train0 else map af <$> train0 let eval1 = if noAna then eval0 else map af <$> eval0 concraft <- C.train (trainConf tset) train1 eval1 unless (null outModel) $ do putStrLn $ "\nSaving model in " ++ outModel ++ "..." C.saveModel outModel concraft where analyze af sent = X.addAnalysis sent (map (af . X.orth . X.word) sent) gsgdArgs = SGD.SgdArgs { SGD.batchSize = gBatchSize , SGD.regVar = gRegVar , SGD.iterNum = gIterNum , SGD.gain0 = gGain0 , SGD.tau = gTau } dsgdArgs = SGD.SgdArgs { SGD.batchSize = dBatchSize , SGD.regVar = dRegVar , SGD.iterNum = dIterNum , SGD.gain0 = dGain0 , SGD.tau = dTau } trainConf tagset = C.TrainConf { tagset = tagset , gSgdArgs = gsgdArgs , dSgdArgs = dsgdArgs , reana = not noAna , onDisk = disk , guessNum = guessNum , r0 = r0 } errMsg = "Please define the path to analyzer or set -noana to true!\n" exec Tag{..} = do cft <- C.loadModel inModel let alyzer = fmap A.load inAnalyzer let ean = A.create (tagset cft) A.emptyConf [] -- dirty `undefined` bugfix analyzer <- fromMaybe (if noAna then return ean else error errMsg) alyzer let tset = tagset cft inp <- parseContent tset <$> L.getContents out <- R.short analyzer cft $ rq $ buildReq inp showData tset $ fmap (addTags inp) out where -- | The @x@ is sentence of words and possible interpretations. -- If the chosen tagging mod isn't marginals, it is expected that -- the result of the tagging is a list of tuples @(a,b)@ the -- set of possible tags (which were guessed by the guessing model or -- as a result of analysis from the 'Analyzer') @a@, and the disambiguated -- tag @b@. The 'X.Seg' is then zipped with a tag to prepare the results -- for the output. addTags x = zip x . map (map snd) -- | Builds a request and adds the body which will be sent to the server. rq x = R.Request { R.rqBody = x } errMsg = "Please define the path to analyzer or set -noana to true!\n" -- | Depending on the no-analysis and marginals boolean values a proper -- constructor is chosen. Constructors having an A suffix obviously are -- used to indicate that additional analysis is needed. buildReq = case (noAna, marginals) of (True,True) -> R.Marginal (True,False) -> R.PlainTag (False,True) -> R.MarginalA (False,False)-> R.PlainTagA exec Server{..} = do putStr "Loading model..." >> hFlush stdout concraft <- C.loadModel inModel putStr "Loading analyzer..." >> hFlush stdout let alyzer = fmap A.load inAnalyzer analyzer <- fromMaybe (error errMsg) alyzer putStrLn " done" let portNum = N.PortNumber $ fromIntegral port putStrLn $ "Listening on port " ++ show port S.runConcraftServer analyzer concraft portNum where errMsg = "Please define the path to analyzer!\n" exec Client{..} = do tagsetPath' <- case tagsetPath of Nothing -> getDataFileName "config/tagset.cfg" Just x -> return x tset <- parseTagset tagsetPath' <$> readFile tagsetPath' let portNum = N.PortNumber $ fromIntegral port inp <- parseContent tset <$> L.getContents out <- S.submit host portNum $ rq $ buildReq inp showData tset (fmap (addTags inp) out) where -- | The @x@ is sentence of words and possible interpretations. -- If the chosen tagging mod isn't marginals, it is expected that -- the result of the tagging is a list of tuples @(a,b)@ the -- set of possible tags (which were guessed by the guessing model or -- as a result of analysis from the 'Analyzer') @a@, and the disambiguated -- tag @b@. The 'X.Seg' is then zipped with a tag to prepare the results -- for the output. addTags x = zip x . map (map snd) -- | Builds a request and adds the body which will be sent to the server. rq x = R.Request { R.rqBody = x} -- | Depending on the no-analysis and marginals boolean values a proper -- constructor is chosen. Constructors having an A suffix obviously are -- used to indicate that additional analysis is needed. buildReq = case (noAna, marginals) of (True,True) -> R.Marginal (True,False) -> R.PlainTag (False,True) -> R.MarginalA (False,False)-> R.PlainTagA exec Compare{..} = do tagsetPath' <- case tagsetPath of Nothing -> getDataFileName "config/tagset.cfg" -- if no path is given the default one is used Just x -> return x tset <- parseTagset tagsetPath' <$> readFile tagsetPath' let convert = concat xs <- convert <$> parseFile tset refPath ys <- convert <$> parseFile tset otherPath let s = Acc.weakLB tset xs ys putStrLn $ "Number of segments in reference file: " ++ show (Acc.gold s) putStrLn $ "Number of correct tags: " ++ show (Acc.good s) putStrLn $ "Weak accuracy lower bound: " ++ show (Acc.accuracy s) exec Prune{..} = do cft <- C.loadModel inModel C.saveModel outModel $ C.prune threshold cft --------------------------------------- -- Reading files --------------------------------------- parseFileG :: ([L.Text] -> X.Sent X.Word P.Tag) -> FilePath -> IO [X.Sent X.Word P.Tag] parseFileG packSent path = map packSent . parseText <$> L.readFile path -- | Assumes that the given 'FilePath' is a file containing the training set. -- It is also assumed that the sentences in the file are clearly separated by -- at least two whitespaces (be it newlines or spaces) and that each word is -- in its own row and any tags should follow the word (in the same row) separated -- by one or more whitespaces. parseTrain :: P.Tagset -> FilePath -> IO [X.Sent X.Word P.Tag] parseTrain tset = parseFileG (X.packSentT tset) -- | It is assumed that the sentences in the file are clearly separated by -- at least two whitespaces (be it newlines or spaces) and that each word is -- in its own row and any tags should follow the word (in the same row) separated -- by one or more whitespaces. parseFile :: P.Tagset -> FilePath -> IO [X.Sent X.Word P.Tag] parseFile tset = parseFileG (X.packSent tset) --------------------------------------- -- Parsing --------------------------------------- -- | Parses a given 'L.Text'. Function is used to prepare the data for packing -- into 'X.Sent'. Used by 'parseFile' and 'parseTrain'. parseText :: L.Text -> [[L.Text]] parseText = filter (not . null) . Split.splitWhen (L.all Char.isSpace) . L.lines -- | Used as a function which parses the formated input as described in 'parseTrain' or -- 'parseFile' but doesn't except a 'FilePath'. parseContent :: P.Tagset -> L.Text -> [X.Sent X.Word P.Tag] parseContent tset = map (X.packSent tset) . parseText --------------------------------------- -- Displaying data --------------------------------------- -- | Displays the data of tagged sentences. -- If tagging was done using marginal probabilities all the tags are shown -- with their corresponding probabilities - format WORD(\tPROB\tTAG)+\n. -- Otherwise, WORD\tTAG\n format is used and sentences are separated with a newline character. showData :: P.Tagset -> Either [X.Sent X.Word P.Tag] [(X.Sent X.Word P.Tag, [P.Tag])] -> IO () showData tset (Left xss) = forM_ xss (\sentence -> do forM_ sentence (\wrd -> do T.putStr $ (X.orth . X.word) wrd forM_ (M.toList . X.unWMap $ X.tags wrd) (\(tg,prob) -> do T.putStr "\t" T.putStr $ toShortest prob T.putStr "\t" T.putStr . T.filter (/=':') . P.showTag tset $ tg -- ^^ Text form of a tag is filtered because by default -- the separator between POS and attributes is ':' -- TODO remove the dependency on the 'P.showTag' ) T.putStr "\n" ) T.putStr "\n" ) showData tset (Right xss) = forM_ xss (\(sent, tgs) -> do forM_ (zip sent tgs) (\(wrd, tg) -> do T.putStr $ (X.orth . X.word) wrd T.putStr "\t" T.putStr . T.filter (/=':') . P.showTag tset $ tg T.putStr "\n" ) T.putStr "\n" )