module NLP.Polh.Binary
( BinEntry (..)
, Key (..)
, Rule (..)
, proxyForm
, binKey
, between
, apply
, savePolh
, loadPolh
, PolhT
, runPolhT
, PolhM
, runPolh
, index
, withKey
, lookup
) where
import Prelude hiding (lookup)
import Control.Exception (try, SomeException)
import Control.Applicative (Applicative, (<$>), (<*>))
import Control.Monad (when, guard)
import Control.Monad.Trans.Class (MonadTrans)
import Control.Monad.IO.Class (liftIO, MonadIO)
import Control.Monad.Reader (ReaderT (..), ask, lift)
import Control.Monad.Trans.Maybe (MaybeT (..))
import System.IO.Unsafe (unsafeInterleaveIO)
import System.FilePath ((</>))
import System.Directory ( getDirectoryContents, createDirectoryIfMissing
, createDirectory, doesDirectoryExist )
import Data.Maybe (catMaybes)
import Data.List (mapAccumL)
import Data.Binary (Binary, get, put, encodeFile, decodeFile)
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.DAWG.Dynamic as DD
import qualified Data.DAWG.Static as D
import NLP.Polh.Types
import qualified NLP.Polh.Util as Util
type DAWG a = D.DAWG Char () a
entryDir :: String
entryDir = "entries"
formMapFile :: String
formMapFile = "forms.bin"
data BinEntry = BinEntry {
entry :: LexEntry
, uid :: Int }
deriving (Show, Eq, Ord)
instance Binary BinEntry where
put BinEntry{..} = put entry >> put uid
get = BinEntry <$> get <*> get
data Key = Key {
keyForm :: T.Text
, keyUid :: Int }
deriving (Show, Eq, Ord)
proxyForm :: LexEntry -> T.Text
proxyForm entry = case Util.allForms entry of
(x:_) -> x
[] -> error "proxyForm: entry with no forms"
binKey :: BinEntry -> Key
binKey BinEntry{..} = Key (proxyForm entry) uid
showKey :: Key -> String
showKey Key{..} = (T.unpack . T.concat) [T.pack (show keyUid), "-", keyForm]
parseKey :: String -> Key
parseKey x =
let (uid'S, (_:form'S)) = break (=='-') x
in Key (T.pack form'S) (read uid'S)
loadContents :: FilePath -> IO [FilePath]
loadContents path = do
xs <- getDirectoryContents path
return [x | x <- xs, x /= ".", x /= ".."]
emptyDirectory :: FilePath -> IO Bool
emptyDirectory path = null <$> loadContents path
saveLexEntry :: FilePath -> BinEntry -> IO ()
saveLexEntry path x =
let binPath = showKey . binKey
in encodeFile (path </> binPath x) x
withUid :: DD.DAWG Char Int -> LexEntry -> (DD.DAWG Char Int, BinEntry)
withUid m x =
let path = T.unpack (proxyForm x)
num = maybe 0 id (DD.lookup path m) + 1
in (DD.insert path num m, BinEntry x num)
withUids :: [LexEntry] -> [BinEntry]
withUids = snd . mapAccumL withUid DD.empty
mapIO'Lazy :: (a -> IO b) -> [a] -> IO [b]
mapIO'Lazy f (x:xs) = (:) <$> f x <*> unsafeInterleaveIO (mapIO'Lazy f xs)
mapIO'Lazy _ [] = return []
savePolh :: FilePath -> Polh -> IO ()
savePolh path xs = do
createDirectoryIfMissing True path
isEmpty <- emptyDirectory path
when (not isEmpty) $ do
error $ "savePolh: directory " ++ path ++ " is not empty"
let lexPath = path </> entryDir
createDirectory lexPath
formMap' <- D.fromListWith S.union . concat
<$> mapIO'Lazy (saveLex lexPath) (withUids xs)
encodeFile (path </> formMapFile) formMap'
where
saveLex lexPath x = do
saveLexEntry lexPath x
return $ rules x
rules binEntry =
[ ( T.unpack x
, S.singleton (between x key) )
| x <- Util.allForms (entry binEntry) ]
where
key = binKey binEntry
maybeErr :: MonadIO m => IO a -> m (Maybe a)
maybeErr io = do
r <- liftIO (try io)
case r of
Left (_e :: SomeException) -> return Nothing
Right x -> return (Just x)
maybeT :: Monad m => Maybe a -> MaybeT m a
maybeT = MaybeT . return
maybeErrT :: MonadIO m => IO a -> MaybeT m a
maybeErrT io = do
r <- liftIO (maybeErr io)
maybeT r
loadLexEntry :: FilePath -> Key -> IO (Maybe BinEntry)
loadLexEntry path key = do
maybeErr $ decodeFile (path </> showKey key)
data Rule = Rule {
cut :: !Int
, suffix :: !T.Text
, ruleUid :: !Int }
deriving (Show, Eq, Ord)
instance Binary Rule where
put Rule{..} = put cut >> put suffix >> put ruleUid
get = Rule <$> get <*> get <*> get
apply :: Rule -> T.Text -> Key
apply r x =
let y = T.take (T.length x cut r) x `T.append` suffix r
in Key y (ruleUid r)
between :: T.Text -> Key -> Rule
between source dest =
let k = lcp source (keyForm dest)
in Rule (T.length source k) (T.drop k (keyForm dest)) (keyUid dest)
where
lcp a b = case T.commonPrefixes a b of
Just (c, _, _) -> T.length c
Nothing -> 0
data MemData = MemData
{ polhPath :: FilePath
, formMap :: DAWG (S.Set Rule) }
newtype PolhT m a = PolhT (ReaderT MemData m a)
deriving (Functor, Applicative, Monad, MonadTrans, MonadIO)
type PolhM a = PolhT IO a
entryPath :: MemData -> FilePath
entryPath = (</> entryDir) . polhPath
index :: (Applicative m, MonadIO m) => PolhT m [Key]
index = PolhT $ do
path <- entryPath <$> ask
map parseKey <$> liftIO (loadContents path)
withKey :: (Applicative m, MonadIO m) => Key -> PolhT m (Maybe BinEntry)
withKey key = PolhT $ do
path <- entryPath <$> ask
liftIO . unsafeInterleaveIO $ loadLexEntry path key
lookup :: (Applicative m, MonadIO m) => T.Text -> PolhT m [BinEntry]
lookup x = do
fm <- PolhT $ formMap <$> ask
keys <- return $ case D.lookup (T.unpack x) fm of
Nothing -> []
Just xs -> map (flip apply x) (S.toList xs)
catMaybes <$> mapM withKey keys
runPolh :: FilePath -> PolhM a -> IO (Maybe a)
runPolh path polh = runPolhT path polh
runPolhT :: MonadIO m => FilePath -> PolhT m a -> m (Maybe a)
runPolhT path (PolhT r) = runMaybeT $ do
formMap' <- maybeErrT $ decodeFile (path </> formMapFile)
doesExist <- liftIO $ doesDirectoryExist (path </> entryDir)
guard doesExist
lift $ runReaderT r (MemData path formMap')
loadPolh :: FilePath -> IO (Maybe [BinEntry])
loadPolh path = runPolhT path $ do
keys <- index
catMaybes <$> mapM withKey keys