module Text.PhonotacticLearner.MaxentGrammar (
Length, Lexicon(..), sortLexicon, lengthCdf, lengthPdf,
maxentProb,
lexLogProbTotalDeriv, lexLogProbPartialDeriv,
llpOptimizeWeights,
sampleWord, sampleWordSalad
) where
import Text.PhonotacticLearner.Util.Ring
import Text.PhonotacticLearner.DFST
import Text.PhonotacticLearner.Util.Probability
import Text.PhonotacticLearner.Util.ConjugateGradient
import Data.Array.IArray
import Data.Array.Unboxed
import Control.Monad
import Control.Monad.State
import Control.Arrow ((&&&),(***))
import System.Random
import Data.List as L
import Data.Monoid
import Data.Maybe
import qualified Data.Map as M
import qualified Data.Vector.Unboxed as V
-- | Unnormalized maxent probability of a violation profile: @exp (-(w·v))@
-- for weight vector @w@ and violation counts @v@.
maxentProb :: Vec -> Multicount -> Double
maxentProb !weights !counts = exp (negate score)
    where score = innerProd weights (fromMC counts)
-- | Word length, measured in segments.
type Length = Int

-- | A corpus of words together with their token frequencies,
-- with frequency totals broken down by word length.
data Lexicon sigma = Lex { totalWords :: Int -- ^ total token count over all words
                         , lengthFreqs :: Array Length Int -- ^ token count for each word length
                         , wordFreqs :: [([sigma], Int)] -- ^ each distinct word with its token count
                         } deriving Show
-- | Bundle a list of (word, frequency) pairs into a 'Lexicon', merging
-- duplicate words and tabulating token counts by word length.
sortLexicon :: (Ord sigma) => [([sigma],Int)] -> Lexicon sigma
sortLexicon wfs = Lex twords alengths awf
    where
        mwf = M.fromListWith (+) wfs
        awf = M.assocs mwf
        mlengths = M.mapKeysWith (+) length mwf
        -- M.findMax is partial: guard so an empty corpus yields an empty
        -- lexicon instead of crashing.
        maxlen = if M.null mlengths then 0 else fst (M.findMax mlengths)
        alengths = accumArray (+) 0 (0,maxlen) (M.assocs mlengths)
        twords = sum (M.elems mlengths)
-- | Cumulative distribution over word lengths observed in a lexicon.
lengthCdf :: Lexicon sigma -> Cdf Length
lengthCdf lex = massToCdf masses
    where masses = assocs (fmap fromIntegral (lengthFreqs lex))
-- | Probability of each word length in a lexicon, as a fraction of the
-- total token count.
lengthPdf :: Lexicon sigma -> [(Length, Double)]
lengthPdf lex = [ (l, fromIntegral n / total) | (l, n) <- assocs (lengthFreqs lex) ]
    where total = fromIntegral (totalWords lex)
-- | Add the gradient of the L1 prior (one per component) to a
-- log-probability gradient, clamping the result to be non-positive for
-- weights already at the lower boundary (< 0.01) so they are not pushed
-- further negative.
priorDeriv :: Vec -> Vec -> Vec
priorDeriv (Vec !weights) (Vec !dlp) = Vec (V.zipWith adjust weights dlp)
    where
        adjust w d
            | w < 0.01  = min (d+1) 0
            | otherwise = d+1
-- | Negative log-probability of an observed lexicon under the maxent
-- distribution induced by a violation-counting transducer and a weight
-- vector, plus an L1 prior on the weights — together with its gradient,
-- for use in gradient-based optimization ('llpOptimizeWeights').
lexLogProbTotalDeriv :: (Ix sigma) => MulticountDFST sigma -- ^ transducer counting constraint violations
                     -> Array Length Int -- ^ token count for each word length
                     -> Vec -- ^ observed total violation counts
                     -> Vec -- ^ current constraint weights
                     -> (Double, Vec) -- ^ (objective value, gradient)
lexLogProbTotalDeriv !ctr !lengths !oviols !weights = (totalViolWeight + totalNormalizer + prior, priorDeriv weights (oviols ⊖ expviols))
    where
        -- L1 regularizer on the weights.
        prior = l1Vec weights
        -- Transducer reweighted by exp of (negated) weighted violations.
        edfa = weightExpVec ctr weights
        (_,maxlen) = bounds lengths
        -- Per-length partition data: normalizer and expected violations
        -- for words of each length up to maxlen.
        exps = expsByLengthVec edfa maxlen
        totalViolWeight = innerProd oviols weights
        -- Sum of log-normalizers, weighted by how many tokens have each length.
        totalNormalizer = sum . fmap (\(l,n) -> n ⊙ log (prob (exps ! l))) . assocs $ lengths
        -- Expected violation vector over the whole lexicon.
        expviols = sumR . fmap (\(l,n) -> n ⊙ normalizeExp (exps ! l)) . assocs $ lengths
-- | Directional derivative of the 'lexLogProbTotalDeriv' objective along
-- direction @dir@: derivative of the prior, plus observed violations
-- projected onto @dir@, minus the expected violations along @dir@.
--
-- (The original line applied two-argument 'innerProd' to three arguments —
-- a dropped @- expviols@; restored here to mirror @oviols ⊖ expviols@ in
-- the full gradient.)
lexLogProbPartialDeriv :: (Ix sigma) => MulticountDFST sigma -> Array Length Int -> Vec -> Vec -> Vec -> Double
lexLogProbPartialDeriv !ctr !lengths !oviols !weights !dir = innerProd (dl1Vec weights) dir + innerProd dir oviols - expviols
    where
        -- Transducer carrying the derivative of the exponentiated weight
        -- along dir.
        edfa = weightExpPartial ctr weights dir
        (_,maxlen) = bounds lengths
        exps = expsByLengthDouble edfa maxlen
        -- Expected violations along dir, summed over all word lengths.
        expviols = sumR . fmap (\(l,n) -> n ⊙ normalizeExp (exps ! l)) . assocs $ lengths
-- | Clamp all weights below 0.01 to zero, also reporting whether any
-- nonzero weight was actually clamped (used by the conjugate-gradient
-- search to detect boundary crossings).
zeroNeg :: Vec -> (Vec, Bool)
zeroNeg (Vec v) = (Vec clamped, crossed)
    where
        clamped = V.map (\x -> if x < 0.01 then 0 else x) v
        crossed = V.any (\x -> x /= 0 && x < 0.01) v
-- | Optimize constraint weights to maximize the probability of the lexicon
-- (equivalently, minimize the 'lexLogProbTotalDeriv' objective) by
-- conjugate-gradient search, keeping weights clamped non-negative via
-- 'zeroNeg'.
llpOptimizeWeights :: (Ix sigma) => Array Length Int -- ^ token count per word length
                   -> PackedText sigma -- ^ packed corpus text
                   -> MulticountDFST sigma -- ^ violation-counting transducer
                   -> Vec -- ^ initial weights
                   -> Vec
llpOptimizeWeights lengths pwfs dfa initweights =
    conjugateGradientSearch True
                            (0.01, 0.005)
                            zeroNeg
                            (lexLogProbTotalDeriv dfa lengths oviols)
                            (lexLogProbPartialDeriv dfa lengths oviols)
                            initweights
    where
        -- Observed violation counts over the whole corpus.
        oviols = fromMC (transducePackedMulti dfa pwfs)
-- | Advance a per-state weight vector by one input symbol: the new weight
-- of each state accumulates (⊕) the weight of every one-step path into it.
stepweights :: (Ix q, Ix sigma, Semiring k) => DFST q sigma k -> Array q k -> Array q k
stepweights dfa@(DFST _ tm _) prev = accumArray (⊕) zero (stateBounds dfa) (map step (range (bounds tm)))
    where
        -- Follow the transition out of s on c, carrying s's current weight.
        step (s,c) = case tm!(s,c) of (ns,w) -> (ns, (prev!s) ⊗ w)
-- | Weight vector for the empty prefix: 'one' at the initial state and
-- 'zero' everywhere else.
initialWeightArray :: (Ix l, Ix sigma, Semiring w) => DFST l sigma w -> Array l w
initialWeightArray dfa = fnArray (stateBounds dfa) weightAt
    where
        weightAt q
            | q == initialState dfa = one
            | otherwise             = zero
-- | Reverse the transition table of a DFST: map each (target state,
-- symbol) to the list of (source state, weight) pairs that reach it.
reverseTM :: (Ix q, Ix sigma) => DFST q sigma k -> Array (q,sigma) [(q,k)]
reverseTM (DFST _ arr _) = accumArray (flip (:)) [] (bounds arr) backEdges
    where backEdges = [ ((s',c),(s,w)) | ((s,c),(s',w)) <- assocs arr ]
-- | Sample random words from the maxent distribution defined by a weighted
-- DFST.  Given a maximum length, returns a sampler that takes a desired
-- length (capped at the maximum) and draws a word of that length in a
-- random-generator state monad.
--
-- Sampling runs backwards: 'maxentArray' tabulates, for every prefix
-- length k and state q, the total weight of length-k paths from the start
-- state to q.  A final state is drawn, then transitions are walked in
-- reverse, each step choosing an incoming edge with probability
-- proportional to (edge weight) × (weight of prefixes reaching its source).
sampleWord :: forall g sigma m . (RandomGen g, Ix sigma, MonadState g m)
           => DFST Int sigma Double
           -> Length
           -> (Length -> m [sigma])
sampleWord dfa maxn = backDists `seq` \n -> do
        fs <- sampleCdf (finalStates ! n)
        -- Walk backwards from the sampled final state, collecting emitted
        -- characters in reverse order.
        rcs <- flip evalStateT fs . forM (reverse . range $ (1, min n maxn)) $ \k -> do
            s <- get
            (c,s') <- lift . sampleCdf $ backDists!(k,s)
            put s'
            return c
        return (reverse rcs)
    where
        backnfa = reverseTM dfa
        qbound = stateBounds dfa
        -- Prefix-weight tables for lengths 0..maxn.
        maxentPrefixes = take (maxn + 1) (iterate (stepweights dfa) (initialWeightArray dfa))
        maxentArray :: UArray (Int,Int) Double
        maxentArray = array ((0,maxn) `xbd` qbound) . join . snd . mapAccumL (\k a -> (k+1, fmap (\(x,p)->((k,x),p)) (assocs a))) 0 $ maxentPrefixes
        -- Distribution over (symbol, predecessor) pairs for one backwards
        -- step out of state s at prefix length k.
        backDist :: (Int, Int) -> Cdf (sigma, Int)
        backDist (k, s) = massToCdf $ do
            c <- range (segBounds dfa)
            (s', w) <- backnfa!(s,c)
            -- A length-(k-1) prefix must end at the predecessor state s'.
            -- (Original had unbound `k1` here.)
            return ((c,s'), w * (maxentArray!(k-1,s')))
        backDists :: Array (Int, Int) (Cdf (sigma, Int))
        backDists = fnArray ((1,maxn) `xbd` qbound) backDist
        -- Distribution over final states, one Cdf per word length.
        finalStates :: Array Int (Cdf Int)
        finalStates = array (1,maxn) $ do
            n <- range (1,maxn)
            let cdf = massToCdf $ do
                    s <- range qbound
                    return (s, maxentArray!(n,s) * finalWeights dfa!s)
            return (n,cdf)
-- | Sample a batch of words whose lengths follow the given length
-- distribution: draw how many words to generate at each length, then
-- sample that many words of each length with 'sampleWord'.
sampleWordSalad :: (RandomGen g, Ix sigma, MonadState g m) => DFST Int sigma Double -> Cdf Length -> Int -> m [[sigma]]
sampleWordSalad dfa lengthdist samples = mapM sampler targetLengths
    where
        lengthCounts = uniformSample lengthdist samples
        longest = maximum (fmap fst lengthCounts)
        sampler = sampleWord dfa longest
        -- Expand (length, count) pairs into one entry per word to draw.
        targetLengths = concatMap (\(l,n) -> replicate n l) lengthCounts