{-# LANGUAGE DataKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE PolyKinds #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE OverloadedLabels #-}
{-# LANGUAGE QuantifiedConstraints #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use camelCase" #-}

module LDA where

import Model
import Sampler
import Control.Monad
import Data.Kind (Constraint)
import Env
import Inference.SIM as SIM ( simulate )
import Inference.MH as MH


-- ** Latent dirichlet allocation (topic model) 
type TopicEnv =
  '[ "θ" ':= [Double],
     "φ" ':= [Double],
     "w" ':= String
   ]

topicWordPrior :: Observable env "φ" [Double]
  => [String] -> Model env ts [Double]
topicWordPrior vocab
  = dirichlet (replicate (length vocab) 1) #φ

docTopicPrior :: Observable env "θ" [Double]
  => Int -> Model env ts [Double]
docTopicPrior n_topics = dirichlet (replicate n_topics 1) #θ

-- | Distribution over likely words
wordDist :: Observable env "w" String =>
  [String] -> [Double] -> Model env ts String
wordDist vocab ps =
  categorical (zip vocab ps) #w

-- | Distribution over the topics in a document, over the distribution of words in a topic
topicModel :: (Observables env '["φ", "θ"] [Double],
                 Observable env "w" String)
  => [String]
  -> Int
  -> Int
  -> Model env ts [String]
topicModel vocab n_topics n_words = do
  -- Generate distribution over words for each topic
  topic_word_ps <- replicateM n_topics $ topicWordPrior vocab
  -- Generate distribution over topics for a given document
  doc_topic_ps  <- docTopicPrior n_topics
  replicateM n_words (do  z <- discrete' doc_topic_ps
                          let word_ps = topic_word_ps !! z
                          wordDist vocab word_ps)

-- | Topic distribution over many topics
topicModels :: (Observables env '["φ", "θ"] [Double],
               Observable env "w" String)
  => [String]  -- Possible vocabulary in a document
  -> Int       -- Assumed number of topics in a document
  -> [Int]     -- Number of words in a document
  -> Model env ts [[String]]
topicModels vocab n_topics doc_words = do
  mapM (topicModel vocab n_topics) doc_words

-- | Simulating from topic model
-- | Possible vocabulary
vocab :: [String]
vocab = ["DNA", "evolution", "parsing", "phonology"]

simLDA :: Sampler [String]
simLDA = do
  let n_words = 100
      env_in = #θ := [[0.5, 0.5]] <:>
               #φ := [[0.12491280814569208,1.9941599739151505e-2,0.5385152817942926,0.3166303103208638],
                      [1.72605174564027e-2,2.9475900240868515e-2,9.906011619752661e-2,0.8542034661052021]] <:>
               #w := [] <:> nil
  (words, env_out) <- SIM.simulate (topicModel vocab 2) env_in n_words
  return words

-- | MH Inference from topic model
-- | Document of words to perform inference over
topic_data :: [String]
topic_data     = ["DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution", "parsing", "phonology", "DNA","evolution", "DNA", "parsing", "evolution","phonology", "evolution", "DNA","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution", "parsing", "phonology", "DNA","evolution", "DNA", "parsing", "evolution","phonology", "evolution", "DNA","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution", "parsing", "phonology", "DNA","evolution", "DNA", "parsing", "evolution","phonology", "evolution", "DNA","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution", "parsing", "phonology", "DNA","evolution", "DNA", "parsing", "evolution","phonology", "evolution", "DNA","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution","DNA","evolution", "parsing", "phonology", "DNA","evolution", "DNA", "parsing", "evolution","phonology", "evolution", "DNA"]

mhLDA :: Sampler ([[Double]], [[Double]])
mhLDA  = do
  -- Do MH inference over the topic model using the above data
  let n_words   = 100
      n_topics  = 2
      env_mh_in = #θ := [] <:>  #φ := [] <:> #w := topic_data <:> nil
  env_mh_outs <- MH.mh 500 (topicModel vocab n_topics) (n_words, env_mh_in) ["φ", "θ"]
  -- Draw the most recent sampled parameters from MH
  let env_pred   = head env_mh_outs
      θs         = get #θ env_pred
      φs         = get #φ env_pred
  return (θs, φs)