{-# LANGUAGE OverloadedStrings #-}

{-|
Module      : Text.Pronounce.ParseDict
Description : Module for parsing the CMU Dictionary
Copyright   : (c) Noah Goodman, 2018
License     : BSD3
Stability   : experimental

This module has functions for parsing the CMU pronouncing dictionary, and exports the 
@CMUdict@ type and the function @initDict@ to the main module "Text.Pronounce"
-}


module Text.Pronounce.ParseDict
    ( EntryWord
    , Phones
    , CMUdict
    , initDict
    , stdDict
    , parseDict
    , parseLine
    ) where

import Paths_pronounce
import System.FilePath
import Text.ParserCombinators.ReadP
import Data.Char
import Data.Text.Encoding
import Data.Binary (decodeFile)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Map as Map

-- | Represents an entry word in the cmu pronouncing dictionary (simply an alias
-- for @Text@ to improve type specificity and readability
type EntryWord = T.Text

-- | Represents a string containing the phonetic breakdown of a word, in a
-- similar manner to the @EntryWord@ type
type Phones = T.Text

-- | A Map from @EntryWord@s to lists of possible pronunciations (@Phones@), serving as our
-- representation of the CMU Pronouncing Dictionary
type CMUdict = Map.Map EntryWord [Phones]

-- | Initializes the cmu pronunctiation dictionary into our program, given an
-- optional file name of the dictionary
initDict :: Maybe FilePath -> Bool -> IO CMUdict
initDict path usesBin = case usesBin of
    True ->
        case path of
          Just p ->
              return . Map.mapKeys decodeUtf8 . fmap (map decodeUtf8) =<< decodeFile p
          Nothing ->
              return . Map.mapKeys decodeUtf8 . fmap (map decodeUtf8) =<< decodeFile =<< getDataFileName "cmubin"
    False ->
        case path of
          Just p ->
              return . parseDict =<< T.readFile p
          Nothing ->
              return . parseDict =<< T.readFile =<< getDataFileName "cmuutf"

-- | Default settings for @initDict@
stdDict :: IO CMUdict
stdDict = initDict Nothing True

-- | Go through all the entries in the dictionary, parsing, and inserting into
-- the map data structure
parseDict :: T.Text -> CMUdict
parseDict = Map.fromListWith (++) . map packAndParse . filter ((/= ';') . T.head) . T.lines
    where packAndParse = (\(a,b) -> (T.pack a, [T.pack b])) . fst . head . readP_to_S parseLine . T.unpack

-- | Parses a line in the dictionary, returning as @(key,val)@ pair, ignoring
-- parenthetical part if it exists
parseLine :: ReadP (String, String)
parseLine = (,) <$> (many get) <* (paren <++ string "") <* string "  "
                <*> (munch . const $ True)

-- Helper function to parse numbers in between parentheses
paren :: ReadP String
paren = char '(' *> munch isDigit <* char ')'