module Data.Chinese.CCDict
( initiate
, version
, Entry(..)
, ppEntry
, entryVariants
, entryOriginal
, entrySimplified
, entryTraditional
, entryWordFrequency
, entryPinyin
, Variant(..)
, lookupMatch
, lookupMatches
) where
import Data.Char
import Data.Maybe
import Data.Ord
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Paths_cndict
import Prelude hiding (lookup)
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.Text.Internal as T
import qualified Data.Text.Array as T
import qualified Data.Array.Unboxed as U
import qualified Data.Text.Read as T
import Control.Exception (evaluate)
initiate :: IO ()
initiate = do
evaluate ccDict
return ()
data CCDict = CCDict !T.Array !Int (U.UArray Int Int)
mkCCDict :: Text -> CCDict
mkCCDict text@(T.Text arr _ len) =
CCDict arr len
(U.listArray (0,n1) offsets)
where
ls = T.lines text
offsets =
[ offset
| T.Text _ offset _length <- ls ]
n = length ls
ccDict :: CCDict
ccDict = mkCCDict utfData
where
utfData = unsafePerformIO $ do
path <- getDataFileName "data/dict.sorted"
T.readFile path
ccDictNth :: Int -> CCDict -> Text
ccDictNth n (CCDict arr totalLen offsets) =
T.text arr offset len
where
lastIdx = snd (U.bounds offsets)
offset = offsets U.! n
len
| lastIdx == n = totalLen offset 1
| otherwise = offsets U.! (n+1) offset 1
bounds :: CCDict -> (Int, Int)
bounds (CCDict _ _ offsets) = U.bounds offsets
findPrefix :: CCDict -> Int -> Int -> Int -> Text -> Maybe (Int,Int)
findPrefix dict maxUpper lower upper key
| lower > upper = Nothing
| otherwise =
case compare (T.take len key) (T.take len val) of
LT -> findPrefix dict (middle1) lower (middle1) key
GT -> findPrefix dict maxUpper (middle+1) upper key
EQ ->
case compare (T.length key) (T.length val) of
GT -> findPrefix dict maxUpper (middle+1) upper key
_ -> Just $ fromMaybe (middle, maxUpper) $
findPrefix dict maxUpper lower (middle1) key
where
middle = (upper lower) `div` 2 + lower
val = T.takeWhile (/='\t') $ ccDictNth middle dict
len = min (T.length val) (T.length key)
lookupMatches :: Text -> Maybe [Entry]
lookupMatches key
| T.null key = Nothing
lookupMatches key =
if null entries
then Nothing
else Just entries
where
keys = tail $ T.inits key
entries = worker (bounds ccDict) keys
worker _ [] = []
worker (lower, upper) (k:ks) =
case findPrefix ccDict upper lower upper k of
Nothing -> []
Just (first, newUpper) ->
maybe id (:) (scrapeEntry ccDict first k) $
worker (first, newUpper) ks
lookupMatch :: Text -> Maybe Entry
lookupMatch key
| T.null key = Nothing
| otherwise =
case findPrefix ccDict upper lower upper key of
Nothing -> Nothing
Just (first, _newUpper) ->
scrapeEntry ccDict first key
where
(lower, upper) = bounds ccDict
allVariants :: [Variant]
allVariants = worker 0
where
worker nth | nth > snd (bounds ccDict) = []
worker nth = parseVariant (ccDictNth nth ccDict) : worker (nth+1)
scrapeEntry :: CCDict -> Int -> Text -> Maybe Entry
scrapeEntry dict nth key =
case variants of
[] -> Nothing
(v:vs) -> Just (Entry key v vs)
where
variants = scrapeVariants dict nth key
scrapeVariants :: CCDict -> Int -> Text -> [Variant]
scrapeVariants dict nth key
| nth > snd (bounds dict) = []
| T.takeWhile (/='\t') raw == key =
parseVariant raw : scrapeVariants dict (nth+1) key
| otherwise = []
where
raw = ccDictNth nth dict
parseVariant :: Text -> Variant
parseVariant line =
case T.splitOn "\t" line of
[chinese, count, pinyin, english] ->
mkVariant chinese chinese count pinyin english
[traditional, simplified, "T", count, pinyin, english] ->
mkVariant traditional simplified count pinyin english
[simplified, traditional, "S", count, pinyin, english] ->
mkVariant traditional simplified count pinyin english
_ -> error $ "invalid variant: " ++ T.unpack line
where
mkVariant traditional simplified countStr pinyin english = Variant
{ variantTraditional = traditional
, variantSimplified = simplified
, variantWordFrequency = count
, variantPinyin = pinyin
, variantDefinitions = splitDefinition english }
where
Right (count,_) = T.decimal countStr
data Entry = Entry !Text Variant [Variant]
deriving (Show, Read, Eq, Ord)
data Variant = Variant
{ variantSimplified :: !Text
, variantTraditional :: !Text
, variantWordFrequency :: !Int
, variantPinyin :: !Text
, variantDefinitions :: [Text]
} deriving ( Read, Show, Eq, Ord )
entryOriginal :: Entry -> Text
entryOriginal (Entry o _v _vs) = o
entryVariants :: Entry -> [Variant]
entryVariants (Entry _o v vs) = v:vs
dominantVariant :: Entry -> Variant
dominantVariant (Entry _o v vs) =
foldr dom v vs
where
dom v1 v2
| variantWordFrequency v1 < variantWordFrequency v2 =
v2
| otherwise =
v1
entrySimplified :: Entry -> Text
entrySimplified = variantSimplified . dominantVariant
entryTraditional :: Entry -> Text
entryTraditional = variantTraditional . dominantVariant
entryWordFrequency :: Entry -> Int
entryWordFrequency = variantWordFrequency . dominantVariant
entryPinyin :: Entry -> [Text]
entryPinyin = map variantPinyin . entryVariants
ppEntry :: Entry -> Text
ppEntry = T.intercalate "\n" . map ppVariant . entryVariants
ppVariant :: Variant -> Text
ppVariant (Variant simplified traditional frequency pinyin english) =
T.intercalate "\t"
[simplified, traditional, count, pinyin, english']
where
count = T.pack $ show frequency
english' = T.intercalate "/" english
splitDefinition :: Text -> [Text]
splitDefinition = filter (not . T.null) . T.splitOn "/" . T.dropAround isSpace