{-# LANGUAGE FlexibleContexts #-} module Data.Text.WordCount ( topN , displayWords , filterTop -- * Filters , small , common -- * For making graphs , makeFile , makeDistribution -- * File processing with pandoc , processFile , globFile -- * Low level-ish , buildFreq ) where import Control.Lens hiding (argument) import Data.Char import Data.List import qualified Data.Map.Lazy as M import Data.Map.Lens import Data.Monoid import Data.Ord import qualified Data.Text as TL import Data.Text.WordCount.FileRead import Data.Tuple import Graphics.Rendering.Chart.Backend.Diagrams import Graphics.Rendering.Chart.Easy hiding (argument) -- | Return top n words and their frequencies -- -- @ -- >>> topN 2 "hello hello goodbye it is time is it why why why it it" -- [(4,"it"),(3,"why")] -- @ topN :: Int -> TL.Text -> [(Int,TL.Text)] topN n = take n . order . buildFreq -- | Return the top n words, with some filter applied. filterTop :: Int -> (TL.Text -> Bool) -> TL.Text -> [(Int, TL.Text)] filterTop n p = take n . filter (p . snd) . order . buildFreq small :: TL.Text -> Bool small = (> 5) . TL.length . TL.filter (/= '\'') common :: TL.Text -> Bool common = flip elem ["the","and","a","an","or","not","but","on","so","if","in","that","this","for"] displayWords :: [(Int,TL.Text)] -> TL.Text displayWords [] = "" displayWords (pair:pairs) = display pair <> "\n" <> displayWords pairs where display (n,str) = (TL.pack . show) n <> ": " <> str buildFreq :: TL.Text -> M.Map TL.Text Int buildFreq = count . TL.words . TL.map toLower order :: M.Map TL.Text Int -> [(Int, TL.Text)] order = sortBy (flip (comparing fst)) . fmap swap . M.toList count :: [TL.Text] -> M.Map TL.Text Int count words = foldr ((.) . wordFunction) id words M.empty where wordFunction word map = case map ^. at word of Nothing -> at word ?~ 1 $ map _ -> ix word %~ (+1) $ map -- | Make a bar graph from the word frequencies -- -- @ -- makeFile :: IO () -- makeFile [(4,"it"),(3,"why")] "out.html" -- @ makeFile :: [(Int,TL.Text)] -> FilePath -> IO () makeFile points out = toFile def out (makeDistribution points) makeDistribution points = do let values = addIndexes (fmap fst points) let alabels = fmap (TL.unpack . snd) points let fillStyle = solidFillStyle (opaque lightblue) layout_title .= "Word Frequencies" layout_x_axis . laxis_generate .= autoIndexAxis alabels layout_y_axis . laxis_override .= axisGridHide layout_left_axis_visibility . axis_show_ticks .= False plot $ fmap plotBars $ liftEC $ do plot_bars_values .= fmap (over _2 pure) values plot_bars_item_styles .= pure (fillStyle, Nothing)