module ExtractDescriptionTerms ( extractSynopsisTerms, extractDescriptionTerms ) where import Data.Text (Text) import qualified Data.Text as T import Data.Set (Set) import qualified Data.Set as Set import Data.Char import qualified NLP.Tokenize as NLP import qualified NLP.Snowball as NLP import Control.Monad ((>=>)) import Data.Maybe import HaddockTypes as Haddock import HaddockHtml as Haddock (markup) import qualified HaddockParse as Haddock (parseHaddockParagraphs) import qualified HaddockLex as Haddock (tokenise) extractSynopsisTerms :: Set Text -> String -> [Text] extractSynopsisTerms stopWords = NLP.stems NLP.English . filter (`Set.notMember` stopWords) . map (T.toCaseFold . T.pack) . concatMap splitTok . filter (not . ignoreTok) . NLP.tokenize ignoreTok :: String -> Bool ignoreTok = all isPunctuation splitTok :: String -> [String] splitTok tok = case go tok of toks@(_:_:_) -> tok:toks toks -> toks where go remaining = case break (\c -> c == ')' || c == '-' || c == '/') remaining of ([], _:trailing) -> go trailing (leading, _:trailing) -> leading : go trailing ([], []) -> [] (leading, []) -> leading : [] extractDescriptionTerms :: Set Text -> String -> [Text] extractDescriptionTerms stopWords = NLP.stems NLP.English . filter (`Set.notMember` stopWords) . map (T.toCaseFold . T.pack) . maybe [] --TODO: something here ( filter (not . ignoreTok) . NLP.tokenize . concat . Haddock.markup termsMarkup) . (Haddock.tokenise >=> Haddock.parseHaddockParagraphs) termsMarkup :: DocMarkup String [String] termsMarkup = Markup { markupEmpty = [], markupString = \s -> [s], markupParagraph = id, markupAppend = (++), markupIdentifier = \s -> [s], markupModule = const [], -- i.e. filter these out markupEmphasis = id, markupMonospaced = \s -> if length s > 1 then [] else s, markupUnorderedList = concat, markupOrderedList = concat, markupDefList = concatMap (\(d,t) -> d ++ t), markupCodeBlock = const [], markupHyperlink = \(Hyperlink _url mLabel) -> maybeToList mLabel, --TODO: extract main part of hostname markupPic = const [], markupAName = const [] } {- ------------------- -- Main experiment -- main = do pkgsFile <- readFile "pkgs" let mostFreq :: [String] pkgs :: [PackageDescription] (mostFreq, pkgs) = read pkgsFile stopWordsFile <- T.readFile "stopwords.txt" -- wordsFile <- T.readFile "/usr/share/dict/words" -- let ws = Set.fromList (map T.toLower $ T.lines wordsFile) print "reading file" evaluate (length mostFreq + length pkgs) print "done" let stopWords = Set.fromList $ T.lines stopWordsFile print stopWords sequence_ [ putStrLn $ display (packageName pkg) ++ ": " ++ --intercalate ", " (description pkg) ++ "\n" ++ intercalate ", " (map T.unpack $ extractDescriptionTerms stopWords (description pkg)) ++ "\n" | pkg <- pkgs , let pkgname = display (packageName pkg) ] -}