----------------------------------------------------------------------------- -- | -- Module : Lentil.Parse.Source -- Copyright : © 2015 Francesco Ariis -- License : GPLv3 (see the LICENSE file) -- -- Issues parsing from source. ----------------------------------------------------------------------------- module Lentil.Parse where import Lentil.Types import Text.Parsec import Control.Applicative hiding ( (<|>), optional, many ) import qualified Data.Char as C import qualified Data.List as L import qualified System.FilePath as SF import qualified System.Directory as D import qualified System.IO as I import qualified Data.Either as E -- TODO: eliminate lookahead? fai il parser davvero da bottom up senza sub -- parsing [lint] [refactor] ? ----------- -- TYPES -- ----------- type ParSource = Parsec String ParState data ParState = ParState { psPath :: FilePath } --------------- -- PRIMITIVE -- --------------- -- a unix file is defined as many lines (each one ended by '\n') -- this parser captures the definition and can be used as eof replacement too eoft :: ParSource () eoft = optional (char '\n') *> eof -- i.e. remove unneeded whitespace htmlify :: String -> String htmlify cs = unwords . words $ cs -- occhio che il primo carattere non è controllato da ED -- TODO: manyTill1 broken? occhio che al primon on chiama ed! [test] manyTill1 :: ParSource a -> ParSource b -> ParSource [a] manyTill1 p ed = (:) <$> p <*> manyTill p ed ---------- -- TAGS -- ---------- -- simple tags parsing -- tag only tag :: ParSource Tag tag = Tag <$> (openPar *> tagLabel <* closePar) "tag" -- anything goes, apart from ' ' tagLabel :: ParSource String tagLabel = many1 (satisfy sf) "tag label" where sf :: Char -> Bool sf c | c == closeDel = False | C.isSpace c = False | otherwise = True openPar, closePar :: ParSource Char openPar = char openDel "open-tag delimiter" closePar = char closeDel "close-tag delimiter" ------------ -- ISSUES -- ------------ -- simple issue parsing -- parses a number of issues from a given line-of-text issues :: ParSource [Issue] issues = many (try $ manyTill anyChar (lookAhead $ try incipit) *> issue) "issues" -- an issue is a flagword, followed by : , followed by some description and -- ended optionally by some tags (No t/f? End by whiteline or eof or -- another TODO). issue :: ParSource Issue issue = Issue <$> fmap psPath getState <*> fmap sourceLine getPosition <*> (incipit *> fmap htmlify freeText) <*> (optional spaces *> option [] (try tags)) "issue" -- flagwords + ": " incipit :: ParSource () incipit = () <$ (choice (map string flagWords) *> string ": ") "incipit" flagWords :: [String] flagWords = ["TODO", "FIXME", "NOTE", "XXX"] tags :: ParSource [Tag] tags = sepEndBy1 tag spaces "tags" -- any text. Since tags/fields at the end of the issue are optional, we need -- a way to delimit this. Delimiters are: eof, tags/fields, new issue freeText :: ParSource Description freeText = manyTill anyChar end "free text" where vp p = try . parsecMap (const ()) $ p end = lookAhead $ choice [vp (spaces *> tag), vp (spaces *> incipit), vp eoft] ------------ -- BLOCKS -- ------------ -- blocks parsing, check `block` comment for more info blocks :: ParSource [Issue] blocks = concat <$> many1 block -- a block is some text separated by emptyline(s) (a paragraph). Once you -- parse it you need to unwrap it and scan for comments. Whitespace -- is treated like in html. block :: ParSource [Issue] block = getPosition >>= \lep -> manyTill1 anyChar (try $ emptyline <|> eoft) >>= \rb -> getState >>= \ps -> case runParser (bi lep) ps "" rb of Right r -> return r Left l -> parserFail ("block subp: " ++ show l) "block" where bi p = setPosition p >> issues emptyline :: ParSource () emptyline = () <$ (endOfLine *> many (satisfy ws) *> endOfLine) "emptyline" where ws c = C.isSpace c && notElem c "\n\r\f\v" -- as per definition of isSpace in Data.Char -------------- -- COMMENTS -- -------------- -- TYPES -- -- StringPos is the comment itself + its indentation level. Conventionally -- multiline comments will have IL of 0 to sever them from single line -- comments type StringPos = (Int, Int, String) -- Col, Row, String -- FUNCTIONS -- -- Comment is the same, with just line indication, for comments that go -- together indentation wise. Check the utility function sps2cr type CommentRow = (Int, String) -- row, String sps2cr :: [[StringPos]] -> [CommentRow] sps2cr sps = map sp2cr sps where sp2r (_,r,_) = r sp2s (_,_,s) = s sp2cr [] = error "empty group in sps2cr" sp2cr zs = (sp2r . head $ zs, myUnlines . map sp2s $ zs) -- do not add unneeded \n at end myUnlines zs = L.intercalate "\n" zs -- comments: from a source file to something parsable by `blocks` -- indent level 0, to recognise them from single line multiLine :: String -> String -> ParSource StringPos multiLine st ed = getPosition >>= \a -> (,,) <$> pure 0 <*> pure (sourceLine a) <*> (string st *> manyTill anyChar (try $ string ed)) "multiline comment" -- returns single line comment and starting column of such comment singleLine :: String -> ParSource StringPos -- int: starting col singleLine st = getPosition >>= \a -> (string st *> manyTill anyChar (() <$ endOfLine <|> eoft)) >>= \s -> return (sourceColumn a, sourceLine a, s) "singleline comment" rawComment :: String -> String -> String -> ParSource StringPos rawComment st ed slt = try (multiLine st ed) <|> singleLine slt "raw comment" rawComments :: String -> String -> String -> ParSource [StringPos] rawComments st ed slt = E.rights <$> many1 ( try (Right <$> rawComment st ed slt) <|> try (Left <$> (() <$ codeString)) <|> (Left <$> other ) ) -- TODO: o many? [test] "raw comments" where initP = lookAhead (try (() <$ string st)) <|> -- long comment lookAhead (try (() <$ string slt)) <|> -- line comment lookAhead (() <$ char '"') <|> -- code string (() <$ char '\n') -- TODO: use eoft [bug] -- non comment stuff other = () <$ manyTill anyChar initP -- XXX -- quoted strings, by mauke^ codeString :: ParSource String codeString = q *> many ((char '\\' *> anyChar) <|> noneOf "\"\\") <* q "codestring" where q = char '"' -- single lineblock (which will be parsed) plus its original row position groupStringPos :: [StringPos] -> [CommentRow] groupStringPos sps = let grp = L.groupBy sameBlock sps' -- sps'': now that they are grouped, go back to sp tuple c2s qs = map (\(c,r,s,_) -> (c,r,s)) qs sps'' = map c2s grp -- this is a list of lists, hence -- the doublemap in sps2cr sps'' where -- how to verify if lines are sequential? -- subtract [1..] to line-number and you can use Eq on this! -- clever hack, the subtraction is the 4th value of the tuple sps' = map (\((c,r,s),k) -> (c,r,s,r-k)) (zip sps [1..]) -- equality function: -- multiline blocks are always to be treated alone -- single line blocks are to be grouped only if on same col and -- sequential lines sameBlock (0,_,_,_ ) _ = False sameBlock (c,_,_,il) (c',_,_,il') = c == c' && il == il' ------------ -- SOURCE -- ------------ source :: String -> String -> String -> ParSource [Issue] source st ed slt = fmap groupStringPos (rawComments st ed slt) >>= \cc -> fmap psPath getState >>= \fp -> fmap concat (mapM (parseCr fp) cc) where parseCr :: FilePath -> CommentRow -> ParSource [Issue] parseCr fp (r,s) = case runParSource (bpp r) fp s of Right y -> pure y Left _ -> parserFail "" -- TODO: more meaningful err mes. [duct] bpp r = getPosition >>= \p -> setPosition (setSourceLine p r) >> blocks -- TODO: elimina runParser per runParSource [duct] ---------- -- FILE -- ---------- -- main function with wich to parse issues; errors on stderr issueFinder :: [FilePath] -> IO [Issue] issueFinder fps = concat <$> mapM (uncurry fileParser) (matchPar fps) where -- matches each file w/ its parser, filters parserless files matchPar :: [FilePath] -> [(FilePath, ParSource [Issue])] matchPar fpz = filtNotPar . map (\f -> (f, langParser f)) $ fpz -- I can't use == on ParSource filtNotPar :: [(FilePath, Maybe (ParSource [Issue]))] -> [(FilePath, ParSource [Issue])] filtNotPar [] = [] filtNotPar ((f, Just p):tps) = (f,p) : filtNotPar tps filtNotPar ((_, Nothing):tps) = filtNotPar tps fileParser :: FilePath -> ParSource [Issue] -> IO [Issue] fileParser fp p = rfe fp >>= \cs -> case runParSource p fp cs of Left l -> perr (fp ++ " : parse error " ++ show l) >> return [] Right r -> return r where rfe :: FilePath -> IO String rfe _ = D.doesFileExist fp >>= \fb -> if fb then readFile fp else perr (fp ++ " : no such file") >> return "" perr cs = I.hPutStrLn I.stderr cs runParSource :: ParSource a -> FilePath -> String -> Either ParseError a runParSource p fp cs = runParser p (ParState fp) fp cs -- TODO: add more langparsers [feature:intermediate] langParser :: FilePath -> Maybe (ParSource [Issue]) langParser fp | ext `elem` [".hs", ".lhs"] = Just haskell | ext `elem` [".c", ".h"] = Just c | ext `elem` [".txt"] = Just blocks | otherwise = Nothing where ext = SF.takeExtension fp haskell = source "{-" "-}" "--" c = source "/*" "*/" "//" -- Shell: # (Perl: same) -- php: as C + Shell -- ruby: shell and =begin =end -- javascript: C + html -- python: # and """ -- java, objective-c, etc. watch tiobe -- html -- TODO: do not add more languages but parse every text -- file? [design] [u:3]