{-# LANGUAGE OverloadedStrings #-}

module Text.HTML.DirectoryListing.Parser
    ( parseDirectoryListing
    ) where

import Text.HTML.DirectoryListing.Type
import Text.HTML.TagSoup
import Data.Time.LocalTime
import Data.Time.Format
import Data.List
import Data.Maybe
import qualified Data.Text as T

-- | Parse the HTML of a directory listing page into a list of 'Entry'.
--
-- The input is handled one line at a time: each line is tokenised with
-- 'parseTags', layout noise is removed (see @isNoise@ below) and the
-- remaining tags are matched against the two shapes understood by
-- @toEntry@. Lines that match neither shape are skipped, and so are lines
-- whose timestamp is rejected by 'parseLastModified'. An unparseable file
-- size does not drop the line; it is whatever 'parseFileSize' returns.
parseDirectoryListing :: T.Text -> [Entry]
parseDirectoryListing = mapMaybe lineToEntry . T.lines
  where
    lineToEntry :: T.Text -> Maybe Entry
    lineToEntry = toEntry . filter (not . isNoise) . parseTags

    toEntry :: [Tag T.Text] -> Maybe Entry
    -- Shape 1: one text node after the link holding exactly three words,
    -- "<date> <time> <size>". The first two are re-joined with a single
    -- space before being parsed as the timestamp.
    toEntry [TagOpen "a" [("href", ref)], TagText name, TagClose "a", TagText rest] =
        case T.words rest of
            [day, clock, size] -> mkEntry ref name (T.unwords [day, clock]) size
            _                  -> Nothing
    -- Shape 2: timestamp and size arrive as two separate text nodes.
    toEntry [TagOpen "a" [("href", ref)], TagText name, TagClose "a", TagText dateTime, TagText size] =
        mkEntry ref name dateTime size
    toEntry _ = Nothing

    -- Build an entry, or Nothing when the timestamp cannot be parsed.
    mkEntry :: T.Text -> T.Text -> T.Text -> T.Text -> Maybe Entry
    mkEntry ref name dateTime size =
        build <$> parseLastModified dateTime
      where
        build modified = Entry
            { visibleName  = name
            , href         = ref
            , lastModified = modified
            , fileSize     = parseFileSize size
            }

    -- Apache's directory listings contain a lot of noise; filter it out.
    -- Table row/cell tags, images and whitespace-only text are dropped.
    isNoise :: Tag T.Text -> Bool
    isNoise (TagOpen "tr" _)  = True
    isNoise (TagClose "tr")   = True
    isNoise (TagOpen "td" _)  = True
    isNoise (TagClose "td")   = True
    isNoise (TagOpen "img" _) = True
    -- 'T.strip' already removes ordinary spaces and the non-breaking space
    -- U+00A0, so an empty result covers every whitespace-only text node.
    isNoise (TagText t)       = T.null (T.strip t)
    isNoise _                 = False

-- | Parse a timestamp in the format @%d-%b-%Y %R@, ignoring surrounding
-- whitespace. Example inputs:
--
-- > 24-Apr-2014 11:55
-- > 04-Jan-2014 13:18
--
-- Returns 'Nothing' when the text does not match the format, rather than
-- raising an error.
parseLastModified :: T.Text -> Maybe LocalTime
parseLastModified = parseTimeM True locale format . T.unpack
  where
    format = "%d-%b-%Y %R"
    -- Default locale with both month-name slots set to the three-letter
    -- English abbreviations.
    locale = defaultTimeLocale { months = [ (m, m) | m <- monthAbbrs ] }
    monthAbbrs =
        [ "Jan", "Feb", "Mar", "Apr"
        , "May", "Jun", "Jul", "Aug"
        , "Sep", "Oct", "Nov", "Dec"
        ]
-- | Parse a human-readable file size into a number of bytes.
--
-- Surrounding whitespace is ignored. The accepted form is a decimal number
-- optionally followed by exactly one unit letter:
--
-- > 665
-- > 123B
-- > 249K
-- > 3.8M
-- > 5.6G
-- > 1.2T
--
-- Units are powers of 1024 and the result is rounded to a whole number of
-- bytes with 'round'. 'Nothing' is returned when the text is not a number,
-- when the suffix is not one of the units above, when the value is negative,
-- or when it is not finite (for example the literals @Infinity@ and @NaN@,
-- which 'reads' accepts for 'Double', or an overflow after scaling).
parseFileSize :: T.Text -> Maybe Integer
parseFileSize t =
    case reads (T.unpack (T.strip t)) :: [(Double, String)] of
        [(amount, unit)] -> lookup unit multipliers >>= toBytes amount
        _                -> Nothing
  where
    -- Bytes per unit; the empty suffix and "B" both mean plain bytes.
    multipliers :: [(String, Double)]
    multipliers =
        [ ("" , 1)
        , ("B", 1)
        , ("K", 1024)
        , ("M", 1024 * 1024)
        , ("G", 1024 * 1024 * 1024)
        , ("T", 1024 * 1024 * 1024 * 1024)
        ]

    -- Scale and round, refusing values that cannot be a real file size.
    -- Without this check, rounding Infinity or NaN to an Integer silently
    -- produces an enormous meaningless number.
    toBytes :: Double -> Double -> Maybe Integer
    toBytes amount factor
        | isNaN bytes || isInfinite bytes = Nothing
        | bytes < 0                       = Nothing
        | otherwise                       = Just (round bytes)
      where
        bytes = amount * factor