{-# LANGUAGE OverloadedStrings #-}
module Network.HTTP.Robots where

import qualified Data.ByteString.Char8 as BS
import           Data.ByteString.Char8(ByteString)
import Data.Attoparsec.Char8 hiding (skipSpace)
import Control.Applicative
import Data.List(find)
import Data.Maybe(catMaybes)

type Robot = [([UserAgent], [Directive])]

data UserAgent = Wildcard | Literal ByteString
  deriving (Show,Eq)
type Path = ByteString

data Directive = Allow Path
               | Disallow Path
               | CrawlDelay Int
  deriving (Show,Eq)

-- | parseRobots is the main entry point for parsing a robots.txt file.
parseRobots :: ByteString -> Either String Robot
parseRobots = parseOnly robotP
              . BS.unlines
              . filter ( (\x -> BS.null x || BS.head x /= '#' ) . BS.dropWhile (==' '))
              . BS.lines

robotP :: Parser Robot
robotP = many ((,) <$> many1 agentP <*> many1 directiveP) <?> "robot"

skipSpace :: Parser ()
skipSpace = skipWhile (\x -> x==' ' || x == '\t')

directiveP :: Parser Directive
directiveP = choice [ Allow <$>      (string "Allow:"       >> skipSpace >> tokenP)
                    , (string "Disallow:"    >> skipSpace >>
                       (choice [Disallow <$> tokenP,
                                -- this requires some explanation.
                                -- The RFC suggests that an empty
                                -- Disallow line means anything is
                                -- allowed. Being semantically
                                -- equivalent to 'Allow: "/"',
                                -- I have chosen to change it here
                                -- rather than carry the bogus
                                -- distinction around.
                                endOfLine >> return (Allow "/") ] ))
                    , CrawlDelay <$> (string "Crawl-delay:" >>  skipSpace >>decimal)
                    ] <* commentsP <?> "directive"

agentP :: Parser UserAgent
agentP = do
  string "User-agent:"
  skipSpace
  ((string "*" >> return Wildcard) <|>
   (Literal  <$> tokenP)) <* skipSpace <* endOfLine <?> "agent"


commentsP :: Parser ()
commentsP = skipSpace >>
            ((string "#" >> takeTill (=='\n') >> skipSpace >> endOfLine) <|> return ())

tokenP :: Parser ByteString
tokenP = skipSpace >> takeWhile1 (not . isSpace) <* skipSpace

-- I lack the art to make this prettier.
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess _ _ "/robots.txt" = True -- special-cased
canAccess agent robot path = case stanzas of
  [] -> True
  ((_,directives):_) -> matchingDirective directives
  where stanzas = catMaybes [find ((Literal agent `elem`) . fst) robot,
                             find ((Wildcard `elem`) . fst) robot]
        matchingDirective [] = True
        matchingDirective (x:xs) = case x of
          Allow robot_path -> if robot_path `BS.isPrefixOf` path
                              then True
                              else matchingDirective xs
          Disallow robot_path ->
            if robot_path `BS.isPrefixOf` path
            then False
            else matchingDirective xs
          _ -> matchingDirective xs