{-# LANGUAGE OverloadedStrings #-}
module Network.HTTP.Robots where

import           Control.Applicative
import           Control.Monad
import           Data.Attoparsec.ByteString.Char8 hiding (skipSpace)
import qualified Data.Attoparsec.Text             as AT (isEndOfLine)
import           Data.ByteString.Char8            (ByteString)
import qualified Data.ByteString.Char8            as BS
import           Data.Char                        (toUpper)
import           Data.Either                      (partitionEithers)
import           Data.List                        (find)
import           Data.Maybe                       (catMaybes)
import           Data.Ratio
import           Data.Time.Clock
import           Data.Time.LocalTime              ()

type Robot = ([([UserAgent], [Directive])], [Unparsable])

type Unparsable = ByteString

data UserAgent = Wildcard | Literal ByteString
  deriving (Int -> UserAgent -> ShowS
[UserAgent] -> ShowS
UserAgent -> [Char]
(Int -> UserAgent -> ShowS)
-> (UserAgent -> [Char])
-> ([UserAgent] -> ShowS)
-> Show UserAgent
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> UserAgent -> ShowS
showsPrec :: Int -> UserAgent -> ShowS
$cshow :: UserAgent -> [Char]
show :: UserAgent -> [Char]
$cshowList :: [UserAgent] -> ShowS
showList :: [UserAgent] -> ShowS
Show,UserAgent -> UserAgent -> Bool
(UserAgent -> UserAgent -> Bool)
-> (UserAgent -> UserAgent -> Bool) -> Eq UserAgent
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: UserAgent -> UserAgent -> Bool
== :: UserAgent -> UserAgent -> Bool
$c/= :: UserAgent -> UserAgent -> Bool
/= :: UserAgent -> UserAgent -> Bool
Eq)
type Path = ByteString

-- http://www.conman.org/people/spc/robots2.html
-- This was never actually accepted as a standard,
-- but some sites do use it.
type TimeInterval = (DiffTime, DiffTime)

-- Crawldelay may have a decimal point
-- http://help.yandex.com/webmaster/controlling-robot/robots-txt.xml
-- Added directives NoArchive, NoSnippet, NoTranslate, SiteMap.
-- http://bloganddiscussion.com/anythingcomputer/1/robots-txt-noarchive-nocache-nosnippet/
data Directive = Allow Path
               | Disallow Path
               | CrawlDelay { Directive -> Rational
crawlDelay   :: Rational
                            , Directive -> TimeInterval
timeInterval :: TimeInterval
                            }
               | NoArchive Path
               | NoSnippet Path
               | NoTranslate Path
               -- not used by Google, Yahoo or Live Search/Bing
               -- http://searchengineland.com/a-deeper-look-at-robotstxt-17573
               | NoIndex Path
  deriving (Int -> Directive -> ShowS
[Directive] -> ShowS
Directive -> [Char]
(Int -> Directive -> ShowS)
-> (Directive -> [Char])
-> ([Directive] -> ShowS)
-> Show Directive
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Directive -> ShowS
showsPrec :: Int -> Directive -> ShowS
$cshow :: Directive -> [Char]
show :: Directive -> [Char]
$cshowList :: [Directive] -> ShowS
showList :: [Directive] -> ShowS
Show,Directive -> Directive -> Bool
(Directive -> Directive -> Bool)
-> (Directive -> Directive -> Bool) -> Eq Directive
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Directive -> Directive -> Bool
== :: Directive -> Directive -> Bool
$c/= :: Directive -> Directive -> Bool
/= :: Directive -> Directive -> Bool
Eq)

-- For use in the attoparsec monad, allows to reparse a sub expression
subParser :: Parser a -> ByteString -> Parser a
subParser :: forall a. Parser a -> Path -> Parser a
subParser Parser a
p = ([Char] -> Parser a)
-> (a -> Parser a) -> Either [Char] a -> Parser a
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Parser a -> [Char] -> Parser a
forall a b. a -> b -> a
const Parser a
forall a. Parser Path a
forall (m :: * -> *) a. MonadPlus m => m a
mzero) a -> Parser a
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either [Char] a -> Parser a)
-> (Path -> Either [Char] a) -> Path -> Parser a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Parser a -> Path -> Either [Char] a
forall a. Parser a -> Path -> Either [Char] a
parseOnly Parser a
p


-- Seems the rational parser is unsecure in the presence of an exponent
-- but since there is no alternative to parse a rational, we just to refuse
-- to parse numbers with 'e' / exponent
-- https://hackage.haskell.org/package/attoparsec-0.12.1.0/docs/Data-Attoparsec-ByteString-Char8.html#v:rational
safeParseRational :: Parser Rational
safeParseRational :: Parser Rational
safeParseRational = do
  (Path
bs,Scientific
_) <- Parser Scientific -> Parser (Path, Scientific)
forall a. Parser a -> Parser (Path, a)
match Parser Scientific
scientific
  if Char -> Path -> Bool
BS.elem Char
'e' Path
bs Bool -> Bool -> Bool
|| Char -> Path -> Bool
BS.elem Char
'E' Path
bs
    then Parser Rational
forall a. Parser Path a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
    else Parser Rational -> Path -> Parser Rational
forall a. Parser a -> Path -> Parser a
subParser Parser Rational
forall a. Fractional a => Parser a
rational Path
bs

-- Yeah, robots.txt should be ASCII, but some sites
-- include the UTF-8 marker at start.
-- We just drop it, but handle the file as ASCII.
dropUTF8BOM :: ByteString -> ByteString
dropUTF8BOM :: Path -> Path
dropUTF8BOM Path
bs = if Int -> Path -> Path
BS.take Int
3 Path
bs Path -> Path -> Bool
forall a. Eq a => a -> a -> Bool
== ( Char
'\239' Char -> Path -> Path
`BS.cons`
                                      Char
'\187' Char -> Path -> Path
`BS.cons`
                                      Char
'\191' Char -> Path -> Path
`BS.cons` Path
BS.empty)
                   then Int -> Path -> Path
BS.drop Int
3 Path
bs
                   else Path
bs

parseHourMinute :: Parser (Integer,Integer)
parseHourMinute :: Parser (Integer, Integer)
parseHourMinute = Parser (Integer, Integer)
parseWithColon Parser (Integer, Integer)
-> Parser (Integer, Integer) -> Parser (Integer, Integer)
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser (Integer, Integer)
parseWithoutColon
  where
    parseWithColon :: Parser (Integer, Integer)
parseWithColon = do
      Integer
hours <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
      Parser Path Char -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void         (Parser Path Char -> Parser ()) -> Parser Path Char -> Parser ()
forall a b. (a -> b) -> a -> b
$ Parser ()
skipSpace Parser () -> Parser Path Char -> Parser Path Char
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Path Char
char Char
':'
      Integer
mins  <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
      (Integer, Integer) -> Parser (Integer, Integer)
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
hours,Integer
mins)
    parseWithoutColon :: Parser (Integer, Integer)
parseWithoutColon = do
      Integer
h <- Int -> Parser Path
Data.Attoparsec.ByteString.Char8.take Int
2 Parser Path -> (Path -> Parser Path Integer) -> Parser Path Integer
forall a b. Parser Path a -> (a -> Parser Path b) -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Parser Path Integer -> Path -> Parser Path Integer
forall a. Parser a -> Path -> Parser a
subParser Parser Path Integer
forall a. Integral a => Parser a
decimal
      Integer
m <- Int -> Parser Path
Data.Attoparsec.ByteString.Char8.take Int
2 Parser Path -> (Path -> Parser Path Integer) -> Parser Path Integer
forall a b. Parser Path a -> (a -> Parser Path b) -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Parser Path Integer -> Path -> Parser Path Integer
forall a. Parser a -> Path -> Parser a
subParser Parser Path Integer
forall a. Integral a => Parser a
decimal
      (Integer, Integer) -> Parser (Integer, Integer)
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
h,Integer
m)

parseTimeInterval :: Parser TimeInterval
parseTimeInterval :: Parser TimeInterval
parseTimeInterval = do
  (Integer
hours_start, Integer
mins_start) <- Parser (Integer, Integer)
parseHourMinute
  Parser () -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void         (Parser () -> Parser ()) -> Parser () -> Parser ()
forall a b. (a -> b) -> a -> b
$ (Parser ()
skipSpace Parser () -> Parser Path Char -> Parser Path Char
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Path Char
char Char
'-' Parser Path Char -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace) Parser () -> Parser () -> Parser ()
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
skipSpace
  (Integer
hours_end  , Integer
mins_end  ) <- Parser (Integer, Integer)
parseHourMinute
  TimeInterval -> Parser TimeInterval
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return ( Integer -> DiffTime
secondsToDiffTime (Integer
hours_start Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
+ Integer
mins_start Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60)
         , Integer -> DiffTime
secondsToDiffTime (Integer
hours_end   Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
+ Integer
mins_end   Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60))

allDay :: TimeInterval
allDay :: TimeInterval
allDay =  ( Integer -> DiffTime
secondsToDiffTime Integer
0
          , Integer -> DiffTime
secondsToDiffTime (Integer
24Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
*Integer
60Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
*Integer
60) -- because of leap seconds
          )

parseRequestRate :: Parser Directive
parseRequestRate :: Parser Directive
parseRequestRate = do
  Parser Path -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path -> Parser ()) -> Parser Path -> Parser ()
forall a b. (a -> b) -> a -> b
$ Path -> Parser Path
stringCI Path
"Request-rate:"
  Integer
docs <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
  Parser Path Char -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path Char -> Parser ()) -> Parser Path Char -> Parser ()
forall a b. (a -> b) -> a -> b
$  Parser ()
skipSpace Parser () -> Parser Path Char -> Parser Path Char
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Path Char
char Char
'/'
  Integer
ptim <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
  Integer
units<- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>   (  (Char -> Parser Path Char
char Char
's' Parser Path Char -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (    Integer
1 :: Integer))
                        Parser Path Integer -> Parser Path Integer -> Parser Path Integer
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Path Char
char Char
'm' Parser Path Char -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (   Integer
60 :: Integer))
                        Parser Path Integer -> Parser Path Integer -> Parser Path Integer
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Path Char
char Char
'h' Parser Path Char -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
60Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
*Integer
60 :: Integer))
                        Parser Path Integer -> Parser Path Integer -> Parser Path Integer
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>              Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (    Integer
1 :: Integer)
                         )
  TimeInterval
tint <- Parser ()
skipSpace Parser () -> Parser TimeInterval -> Parser TimeInterval
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ( Parser TimeInterval
parseTimeInterval Parser TimeInterval -> Parser TimeInterval -> Parser TimeInterval
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> TimeInterval -> Parser TimeInterval
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return TimeInterval
allDay)
  Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay ((Integer
ptim Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
units) Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% Integer
docs) TimeInterval
tint

parseVisitTime :: Parser Directive
parseVisitTime :: Parser Directive
parseVisitTime = do
  Parser Path -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path -> Parser ()) -> Parser Path -> Parser ()
forall a b. (a -> b) -> a -> b
$ Path -> Parser Path
stringCI Path
"Visit-time:"
  TimeInterval
tint <- Parser ()
skipSpace Parser () -> Parser TimeInterval -> Parser TimeInterval
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser TimeInterval
parseTimeInterval
  Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay ( Integer
0 Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% Integer
1) TimeInterval
tint

parseCrawlDelay :: Parser Directive
parseCrawlDelay :: Parser Directive
parseCrawlDelay = do
  Rational
delay <- Path -> Parser Path
stringCI Path
"Crawl-Delay:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Rational -> Parser Rational
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Rational
safeParseRational
  Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay Rational
delay TimeInterval
allDay

-- ... yeah.
strip :: ByteString -> ByteString
strip :: Path -> Path
strip = Path -> Path
BS.reverse (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Path -> Path
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ') (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path
BS.reverse (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Path -> Path
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ')

-- | parseRobots is the main entry point for parsing a robots.txt file.
parseRobots :: ByteString -> Either String Robot
parseRobots :: Path -> Either [Char] Robot
parseRobots Path
input = case Either [Char] Robot
parsed of
  -- special case no parsable lines and rubbish
  Right ([], out :: [Path]
out@(Path
_:[Path]
_)) ->
    [Char] -> Either [Char] Robot
forall a b. a -> Either a b
Left ([Char]
"no parsable lines: " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Path] -> [Char]
forall a. Show a => a -> [Char]
show [Path]
out)
  Either [Char] Robot
_ -> Either [Char] Robot
parsed

  where parsed :: Either [Char] Robot
parsed = Parser Robot -> Path -> Either [Char] Robot
forall a. Parser a -> Path -> Either [Char] a
parseOnly Parser Robot
robotP
              (Path -> Either [Char] Robot)
-> (Path -> Path) -> Path -> Either [Char] Robot
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Path] -> Path
BS.unlines
  -- Filthy hack to account for the fact we don't grab sitemaps
  -- properly. people seem to just whack them anywhere, which makes it
  -- hard to write a nice parser for them.
              ([Path] -> Path) -> (Path -> [Path]) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Path -> Bool) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path -> Bool
BS.isPrefixOf Path
"SITEMAP:" (Path -> Bool) -> (Path -> Path) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> Path -> Path
BS.map Char -> Char
toUpper)
              ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Path -> Bool) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path -> Bool
BS.isPrefixOf Path
"HOST:"    (Path -> Bool) -> (Path -> Path) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> Path -> Path
BS.map Char -> Char
toUpper)
              ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Path
x -> Path -> Char
BS.head Path
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#' )
              ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Path -> Bool) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Bool
BS.null)
              ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Path) -> [Path] -> [Path]
forall a b. (a -> b) -> [a] -> [b]
map Path -> Path
strip
              ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> [Path]
BS.lines
              -- worst way of handling windows newlines ever
              (Path -> [Path]) -> (Path -> Path) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Path -> Path
BS.filter (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\r')
              (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path
dropUTF8BOM
              (Path -> Either [Char] Robot) -> Path -> Either [Char] Robot
forall a b. (a -> b) -> a -> b
$ Path
input

robotP :: Parser Robot
robotP :: Parser Robot
robotP = do
  ([([UserAgent], [Directive])]
dirs, [Path]
unparsable) <- [Either ([UserAgent], [Directive]) Path] -> Robot
forall a b. [Either a b] -> ([a], [b])
partitionEithers ([Either ([UserAgent], [Directive]) Path] -> Robot)
-> Parser Path [Either ([UserAgent], [Directive]) Path]
-> Parser Robot
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path (Either ([UserAgent], [Directive]) Path)
-> Parser Path [Either ([UserAgent], [Directive]) Path]
forall a. Parser Path a -> Parser Path [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Parser Path ([UserAgent], [Directive])
-> Parser Path
-> Parser Path (Either ([UserAgent], [Directive]) Path)
forall (f :: * -> *) a b.
Alternative f =>
f a -> f b -> f (Either a b)
eitherP Parser Path ([UserAgent], [Directive])
agentDirectiveP Parser Path
unparsableP) Parser Robot -> [Char] -> Parser Robot
forall i a. Parser i a -> [Char] -> Parser i a
<?> [Char]
"robot"
  Robot -> Parser Robot
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return ([([UserAgent], [Directive])]
dirs, (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Path -> Path -> Bool
forall a. Eq a => a -> a -> Bool
/= Path
"") [Path]
unparsable)

unparsableP :: Parser ByteString
unparsableP :: Parser Path
unparsableP = (Char -> Bool) -> Parser Path
takeTill Char -> Bool
AT.isEndOfLine Parser Path -> Parser () -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
endOfLine -- char '\n'

agentDirectiveP :: Parser ([UserAgent],[Directive])
agentDirectiveP :: Parser Path ([UserAgent], [Directive])
agentDirectiveP = (,) ([UserAgent] -> [Directive] -> ([UserAgent], [Directive]))
-> Parser Path [UserAgent]
-> Parser Path ([Directive] -> ([UserAgent], [Directive]))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path UserAgent -> Parser Path [UserAgent]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many1 Parser Path UserAgent
agentP Parser Path ([Directive] -> ([UserAgent], [Directive]))
-> Parser Path [Directive]
-> Parser Path ([UserAgent], [Directive])
forall a b. Parser Path (a -> b) -> Parser Path a -> Parser Path b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser Directive -> Parser Path [Directive]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many1 Parser Directive
directiveP Parser Path ([UserAgent], [Directive])
-> [Char] -> Parser Path ([UserAgent], [Directive])
forall i a. Parser i a -> [Char] -> Parser i a
<?> [Char]
"agentDirective"


skipSpace :: Parser ()
skipSpace :: Parser ()
skipSpace = (Char -> Bool) -> Parser ()
skipWhile (\Char
x -> Char
xChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ' Bool -> Bool -> Bool
|| Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\t')

directiveP :: Parser Directive
directiveP :: Parser Directive
directiveP = do
  Parser ()
skipSpace
  [Parser Directive] -> Parser Directive
forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice [ Path -> Parser Path
stringCI Path
"Disallow:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
                        ((Path -> Directive
Disallow (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path
tokenP) Parser Directive -> Parser Directive -> Parser Directive
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
                      -- This requires some explanation.
                      -- The RFC suggests that an empty Disallow line means
                      -- anything is allowed. Being semantically equivalent to
                      -- 'Allow: "/"', I have chosen to change it here
                      -- rather than carry the bogus distinction around.
                             (Parser ()
endOfLine Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Path -> Directive
Allow    Path
"/")))
                    , Path -> Parser Path
stringCI Path
"Allow:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
                        ((Path -> Directive
Allow (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path
tokenP) Parser Directive -> Parser Directive -> Parser Directive
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
                      -- If an empty disallow means 'disallow nothing',
                      -- an empty allow means 'allow nothing'. Right?
                      -- Not sure, actually, but only the americanexpress.com
                      -- has such a case, which in one hand I am tempted
                      -- to consider an error... but for now:
                             (Parser ()
endOfLine Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Path -> Directive
Disallow Path
"/")))
                    , Parser Directive
parseCrawlDelay
                    , Parser Directive
parseRequestRate
                    , Parser Directive
parseVisitTime
                    , Path -> Directive
NoArchive   (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Noarchive:"  Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP)
                    , Path -> Directive
NoSnippet   (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Nosnippet:"  Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP)
                    , Path -> Directive
NoTranslate (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Notranslate:"Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP)
                    , Path -> Directive
NoIndex     (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Noindex:"    Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP)
                    ] Parser Directive -> Parser () -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
commentsP Parser Directive -> [Char] -> Parser Directive
forall i a. Parser i a -> [Char] -> Parser i a
<?> [Char]
"directive"

agentP :: Parser UserAgent
agentP :: Parser Path UserAgent
agentP = do
  Parser Path -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path -> Parser ()) -> Parser Path -> Parser ()
forall a b. (a -> b) -> a -> b
$ Path -> Parser Path
stringCI Path
"user-agent:"
  Parser ()
skipSpace
  ((Path -> Parser Path
string Path
"*" Parser Path -> Parser Path UserAgent -> Parser Path UserAgent
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> UserAgent -> Parser Path UserAgent
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return UserAgent
Wildcard) Parser Path UserAgent
-> Parser Path UserAgent -> Parser Path UserAgent
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
   (Path -> UserAgent
Literal  (Path -> UserAgent) -> Parser Path -> Parser Path UserAgent
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path
tokenWithSpacesP)) Parser Path UserAgent -> Parser () -> Parser Path UserAgent
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
skipSpace Parser Path UserAgent -> Parser () -> Parser Path UserAgent
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
endOfLine Parser Path UserAgent -> [Char] -> Parser Path UserAgent
forall i a. Parser i a -> [Char] -> Parser i a
<?> [Char]
"agent"


commentsP :: Parser ()
commentsP :: Parser ()
commentsP = Parser ()
skipSpace Parser () -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
            (   (Path -> Parser Path
string Path
"#" Parser Path -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser Path
takeTill Char -> Bool
AT.isEndOfLine Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
endOfLine)
            Parser () -> Parser () -> Parser ()
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
endOfLine
            Parser () -> Parser () -> Parser ()
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return ())


tokenP :: Parser ByteString
tokenP :: Parser Path
tokenP = Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser Path
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) Parser Path -> Parser () -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
skipSpace
tokenWithSpacesP :: Parser ByteString
tokenWithSpacesP :: Parser Path
tokenWithSpacesP = Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser Path
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' Bool -> Bool -> Bool
|| Char -> Bool
AT.isEndOfLine Char
c))
                             Parser Path -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* (Char -> Bool) -> Parser Path
takeTill Char -> Bool
AT.isEndOfLine

-- I lack the art to make this prettier.
-- Currently does not take into account the CrawlDelay / Request Rate directives
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess :: Path -> Robot -> Path -> Bool
canAccess Path
_ Robot
_ Path
"/robots.txt" = Bool
True -- special-cased
canAccess Path
agent ([([UserAgent], [Directive])]
robot,[Path]
_) Path
path = case [([UserAgent], [Directive])]
stanzas of
  [] -> Bool
True
  (([UserAgent]
_,[Directive]
directives):[([UserAgent], [Directive])]
_) -> [Directive] -> Bool
matchingDirective [Directive]
directives
  where stanzas :: [([UserAgent], [Directive])]
stanzas = [Maybe ([UserAgent], [Directive])] -> [([UserAgent], [Directive])]
forall a. [Maybe a] -> [a]
catMaybes [(([UserAgent], [Directive]) -> Bool)
-> [([UserAgent], [Directive])] -> Maybe ([UserAgent], [Directive])
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((UserAgent -> Bool) -> [UserAgent] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (UserAgent -> Path -> Bool
`isLiteralSubstring` Path
agent)  ([UserAgent] -> Bool)
-> (([UserAgent], [Directive]) -> [UserAgent])
-> ([UserAgent], [Directive])
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([UserAgent], [Directive]) -> [UserAgent]
forall a b. (a, b) -> a
fst) [([UserAgent], [Directive])]
robot,
                             (([UserAgent], [Directive]) -> Bool)
-> [([UserAgent], [Directive])] -> Maybe ([UserAgent], [Directive])
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((UserAgent
Wildcard UserAgent -> [UserAgent] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) ([UserAgent] -> Bool)
-> (([UserAgent], [Directive]) -> [UserAgent])
-> ([UserAgent], [Directive])
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([UserAgent], [Directive]) -> [UserAgent]
forall a b. (a, b) -> a
fst) [([UserAgent], [Directive])]
robot]


        isLiteralSubstring :: UserAgent -> Path -> Bool
isLiteralSubstring (Literal Path
a) Path
us = Path
a Path -> Path -> Bool
`BS.isInfixOf` Path
us
        isLiteralSubstring UserAgent
_ Path
_ = Bool
False
        matchingDirective :: [Directive] -> Bool
matchingDirective [] = Bool
True
        matchingDirective (Directive
x:[Directive]
xs) = case Directive
x of
          Allow Path
robot_path ->
            Path
robot_path Path -> Path -> Bool
`BS.isPrefixOf` Path
path Bool -> Bool -> Bool
|| [Directive] -> Bool
matchingDirective [Directive]
xs
          Disallow Path
robot_path ->
            Bool -> Bool
not (Path
robot_path Path -> Path -> Bool
`BS.isPrefixOf` Path
path) Bool -> Bool -> Bool
&& [Directive] -> Bool
matchingDirective [Directive]
xs

          Directive
_ -> [Directive] -> Bool
matchingDirective [Directive]
xs