{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE Arrows #-}
-- | Extraction and consolidation of JavaScript code in an HTML page.
module Text.Html.Consolidate (-- * Simple API
                              consolidate
                             ,extract
                             ,extractPretty
                              -- * Advanced arrow-based API
                             ,TArr                              
                             ,consolidateArr
                             ,extractJSArr
                             ,initialConsState
                             ,insertJSArr
                             ,parseHTML
                             ,renderHTML
                             ,ConsState
                             ) where

import Text.XML.HXT.Core hiding (swap)
import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.TagSoup
import Language.ECMAScript3.Syntax
import Language.ECMAScript3.Syntax.Annotations
import Language.ECMAScript3.PrettyPrint
import Language.ECMAScript3.Parser
import Data.List (isInfixOf)
import Data.Default.Class
import Network.HTTP
import Network.Browser (Cookie)
import Network.Browser.Simple
import Network.URI
import Network.HTTP.Encoding
import Data.ByteString.Lazy (ByteString)
import System.Random
import Data.Char
import Data.Maybe (isJust, fromJust, maybeToList)
import Control.Monad hiding (when)

-- | Consolidation state
data ConsState = ConsState Bool  -- Ignore errors?
                           (Maybe URI) -- Base URI of the web page,
                                       -- for resolving relative URI's
                           [Cookie] -- Cookies to include with all
                                    -- HTTP requests
                           [Statement ()]

-- | A constructor function for making an initial consolidation state
-- (needed for running the arrows in the advanced API). Example usage:
--
-- > runXIOState (initialState $ initialConsState ignoreErrors muri []) $ someArrow
initialConsState :: Bool -- ^ Whether to ignore errors (parse errors,
                         -- resource not found etc.)
                 -> Maybe URI -- ^ base URI
                 -> [Cookie] -- ^ Cookies
                 -> ConsState
initialConsState grace base cookies = ConsState grace base cookies []

-- | Our XML transformation arrow type
type TArr a b = IOStateArrow ConsState a b

-- | A wrapper around the hxt parser with commonly used arguments
parseHTML :: String -> Maybe URI -> TArr a XmlTree
parseHTML s mbase_uri = 
  let config = map (withDefaultBaseURI . show) (maybeToList mbase_uri)
             ++[withParseHTML yes
               ,withTagSoup
               ,withValidate no
               ,withSubstDTDEntities no
               ,withSubstHTMLEntities yes
               ,withCanonicalize no
               ,withOutputHTML]
  in readString config s

-- | A wrapper around hxt to pretty print html out of the arrow
renderHTML :: ConsState -> TArr XmlTree XmlTree -> IO String
renderHTML ns a = 
  let state  = initialState ns
  in liftM head $ runXIOState state (single a >>> writeDocumentToString [withOutputHTML, withOutputEncoding utf8])

-- | Takes an HTML page source as a string and an optional base URI
-- (for resolving relative URI's) and produces an HTML page with all
-- the scripts consolidated in one inline script element.
consolidate :: String -> Maybe URI -> IO String
consolidate s mbase_uri = 
  renderHTML (initialConsState True mbase_uri []) $ 
  parseHTML s mbase_uri >>> consolidateArr

-- | The consolidation function with an arrow interface.
--
-- > consolidateArr = extractJSArr >>> insertJSArr
consolidateArr :: TArr XmlTree XmlTree
consolidateArr = extractJSArr >>> insertJSArr

-- | Extacts all JavaScript code in the given
-- HTML page source as a single program. Takes an optional base URI
-- for resolving relative URI's.
extract :: String -> Maybe URI -> IO (JavaScript ())
extract s mbase_uri =
  let state = initialState $ initialConsState True mbase_uri [] in
  do [(_, js)] <- runXIOState state $ single $ 
                  parseHTML s mbase_uri >>> extractJSArr
     return js

-- | Extacts and pretty-prints all JavaScript code in the given
-- HTML page source as a single program. Takes an optional base URI
-- for resolving relative URI's.
-- 
-- > extractPretty s muri = liftM (show . prettyPrint) $ extract s muri
extractPretty :: String -> Maybe URI -> IO String
extractPretty s muri = liftM (show . prettyPrint) $ extract s muri

-- | Extracts all JavaScript from HTML. There shouldn't be any
-- JavaScript in the resulting XmlTree
extractJSArr :: TArr XmlTree (XmlTree, JavaScript ())
extractJSArr =
  (choiceA [isAJavaScript :-> ifA (hasAttr "src") extractExternalScript
                                                   extractInlineScript
            ,(isElem >>> hasOneOfNames src_tags >>> hasOneOfAttrs src_attrs) :-> extractURLProp
            ,(isElem >>> hasOneOfAttrs event_handlers) :-> extractEventHandler
            ,this :-> this
            ]
   `processTopDownUntilAndWhenMatches` isFrame)
  >>>returnScript
  where returnScript = (returnA &&& getUserState)
                    >>> second (arr (\s -> let ConsState _ _ _ stmts = s 
                                           in  Script () stmts))
        isFrame = isElem >>> (hasName "frame" <+> hasName "iframe")
        hasOneOfNames tagNames = (getName >>> isA (`elem` tagNames)) `guards` this
        hasOneOfAttrs attrNames = (getAttrl >>> hasOneOfNames attrNames) `guards` this
        -- Elements and properties that might contain a 'javascript:' url 
        -- IMG.SRC, A.HREF, FORM.ACTION, FRAME.SRC, IFRAME.SRC, LINK.HREF
        src_tags = ["img", "a", "form", "frame", "iframe", "link"]
        src_attrs = ["src", "href", "action"]
        event_handlers = ["onabort", "onblur", "onclick", "oncompositionstart"
                         ,"oncompositionupdate", "oncompositionend", "ondblclick"
                         ,"onerror", "onfocus", "onfocusin", "onfocusout"
                         ,"onkeydown", "onkeypress", "onkeyup", "onload"
                         ,"onmousedown", "onmouseenter", "onmouseleave"
                         ,"onmousemove", "onmouseout", "onmouseover"
                         ,"onmouseup", "onreset", "onresize","onscroll"
                         ,"onselect", "onsubmit", "ontextinput", "onunload"
                         , "onwheel"]

-- | Like Control.Arrow.ArrowTree, but instead has a separate arrow
-- that serves as a predicate signalling that this subtree shouldn't
-- be transformed any further; 'processTopDownUntilMatches transformer
-- predicate'. Almost dual of 'processBottomUpWhenNot', but transforms
-- the node that matches the predicate as well (but doesn't look
-- inside that node).
processTopDownUntilAndWhenMatches :: (ArrowTree a, Tree t)
                                  => a (t b) (t b) -- ^ the transformer arrow
                                  -> a (t b) (t b) -- ^ the predicate arrow
                                  -> a (t b) (t b)
processTopDownUntilAndWhenMatches t p =
  t >>> (processChildren (processTopDownUntilAndWhenMatches t p) `whenNot` p)
  -- ifA p (t >>> processChildren (processTopDownUntilAndWhenMatches t p)) returnA
                       
-- | Inserts JavaScript at the end of the HTML body.
insertJSArr :: TArr (XmlTree, JavaScript a) XmlTree
insertJSArr = (swap ^<< second scriptElement) >>>
              arr2A (\scr ->  processTopDown $ changeChildren (++ [scr]) `when`
                              hasName "body")
              
-- extractors
-- | Extracts the contents of inline scripts
extractInlineScript :: TArr XmlTree XmlTree
extractInlineScript = 
  firstChild >>> 
  getText >>> 
  parseJS >>> 
  arr removeAnnotations >>> 
  appendScript >>>
  cmt "Removed Inline Script"
  
-- | Extracts the contents of externally references scripts
extractExternalScript :: TArr XmlTree XmlTree
extractExternalScript = 
  (getAttrValue "src" >>>
   (downloadArr >>>
    parseJS >>>
    arr removeAnnotations >>>  
    appendScript) 
   &&&
   (arr ("Removed External Script: " ++) >>>
    mkCmt)) >>>
  arr snd
  
-- |Downloads the content, considering the input as a URL; performs
-- decoding automatically.
downloadArr :: TArr String String
downloadArr = 
  (returnA &&& arr parseURIReference) >>> 
  arrIO (\(url, muri) -> 
          failIfNothing ("download: error parsing a URI: " ++ url) muri) >>>
  consolidateURI >>>
  (getUserState &&& returnA) >>>
  arrIO (\(ConsState _ _ cookies _, uri) -> liftM fst (download uri cookies))

  where failIfNothing :: String -> Maybe a -> IO a
        failIfNothing message Nothing  = fail message
        failIfNothing _       (Just x) = return x
        -- RFC 3986, section 4.1: if the URI-reference's prefix does
        -- not match the syntax of a scheme followed by its colon
        -- separator, then the URI reference is a relative reference.
        isURIRelative :: URI -> Bool
        isURIRelative = null . uriScheme
        consolidateURI :: TArr URI URI
        consolidateURI = 
          (getUserState &&& returnA) >>>
          arrIO (\(ConsState _ mbaseURI _ _, uri) ->
          return $ if isURIRelative uri && isJust mbaseURI
                   then uri `relativeTo` fromJust mbaseURI
                   else uri)
        
-- Removes the URL containing properties from elements and adds a
-- JavaScript assignment instead.
extractURLProp :: TArr XmlTree XmlTree
extractURLProp = 
  -- Elements and properties that might contain a 'javascript:' url 
  -- IMG.SRC, A.HREF, FORM.ACTION, FRAME.SRC, IFRAME.SRC, LINK.HREF
  -- isElem >>>
  -- selectTags ["img", "a", "form", "frame", "iframe", "link"] >>>
  addIdIfNotPresent >>>
  (((selectAttrValues ["src", "href", "action"] 
     &&& selectId) >>>
    arr (\((url, attrName), id) -> 
     Script () [ExprStmt () $ AssignExpr () OpAssign 
                (LDot () (CallExpr ()
                          (DotRef () (VarRef () (Id () "document")) (Id () "getElementById"))
                          [StringLit () id]) attrName) (StringLit () url)]) >>>
   appendScript) &&& 
  removeAttributes ["src", "href", "action"]) >>>
  arr snd
  
-- Removes the event handlers from HTML tags and converts them to
-- JavaScript assignments
extractEventHandler :: TArr XmlTree XmlTree
extractEventHandler = 
  -- Names of HTML tag attributes that are event handler declarations:
  let attrNames = ["onabort", "onblur", "onclick", "oncompositionstart",
                   "oncompositionupdate", "oncompositionend", "ondblclick", 
                   "onerror", "onfocus", "onfocusin", "onfocusout", "onkeydown",
                   "onkeypress", "onkeyup", "onload", "onmousedown",
                   "onmouseenter", "onmouseleave", "onmousemove", "onmouseout", 
                   "onmouseover", "onmouseup", "onreset", "onresize","onscroll",
                   "onselect", "onsubmit", "ontextinput", "onunload", "onwheel"]
  -- in isElem >>>
  --    hasAnyAttrs attrNames >>>
  in   addIdIfNotPresent >>>
     (((selectAttrValues attrNames &&& selectId) >>>
      arr (\((handler, attrName), id) ->
        Script () [ExprStmt () $ AssignExpr () OpAssign 
                   (LDot () (CallExpr ()
                             (DotRef () (VarRef () (Id () "document")) (Id () "getElementById"))
                            [StringLit () id]) attrName) (StringLit () handler)])
      >>> appendScript) &&&
     removeAttributes attrNames) >>>
     arr snd

parseJS :: TArr String (JavaScript SourcePos)
parseJS = arr (parse program "") >>> eitherToFailure

-- Arrow tools
-- | Failure reporting arrow constructor
arrowFail :: ArrowIO ar => String -> ar b c
arrowFail = arrIO . fail

isStrict :: TArr a Bool
isStrict = getUserState >>> arr (\s -> let ConsState strict _ _ _ = s in strict)

-- | If in strict mode, fails with the message given; otherwise,
-- behaves like an identity arrow
failIfStrict :: String -> TArr a a
failIfStrict msg = 
  proc a -> do is <- isStrict -< ()
               if is then arrowFail msg -< () else returnA -< a

eitherToFailure :: (Show err, Default a) => TArr (Either err a) a
eitherToFailure = (isStrict &&& returnA) >>> arrIO f
  where f (False, Left err) = return def
        f (True,  Left err) = fail $ show err
        f (_   ,  Right x)  = return x

maybeToFailure :: (Default a) => Maybe String -> TArr (Maybe a) a
maybeToFailure message = (isStrict &&& returnA) >>> arrIO f
  where f (False, Nothing) = return def
        f (True , Nothing) = 
          case message of
            Nothing  -> fail "Unexpected maybe in strict mode"
            Just msg -> fail msg
        f (_    , Just x)  = return x

appendStatements :: TArr [Statement ()] ()
appendStatements = (getUserState &&& returnA) >>>
                   arr (\(state, addScript) -> 
                         let ConsState grace baseURI cookies script = state in
                         ConsState grace baseURI cookies (script++addScript)) >>>
                   setUserState >>> arr (const ())

appendScript :: TArr (JavaScript ()) ()
appendScript = arr (\s -> let Script _ stmts = s in stmts) >>> appendStatements
                   
-- constructors
-- | Constructs a new JavaScript element
scriptElement :: ArrowXml ar => ar (JavaScript a) XmlTree
scriptElement = mkElement (mkName "script") (sattr "type" "text/javascript") $< arr (txt . show . prettyPrint)

-- Selectors
-- | A selector for SCRIPT tags with JavaScript or empty type
isAJavaScript :: ArrowXml ar => ar XmlTree XmlTree
isAJavaScript = 
  isElem >>> hasName "script" >>>
  (((hasAttr "language" >>> hasAttrValue "language" (isInfixOf "javascript")) <+>
    (hasAttr "type" >>> hasAttrValue "type" (isInfixOf "javascript"))) `orElse`
   returnA)
        
-- | Selects the first child of a node
firstChild :: (ArrowTree a, Tree t) => a (t b) (t b)
firstChild = single getChildren
  
-- | Selects the last child fo a node
lastChild :: (ArrowTree a, Tree t) => a (t b) (t b)
lastChild = getChildren >>. (take 1 . reverse)

-- | Selects the <html> tag
html :: ArrowXml a => a XmlTree XmlTree
html = deep $ hasName "html"

-- | Selects <body> tag
body :: ArrowXml a => a XmlTree XmlTree
body = html /> hasName "body"
  --deep $ hasName "html" /> hasName "body"

selectTags :: ArrowXml a => [String] -> a XmlTree XmlTree
selectTags = foldl (\arr tag -> arr <+> hasName tag) zeroArrow

selectAttrValues :: ArrowXml a => [String] -> a XmlTree (String, String)
selectAttrValues = foldl f zeroArrow
  where f :: ArrowXml a => 
             a XmlTree (String, String) -> String -> a XmlTree (String, String)
        f a attr = a <+> (hasAttr attr >>> 
                          (getAttrValue attr &&& arr (const attr)))
                   
hasAnyAttrs :: ArrowXml a => [String] -> a XmlTree XmlTree
hasAnyAttrs = foldl f zeroArrow
  where f :: ArrowXml a => a XmlTree XmlTree -> String -> a XmlTree XmlTree
        f a attr = a <+> hasAttr attr
                   
removeAttributes :: ArrowXml a => [String] -> a XmlTree XmlTree
removeAttributes = foldl f zeroArrow
  where f :: ArrowXml a => a XmlTree XmlTree -> String -> a XmlTree XmlTree
        f a attr = a >>> removeAttr attr
  

addIdIfNotPresent :: TArr XmlTree XmlTree
addIdIfNotPresent = proc node -> do
  idval <- getAttrValue "id" -< node
  if null idval
    then replaceChildren repFun -< node
    else returnA -< node
    where repFun = replaceChildren (genIdA >>> mkText)
                   `when` (isAttr >>> hasName "id")

-- | Selects the id of an element or adds a new one (and returns) if
-- it's not present
selectId :: TArr XmlTree String
selectId = isElem >>> getAttrValue "id"
                        
genIdA :: ArrowIO ar => ar a String
genIdA = arrIO $ const genId

genId :: IO String
genId = do firstLetter <- genLetter
           return [firstLetter]
           length <- getStdRandom $ randomR (minIdLength-1, maxIdLength-1)
           restId <- mapM (const genLetter) [1..length] 
           return $ firstLetter:restId
  where minIdLength :: Int
        minIdLength = 16
        maxIdLength = 32
        
genLetter :: IO Char
genLetter = do letter <- getStdRandom $ randomR (capitalACode, capitalZCode)
               lettercase <- getStdRandom $ randomR (0,1)
               return $ chr $ letter + lettercase * (lowercaseACode-capitalACode)
  where capitalACode = 65
        capitalZCode = 90
        lowercaseACode = 97
        
swap (a,b) = (b,a)