module Text.XML.Scraping where
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.XML as X
import Text.XML.Cursor
import Data.String
import qualified Text.HTML.DOM as H
import qualified Data.Map as M
import Data.List
import Data.Maybe
import System.Environment (getArgs)
import Text.Blaze.Html as Bl
import Text.Blaze.Html.Renderer.Text
import Data.Text.Lazy (fromStrict,toStrict, unpack)
import Text.XML.Selector
import Text.XML.Selector.Types
innerHtml :: [Cursor] -> TL.Text
innerHtml cs = renderNodes $ map node $ concatMap child cs
innerText :: [Cursor] -> T.Text
innerText cs = T.concat $ map (innerTextN . node) cs
renderNodes :: [Node] -> TL.Text
renderNodes ns = TL.concat $ map (renderHtml . Bl.toHtml) ns
toHtml :: [Cursor] -> TL.Text
toHtml cs = renderNodes $ map node cs
innerTextN :: Node -> T.Text
innerTextN (NodeElement (Element _ _ cs)) = T.concat $ map innerTextN cs
innerTextN (NodeContent txt) = txt
innerTextN _ = ""
ename :: Node -> Maybe T.Text
ename (NodeElement (Element n _ _)) = Just $ nameLocalName n
ename _ = Nothing
eid :: Node -> Maybe T.Text
eid (NodeElement (Element _ as _)) = M.lookup "id" as
eid _ = Nothing
eclass :: Node -> [T.Text]
eclass (NodeElement (Element _ as _)) = maybe [] T.words $ M.lookup "class" as
eclass _ = []
getMeta :: T.Text -> Cursor -> [T.Text]
getMeta n cursor = concat $ cursor $// element "meta" &| attributeIs "name" n &.// attribute "content"
remove :: (Node->Bool)->Node->Node
remove f (NodeElement (Element a b cs)) = NodeElement (Element a b (map (remove f) (filter (not . f) cs)))
remove _ n = n
removeDepth :: (Node->Bool)->Int->Node->Node
removeDepth _ (1) n = n
removeDepth f d (NodeElement (Element a b cs)) = NodeElement (Element a b (map (removeDepth f (d1)) (filter (not . f) cs)))
removeDepth _ _ n = n
removeTags :: [T.Text] -> [Node] -> [Node]
removeTags ts ns = map (remove (\n -> ename n `elem` map Just ts)) ns
removeQuery :: String -> [Node] -> [Node]
removeQuery q ns = map (remove (queryMatchNode q)) ns
removeQueries :: [String] -> [Node] -> [Node]
removeQueries qs ns = map (remove f) ns
where
f :: Node -> Bool
f n = any (flip queryMatchNode n) qs
nodeHaving :: (Node->Bool)->Node->Bool
nodeHaving f n@(NodeElement (Element _ _ cs)) = f n || any (nodeHaving f) cs
nodeHaving _ _ = False
rmElem :: String -> String -> [String] -> [Node] -> [Node]
rmElem tag id kl ns = map (remove f) ns
where
f :: Node -> Bool
f (NodeElement e) = selectorMatch (JQSelector Descendant (g tag) (g id) kl []) e
f _ = False
g :: String -> Maybe String
g "" = Nothing
g s = Just s