{-# LANGUAGE FlexibleContexts      #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE TypeSynonymInstances  #-}

module HaskellWorks.Data.Xml.Token.Tokenize
    ( IsChar(..)
    , XmlToken(..)
    , ParseXml(..)
    ) where

import Control.Applicative
import Data.Bits
import Data.Char
import Data.Word
import Data.Word8
import HaskellWorks.Data.Char.IsChar
import HaskellWorks.Data.Parser          as P
import HaskellWorks.Data.Xml.Token.Types

import qualified Data.Attoparsec.ByteString.Char8 as BC
import qualified Data.Attoparsec.Combinator       as AC
import qualified Data.Attoparsec.Types            as T
import qualified Data.ByteString                  as BS

hexDigitNumeric :: P.Parser t Word8 => T.Parser t Int
hexDigitNumeric = do
  c <- satisfyChar (\c -> '0' <= c && c <= '9')
  return $ ord c - ord '0'

hexDigitAlphaLower :: P.Parser t Word8 => T.Parser t Int
hexDigitAlphaLower = do
  c <- satisfyChar (\c -> 'a' <= c && c <= 'z')
  return $ ord c - ord 'a' + 10

hexDigitAlphaUpper :: P.Parser t Word8 => T.Parser t Int
hexDigitAlphaUpper = do
  c <- satisfyChar (\c -> 'A' <= c && c <= 'Z')
  return $ ord c - ord 'A' + 10

hexDigit :: P.Parser t Word8 => T.Parser t Int
hexDigit = hexDigitNumeric <|> hexDigitAlphaLower <|> hexDigitAlphaUpper

class ParseXml t s d where
  parseXmlTokenString     :: T.Parser t (XmlToken s d)
  parseXmlToken           :: T.Parser t (XmlToken s d)
  parseXmlTokenBraceL     :: T.Parser t (XmlToken s d)
  parseXmlTokenBraceR     :: T.Parser t (XmlToken s d)
  parseXmlTokenBracketL   :: T.Parser t (XmlToken s d)
  parseXmlTokenBracketR   :: T.Parser t (XmlToken s d)
  parseXmlTokenComma      :: T.Parser t (XmlToken s d)
  parseXmlTokenColon      :: T.Parser t (XmlToken s d)
  parseXmlTokenWhitespace :: T.Parser t (XmlToken s d)
  parseXmlTokenNull       :: T.Parser t (XmlToken s d)
  parseXmlTokenBoolean    :: T.Parser t (XmlToken s d)
  parseXmlTokenDouble     :: T.Parser t (XmlToken s d)

  parseXmlToken =
    parseXmlTokenString     <|>
    parseXmlTokenBraceL     <|>
    parseXmlTokenBraceR     <|>
    parseXmlTokenBracketL   <|>
    parseXmlTokenBracketR   <|>
    parseXmlTokenComma      <|>
    parseXmlTokenColon      <|>
    parseXmlTokenWhitespace <|>
    parseXmlTokenNull       <|>
    parseXmlTokenBoolean    <|>
    parseXmlTokenDouble

instance ParseXml BS.ByteString String Double where
  parseXmlTokenBraceL   = string "{"      >> return XmlTokenBraceL
  parseXmlTokenBraceR   = string "}"      >> return XmlTokenBraceR
  parseXmlTokenBracketL = string "["      >> return XmlTokenBracketL
  parseXmlTokenBracketR = string "]"      >> return XmlTokenBracketR
  parseXmlTokenComma    = string ","      >> return XmlTokenComma
  parseXmlTokenColon    = string ":"      >> return XmlTokenColon
  parseXmlTokenNull     = string "null"   >> return XmlTokenNull
  parseXmlTokenDouble   = XmlTokenNumber <$> rational

  parseXmlTokenString = do
    _ <- string "\""
    value <- many (verbatimChar <|> escapedChar <|> escapedCode)
    _ <- string "\""
    return $ XmlTokenString value
    where
      verbatimChar  = satisfyChar (BC.notInClass "\"\\") <?> "invalid string character"
      escapedChar   = do
        _ <- string "\\"
        (   char '"'  >> return '"'  ) <|>
          ( char 'b'  >> return '\b' ) <|>
          ( char 'n'  >> return '\n' ) <|>
          ( char 'f'  >> return '\f' ) <|>
          ( char 'r'  >> return '\r' ) <|>
          ( char 't'  >> return '\t' ) <|>
          ( char '\\' >> return '\\' ) <|>
          ( char '\'' >> return '\'' ) <|>
          ( char '/'  >> return '/'  )
      escapedCode :: T.Parser BS.ByteString Char
      escapedCode   = do
        _ <- string "\\u"
        a <- hexDigit
        b <- hexDigit
        c <- hexDigit
        d <- hexDigit
        return . chr $ a `shift` 24 .|. b `shift` 16 .|. c `shift` 8 .|. d

  parseXmlTokenWhitespace = do
    _ <- AC.many1' $ BC.choice [string " ", string "\t", string "\n", string "\r"]
    return XmlTokenWhitespace

  parseXmlTokenBoolean = true <|> false
    where
      true  = string "true"   >> return (XmlTokenBoolean True)
      false = string "false"  >> return (XmlTokenBoolean False)

instance ParseXml BS.ByteString BS.ByteString Double where
  parseXmlTokenBraceL   = string "{"      >> return XmlTokenBraceL
  parseXmlTokenBraceR   = string "}"      >> return XmlTokenBraceR
  parseXmlTokenBracketL = string "["      >> return XmlTokenBracketL
  parseXmlTokenBracketR = string "]"      >> return XmlTokenBracketR
  parseXmlTokenComma    = string ","      >> return XmlTokenComma
  parseXmlTokenColon    = string ":"      >> return XmlTokenColon
  parseXmlTokenNull     = string "null"   >> return XmlTokenNull
  parseXmlTokenDouble   = XmlTokenNumber <$> rational

  parseXmlTokenString = do
    _ <- string "\""
    value <- many (verbatimChar <|> escapedChar <|> escapedCode)
    _ <- string "\""
    return . XmlTokenString $ BS.pack value
    where
      word :: Word8 -> T.Parser BS.ByteString Word8
      word w = satisfy (== w)
      verbatimChar :: T.Parser BS.ByteString Word8
      verbatimChar  = satisfy (\w -> w /= _quotedbl && w /= _backslash) -- <?> "invalid string character"
      escapedChar :: T.Parser BS.ByteString Word8
      escapedChar   = do
        _ <- string "\\"
        (   word _quotedbl    >> return _quotedbl       ) <|>
          ( word _b           >> return 0x08            ) <|>
          ( word _n           >> return _lf             ) <|>
          ( word _f           >> return _np             ) <|>
          ( word _r           >> return _cr             ) <|>
          ( word _t           >> return _tab            ) <|>
          ( word _backslash   >> return _backslash      ) <|>
          ( word _quotesingle >> return _quotesingle    ) <|>
          ( word _slash       >> return _slash          )
      escapedCode :: T.Parser BS.ByteString Word8
      escapedCode   = do
        _ <- string "\\u"
        a <- hexDigit
        b <- hexDigit
        c <- hexDigit
        d <- hexDigit
        return . fromIntegral $ a `shift` 24 .|. b `shift` 16 .|. c `shift` 8 .|. d

  parseXmlTokenWhitespace = do
    _ <- AC.many1' $ BC.choice [string " ", string "\t", string "\n", string "\r"]
    return XmlTokenWhitespace

  parseXmlTokenBoolean = true <|> false
    where
      true  = string "true"   >> return (XmlTokenBoolean True)
      false = string "false"  >> return (XmlTokenBoolean False)