-- | This module provides a simple parser for UTF8. It converts a
-- string of bytes into a list of unicode tokens. Invalid input bytes
-- are converted to special 'Invalid' tokens, leaving it up to the
-- consuming application to decide what to do with them.

module Distribution.Superdoc.UTF8 (
  Token (..),
  parse_utf8,
  ) where

import Data.Bits
import Data.Char

-- ----------------------------------------------------------------------
-- * Documentation
  
-- | A token in a parsed UTF8 stream is either a valid Unicode
-- character or an invalid input character. 
data Token = Unicode Char | Invalid Char

-- | Is /c/ of the form 0xxxxxxx?
tok0 :: Char -> Bool
tok0 c = '\x00' <= c && c < '\x80'

-- | Is /c/ of the form 10xxxxxx?
tok1 :: Char -> Bool
tok1 c = '\x80' <= c && c < '\xc0'

-- | Is /c/ of the form 110xxxxx?
tok2 :: Char -> Bool
tok2 c = '\xc0' <= c && c < '\xe0'

-- | Is /c/ of the form 1110xxxx?
tok3 :: Char -> Bool
tok3 c = '\xe0' <= c && c < '\xf0'

-- | Is /c/ of the form 11110xxx?
tok4 :: Char -> Bool
tok4 c = '\xf0' <= c && c < '\xf8'

-- | Parse a UTF8 stream into tokens. Rejects overlong forms and code
-- points above 0x10ffff. Does not check validity of individual
-- unicode code points.

parse_utf8 :: String -> [Token]

-- end of stream
parse_utf8 [] = []

-- 0xxxxxxx
parse_utf8 (c1:cs) 
  | tok0 c1
  = Unicode c1 : parse_utf8 cs

-- 110xxxxx 10xxxxxx
parse_utf8 (c1:c2:cs) 
  | tok2 c1 && tok1 c2 && 
    val >= 0x80
  = Unicode (chr val) : parse_utf8 cs
    where
      val = ((ord c1) .&. 0x1f) `shift` 6 .|. ((ord c2) .&. 0x3f)

-- 1110xxxx 10xxxxxx 10xxxxxx
parse_utf8 (c1:c2:c3:cs) 
  | tok3 c1 && tok1 c2 && tok1 c3 
    && val >= 0x800
  = Unicode (chr val) : parse_utf8 cs
    where
      val = ((ord c1) .&. 0x0f) `shift` 12 .|. ((ord c2) .&. 0x3f) `shift` 6 .|. ((ord c3) .&. 0x3f)

-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
parse_utf8 (c1:c2:c3:c4:cs) 
  | tok4 c1 && tok1 c2 && tok1 c3 && tok1 c4 
    && val >= 0x10000 && val <= 0x10ffff
  = Unicode (chr val) : parse_utf8 cs
    where
      val = ((ord c1) .&. 0x07) `shift` 18 .|. ((ord c2) .&. 0x3f) `shift` 12 .|. ((ord c3) .&. 0x3f) `shift` 6 .|. ((ord c4) .&. 0x3f)
      
-- invalid
parse_utf8 (c:cs) = Invalid c : parse_utf8 cs