-- | This module provides a simple parser for UTF8. It converts a -- string of bytes into a list of unicode tokens. Invalid input bytes -- are converted to special 'Invalid' tokens, leaving it up to the -- consuming application to decide what to do with them. module Distribution.Superdoc.UTF8 ( Token (..), parse_utf8, ) where import Data.Bits import Data.Char -- ---------------------------------------------------------------------- -- * Documentation -- | A token in a parsed UTF8 stream is either a valid Unicode -- character or an invalid input character. data Token = Unicode Char | Invalid Char -- | Is /c/ of the form 0xxxxxxx? tok0 :: Char -> Bool tok0 c = '\x00' <= c && c < '\x80' -- | Is /c/ of the form 10xxxxxx? tok1 :: Char -> Bool tok1 c = '\x80' <= c && c < '\xc0' -- | Is /c/ of the form 110xxxxx? tok2 :: Char -> Bool tok2 c = '\xc0' <= c && c < '\xe0' -- | Is /c/ of the form 1110xxxx? tok3 :: Char -> Bool tok3 c = '\xe0' <= c && c < '\xf0' -- | Is /c/ of the form 11110xxx? tok4 :: Char -> Bool tok4 c = '\xf0' <= c && c < '\xf8' -- | Parse a UTF8 stream into tokens. Rejects overlong forms and code -- points above 0x10ffff. Does not check validity of individual -- unicode code points. parse_utf8 :: String -> [Token] -- end of stream parse_utf8 [] = [] -- 0xxxxxxx parse_utf8 (c1:cs) | tok0 c1 = Unicode c1 : parse_utf8 cs -- 110xxxxx 10xxxxxx parse_utf8 (c1:c2:cs) | tok2 c1 && tok1 c2 && val >= 0x80 = Unicode (chr val) : parse_utf8 cs where val = ((ord c1) .&. 0x1f) `shift` 6 .|. ((ord c2) .&. 0x3f) -- 1110xxxx 10xxxxxx 10xxxxxx parse_utf8 (c1:c2:c3:cs) | tok3 c1 && tok1 c2 && tok1 c3 && val >= 0x800 = Unicode (chr val) : parse_utf8 cs where val = ((ord c1) .&. 0x0f) `shift` 12 .|. ((ord c2) .&. 0x3f) `shift` 6 .|. ((ord c3) .&. 0x3f) -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx parse_utf8 (c1:c2:c3:c4:cs) | tok4 c1 && tok1 c2 && tok1 c3 && tok1 c4 && val >= 0x10000 && val <= 0x10ffff = Unicode (chr val) : parse_utf8 cs where val = ((ord c1) .&. 0x07) `shift` 18 .|. ((ord c2) .&. 0x3f) `shift` 12 .|. ((ord c3) .&. 0x3f) `shift` 6 .|. ((ord c4) .&. 0x3f) -- invalid parse_utf8 (c:cs) = Invalid c : parse_utf8 cs