{-| Module: Data.Text.Encoding.ANSEL Description: Decoder for the ANSEL character encoding Copyright: (c) Callum Lowcay, 2017 License: BSD3 Maintainer: cwslowcay@gmail.com Stability: experimental Portability: GHC ANSEL is a character set and associated encodings intended for bibliographic purposes. GEDCOM files use the 8-bit ANSEL encoding by default, so we need a way to decode it. ANSEL has combining diacritics, but they precede the character that they modify (Unicode has it the other way around). This means that the code points must be reordered when converting to Unicode. -} module Data.Text.Encoding.ANSEL ( decodeANSEL ) where import Control.Monad.Loops (whileM) import Control.Monad.State (State, evalState, get, put) import Data.Array import Data.Word import qualified Data.ByteString as B import qualified Data.Text.All as T -- | Decode an ANSEL string to Unicode decodeANSEL :: B.ByteString -- ^ The string to encode -> T.Text -- ^ Unicode text decodeANSEL bs = T.pack . concat . evalState (whileM ((not.null) <$> get) decodeANSELChar)$ B.unpack bs decodeANSELChar :: State [Word8] String decodeANSELChar = do (dias', rest) <- span isDiacritic <$> get let dias = filter (/= '\xFFFD').fmap (combiningTable!)$ dias' case rest of [] -> put [] >> if null dias then return "" else return "\xFFFD" r:rs -> put rs >> let c = encode r in if c == '\xFFFD' then return "\xFFFD" else return$ c:dias where encode r = if isAscii r then toEnum (fromIntegral r) else composedTable!r isAscii :: Word8 -> Bool isAscii x = x < 0x80 isDiacritic :: Word8 -> Bool isDiacritic x = x >= 0xE0 composedTable :: Array Word8 Char composedTable = accumArray (flip const) '\xFFFD' (0x00, 0xFF) [ (0xA1, 'Ł'), (0xA2, 'Ø'), (0xA3, 'Đ'), (0xA4, 'Þ'), (0xA5, 'Æ'), (0xA6, 'Œ'), (0xA7, 'ʹ'), (0xA8, '·'), (0xA9, '♭'), (0xAA, '®'), (0xAB, '±'), (0xAC, 'Ơ'), (0xAD, 'Ư'), (0xAE, 'ʼ'), (0xB0, 'ʻ'), (0xB1, 'ł'), (0xB2, 'ø'), (0xB3, 'đ'), (0xB4, 'þ'), (0xB5, 'æ'), (0xB6, 'œ'), (0xB7, 'ʺ'), (0xB8, 'ı'), (0xB9, '£'), (0xBA, 'ð'), (0xBC, 'ơ'), (0xBD, 'ư'), (0xBE, '□'), (0xBF, '■'), (0xC0, '°'), (0xC1, 'ℓ'), (0xC2, '℗'), (0xC3, '©'), (0xC4, '♯'), (0xC5, '¿'), (0xC6, '¡'), (0xCD, 'e'), (0xCE, 'o'), (0xCF, 'ß')] combiningTable :: Array Word8 Char combiningTable = accumArray (flip const) '\xFFFD' (0x00, 0xFF) [ (0xE0, '\x0309'), (0xE1, '\x0300'), (0xE2, '\x0301'), (0xE3, '\x0302'), (0xE4, '\x0303'), (0xE5, '\x0304'), (0xE6, '\x0306'), (0xE7, '\x0307'), (0xE8, '\x0308'), (0xE9, '\x030C'), (0xEA, '\x030A'), (0xEB, '\xFE20'), (0xEC, '\xFE21'), (0xED, '\x0315'), (0xEE, '\x030B'), (0xEF, '\x0310'), (0xF0, '\x0327'), (0xF1, '\x0328'), (0xF2, '\x0323'), (0xF3, '\x0324'), (0xF4, '\x0325'), (0xF5, '\x0333'), (0xF6, '\x0332'), (0xF7, '\x0326'), (0xF8, '\x031C'), (0xF9, '\x032E'), (0xFA, '\xFE22'), (0xFB, '\xFE23'), (0xFE, '\x0313'), (0xFF, '\x0338')]