{- |
Copyright   : (c) 2024 Pierre Le Marre
Maintainer  : dev@wismill.eu
Stability   : experimental

Parser for properties files:

* [DerivedCoreProperties.txt](https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt)
* [PropList.txt](https://www.unicode.org/reports/tr44/#PropList.txt)
* [DerivedNormalizationProps.txt](https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt)
* [extracted/DerivedCombiningClass.txt](https://www.unicode.org/reports/tr44/#DerivedCombiningClass.txt)

@since 0.1.0
-}
module UCD.Parser.Properties (Entry (..), parse) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.ByteString.Short qualified as BS
import Data.List qualified as L

import UCD.Parser.Common (
  UnicodeRange,
  parseRange,
  pattern HashTag,
  pattern NewLine,
  pattern SemiColon,
 )

-- | An entry from a properties file
--
-- @since 0.1.0
data Entry = Entry
  { _range ∷ !(UnicodeRange ())
  -- ^ Code point or code point range found before the first @;@ of the line.
  , _property ∷ !BS.ShortByteString
  -- ^ Text found between the first @;@ and the trailing @#@ comment (if any),
  -- with surrounding whitespace removed.
  }
  deriving (Eq, Show)

-- | A parser for properties files
--
-- The input is consumed line by line and the entries are produced lazily.
-- Each line is stripped of surrounding whitespace before being examined.
-- Lines for which 'parsePropertyLine' yields no entry (blank lines, comment
-- lines, lines without a @;@ separator) are skipped.
--
-- @since 0.1.0
parse ∷ B.ByteString → [Entry]
parse = L.unfoldr go
 where
  -- Produce the next entry together with the remaining input, skipping every
  -- line that does not contain an entry. Stops once the input is exhausted.
  go ∷ B.ByteString → Maybe (Entry, B.ByteString)
  go raw
    | B.null raw = Nothing
    | otherwise =
        case B.span (/= NewLine) raw of
          -- @B.drop 1@ discards the newline itself; it is total, so a final
          -- line without a terminating newline is handled as well.
          (B8.strip → line, B.drop 1 → raw') →
            case parsePropertyLine line of
              Nothing → go raw'
              Just entry → Just (entry, raw')

-- | Parse a single line of a properties file.
--
-- The line is expected to carry no leading whitespace ('parse' strips it):
-- only the very first byte is checked against @#@ to detect a comment line.
--
-- Returns 'Nothing' when the line:
--
-- * is empty;
-- * starts with @#@ (comment line);
-- * contains no @;@ separator, i.e. has no property field. Such a line is
--   skipped rather than raising the “empty ByteString” error of a partial
--   @tail@.
--
-- Otherwise the text before the first @;@ is handed to 'parseRange' and the
-- text between that @;@ and the first following @#@ becomes the property.
-- Any further @;@-separated fields are therefore kept as part of the property.
parsePropertyLine ∷ B.ByteString → Maybe Entry
parsePropertyLine line
  | B.null line || B.head line == HashTag = Nothing
  | otherwise =
      -- 'B.uncons' is the total way to drop the separator: it tells us at the
      -- same time whether a separator was present at all.
      case B.uncons afterRange of
        Nothing → Nothing
        Just (_separator, afterSeparator) →
          let property = B.takeWhile (/= HashTag) afterSeparator
           in Just
                ( Entry
                    (parseRange (B8.strip rangeField))
                    (BS.toShort (B8.strip property))
                )
 where
  -- Split at the first semicolon: the range field, then the rest of the line
  -- starting with the semicolon (empty when the line has no semicolon).
  (rangeField, afterRange) = B.span (/= SemiColon) line

--------------------------------------------------------------------------------
-- Doctest
--------------------------------------------------------------------------------

{- $
>>> parse "0009..000D ; White_Space # Cc [5] .."
[Entry {_range = CharRange {_first = '\t', _last = '\r', _rangeName = ()}, _property = "White_Space"}]
>>> parse "061C ; Bidi_Control # Cf ARABIC LETTER MARK"
[Entry {_range = SingleChar {_first = '\1564'}, _property = "Bidi_Control"}]
>>> parse "0041"
[]
-}