{- |
Copyright : (c) 2024 Pierre Le Marre
Maintainer: dev@wismill.eu
Stability   : experimental

Parser for properties files:

* [DerivedCoreProperties.txt](https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt)
* [PropList.txt](https://www.unicode.org/reports/tr44/#PropList.txt)
* [DerivedNormalizationProps.txt](https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt)
* [extracted/DerivedCombiningClass.txt](https://www.unicode.org/reports/tr44/#DerivedCombiningClass.txt)

@since 0.1.0
-}
module UCD.Parser.Properties (Entry (..), parse) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.ByteString.Short qualified as BS
import Data.List qualified as L
import UCD.Parser.Common (
  UnicodeRange,
  parseRange,
  pattern HashTag,
  pattern NewLine,
  pattern SemiColon,
 )

-- | An entry from a properties file: a code point or range of code points
-- together with the property listed for it on the same line.
--
-- @since 0.1.0
data Entry = Entry
  { _range ∷ !(UnicodeRange ())
  -- ^ Code point(s) the entry applies to, as parsed from the first
  -- semicolon-delimited field of the line.
  , _property ∷ !BS.ShortByteString
  -- ^ Text following the first semicolon, up to any trailing comment
  -- (introduced by a hash sign), with surrounding whitespace stripped.
  }
  deriving (Eq, Show)

-- | Parse the contents of a properties file into its entries, in file order.
--
-- The input is consumed one line at a time; lines that yield no entry
-- (e.g. blank lines and comment lines) are skipped.
--
-- @since 0.1.0
parse ∷ B.ByteString → [Entry]
parse = L.unfoldr nextEntry
 where
  -- Produce the next entry along with the input left to process,
  -- or 'Nothing' once the input is exhausted.
  nextEntry ∷ B.ByteString → Maybe (Entry, B.ByteString)
  nextEntry input
    | B.null input = Nothing
    | otherwise =
        let (rawLine, rest) = B.break (== NewLine) input
            -- Step over the line terminator, if there is one
            remaining = B.drop 1 rest
         in case parsePropertyLine (B8.strip rawLine) of
              Just entry → Just (entry, remaining)
              -- Nothing to report for this line: move on to the next one
              Nothing → nextEntry remaining

-- | Parse a single line of a properties file.
--
-- The line is expected to be already stripped of surrounding whitespace.
--
-- Returns 'Nothing' when the line carries no entry:
--
-- * the line is empty;
-- * the line is a comment, i.e. it starts with a hash sign;
-- * the line has no semicolon separating the range from the property.
--
-- Otherwise the line is split at its first semicolon: the left part is
-- handed to 'parseRange' and the right part, up to an optional trailing
-- comment, is the property. Both parts are stripped of surrounding whitespace.
parsePropertyLine ∷ B.ByteString → Maybe Entry
parsePropertyLine line = case B.uncons line of
  Nothing → Nothing
  Just (HashTag, _) → Nothing
  Just _ → parseLine line
 where
  parseLine ∷ B.ByteString → Maybe Entry
  parseLine raw =
    let (rangeLn, afterRange) = B.span (/= SemiColon) raw
     in -- 'B.uncons' rather than 'B.tail': when the separator is missing,
        -- @afterRange@ is empty and 'B.tail' would throw at runtime.
        case B.uncons afterRange of
          Nothing → Nothing
          Just (_, afterSep) →
            let property = B.takeWhile (/= HashTag) afterSep
             in Just (Entry (parseRange (B8.strip rangeLn)) (BS.toShort (B8.strip property)))

--------------------------------------------------------------------------------
-- Doctest
--------------------------------------------------------------------------------

{- $
>>> parse "0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>"
[Entry {_range = CharRange {_first = '\t', _last = '\r', _rangeName = ()}, _property = "White_Space"}]
>>> parse "061C          ; Bidi_Control # Cf       ARABIC LETTER MARK"
[Entry {_range = SingleChar {_first = '\1564'}, _property = "Bidi_Control"}]
-}