{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Regex.Internal.Num
  ( mkNaturalDec
  , mkWordDecN
  , mkWordRangeDec
  , mkNaturalHex
  , mkWordHexN
  , mkWordRangeHex
  , mkSignedInteger
  , mkSignedIntRange
  ) where

#include "MachDeps.h"

import Control.Applicative
import Control.Monad
import Data.Primitive.PrimArray
import Data.Bits
import Numeric.Natural

import GHC.Num.Natural as Nat

import Regex.Internal.Regex (RE)
import qualified Regex.Internal.Regex as R

mkNaturalDec
  :: (Word -> Word -> RE c Word)  -- Decimal digit range
  -> RE c Natural
mkNaturalDec d =
      0 <$ d 0 0
  <|> liftA2 finishDec (d 1 9) (R.foldlMany' stepDec state0 (d 0 9))
  where
    state0 = NatParseState 0 1 WNil
    -- Start with len=1, it's reserved for the leading digit
{-# INLINE mkNaturalDec #-}

mkNaturalHex
  :: (Word -> Word -> RE c Word)  -- Hexadecimal digit range
  -> RE c Natural
mkNaturalHex d =
      0 <$ d 0 0
  <|> liftA2 finishHex (d 1 15) (R.foldlMany' stepHex state0 (d 0 15))
  where
    state0 = NatParseState 0 1 WNil
    -- Start with len=1, it's reserved for the leading digit
{-# INLINE mkNaturalHex #-}

mkSignedInteger :: RE c minus -> RE c plus -> RE c Natural -> RE c Integer
mkSignedInteger minus plus rnat = signed <*> rnat
  where
    signed = negate . fromIntegral <$ minus
         <|> fromIntegral <$ plus
         <|> pure fromIntegral

mkWordDecN
  :: (Word -> Word -> RE c Word)  -- Decimal digit range
  -> Int
  -> RE c Word
mkWordDecN d n0
  | n0 <= 0 = empty
  | maxBoundWordDecLen <= n0 =
      replicateM_ (n0 - maxBoundWordDecLen) d00 *>
      (    d00 *> go (maxBoundWordDecLen - 1)
       <|> mkWordRangeDec d (pow10 safeWordDecLen, maxBound) )
  | otherwise = go n0
  where
    go 1 = d09
    go n = R.liftA2' (\x y -> x * 10 + y) (go (n-1)) d09
    d00 = d 0 0
    d09 = d 0 9
{-# INLINE mkWordDecN #-}

mkWordHexN
  :: (Word -> Word -> RE c Word)  -- Hexadecimal digit range
  -> Int
  -> RE c Word
mkWordHexN d n0
  | n0 <= 0 = empty
  | maxBoundWordHexLen < n0 =
      replicateM_ (n0 - maxBoundWordHexLen) d00 *> go maxBoundWordHexLen
  | otherwise = go n0
  where
    go 1 = d0f
    go n = R.liftA2' (\x y -> x * 16 + y) (go (n-1)) d0f
    d00 = d 0 0
    d0f = d 0 15
{-# INLINE mkWordHexN #-}

mkWordRangeDec
  :: (Word -> Word -> RE c Word)  -- Decimal digit range
  -> (Word, Word)  -- Low high
  -> RE c Word
mkWordRangeDec d (l,h) = mkWordRangeBase 10 quotRemPow10 pow10 len10 d l h
  where
    quotRemPow10 i x = x `quotRem` pow10 i
{-# INLINE mkWordRangeDec #-}

mkWordRangeHex
  :: (Word -> Word -> RE c Word)  -- Hexadecimal digit range
  -> (Word, Word)  -- Low high
  -> RE c Word
mkWordRangeHex d (l,h) = mkWordRangeBase 16 quotRemPow16 pow16 len16 d l h
  where
    quotRemPow16 i x = (x `unsafeShiftR` (4*i), x .&. (pow16 i - 1))
{-# INLINE mkWordRangeHex #-}

mkSignedIntRange
  :: RE c minus
  -> RE c plus
  -> ((Word, Word) -> RE c Word)  -- Word range
  -> (Int, Int)  -- Low high
  -> RE c Int
mkSignedIntRange minus plus wordRangeDec (low,high) = case (negR, nonNegR) of
  (Nothing, Nothing) -> empty
  (Nothing, Just r2) -> r2
  (Just r1, Nothing) -> r1
  (Just r1, Just r2) -> r1 <|> r2
  where
    negR
      | low > 0 = Nothing
      | otherwise = Just $
        minus *>
        R.fmap' (negate . fromIntegral)
                (wordRangeDec (absw (min 0 high), absw low))
    nonNegR
      | high < 0 = Nothing
      | otherwise = Just $
        (void plus <|> pure ()) *>
        R.fmap' fromIntegral
                (wordRangeDec (fromIntegral (max 0 low), fromIntegral high))
{-# INLINE mkSignedIntRange #-}

absw :: Int -> Word
absw x = if x == minBound
         then fromIntegral (abs (x+1) + 1)
         else fromIntegral (abs x)

-------------------
-- Parsing ranges
-------------------

-- Make a tree based on the range. Keep the tree size small where possible.
-- This is hard to explain in words, so see here for some pictures:
-- https://github.com/meooow25/parser-regex/wiki/Visualizations#int-range

mkWordRangeBase
  :: forall c.
     Word  -- Base
  -> (Int -> Word -> (Word, Word)) -- quotRemPowBase
  -> (Int -> Word)  -- powBase
  -> (Word -> Int)  -- baseLen
  -> (Word -> Word -> RE c Word)  -- Decimal digit range
  -> Word  -- Low
  -> Word  -- High
  -> RE c Word
mkWordRangeBase _ _ _ _ _ low high | low > high = empty
mkWordRangeBase base quotRemPowBase powBase baseLen d low high
  = goTop (baseLen high - 1) True low high
  where
    goTop :: Int -> Bool -> Word -> Word -> RE c Word
    goTop 0 _ l h = d l h
    goTop i lz l h
      | dl == dh       = leading pBase dh dh (goTop (i-1) False l' h')
      | fullL && fullH = leading pBase dl dh (goFull (i-1))
      | fullH          = leading pBase (dl+1) dh (goFull (i-1)) <|> reL
      | fullL          = reH <|> leading pBase dl (dh-1) (goFull (i-1))
      | dl + 1 == dh   = reH <|> reL
      | otherwise      = reH <|> reM <|> reL
      where
        pBase = powBase i
        (dl,l') = quotRemPowBase i l
        (dh,h') = quotRemPowBase i h
        lz' = lz && dl == 0
        fullL = not lz' && l' == 0
        fullH = h' + 1 == pBase
        reL = if lz'
              then goL (i-1) True l'
              else leading pBase dl dl (goL (i-1) False l')
        reH = leading pBase dh dh (goH (i-1) h')
        reM = leading pBase (dl+1) (dh-1) (goFull (i-1))

    goL :: Int -> Bool -> Word -> RE c Word
    goL 0 _ l = d l (base-1)
    goL i lz l
      | not lz && l == 0 = goFull i
      | dl == base-1     = reL
      | otherwise        = reM <|> reL
      where
        pBase = powBase i
        (dl,l') = quotRemPowBase i l
        reL = if lz && dl == 0
              then goL (i-1) True l'
              else leading pBase dl dl (goL (i-1) False l')
        reM = leading pBase (dl+1) (base-1) (goFull (i-1))

    goH :: Int -> Word -> RE c Word
    goH 0 h = d 0 h
    goH i h
      | h + 1 == pBase * base = goFull i
      | dh == 0               = reH
      | otherwise             = reH <|> reM
      where
        pBase = powBase i
        (dh,h') = quotRemPowBase i h
        reH = leading pBase dh dh (goH (i-1) h')
        reM = leading pBase 0 (dh-1) (goFull (i-1))

    goFull :: Int -> RE c Word
    goFull 0 = d 0 (base-1)
    goFull i = leading (powBase i) 0 (base-1) (goFull (i-1))

    leading :: Word -> Word -> Word -> RE c Word -> RE c Word
    leading !pBase dl dh = R.liftA2' (\x y -> x * pBase + y) (d dl dh)
{-# INLINE mkWordRangeBase #-}

---------------------------------
-- Parsing hexadecimal Naturals
---------------------------------

-- Parsing hexadecimal is simple, there is no base conversion involved.
--
-- Step 1: Accumulate the hex digits, packed into Words
-- Step 2: Initialize a ByteArray and fill it with the Words
--
-- Because we create a Nat directly, this makes us depend on ghc-bignum and
-- GHC>=9.0.

stepHex :: NatParseState -> Word -> NatParseState
stepHex (NatParseState acc len ns) d
  | len < maxBoundWordHexLen = NatParseState (acc*16 + d) (len+1) ns
  | otherwise = NatParseState d 1 (WCons acc ns)

finishHex
  :: Word          -- ^ Leading digit
  -> NatParseState -- ^ Everything else
  -> Natural
finishHex !ld (NatParseState acc0 len0 ns0) = case ns0 of
  WNil -> Nat.naturalFromWord (ld `unsafeShiftL` (4*(len0-1)) + acc0)
  WCons n ns1 ->
    let lns = lengthWList ns1 + 2
        wsz = WORD_SIZE_IN_BITS
        !(PrimArray byteArray) = runPrimArray $ do
          ma <- newPrimArray lns
          if len0 == maxBoundWordHexLen
          then do
            let go i n1 WNil = do
                  let n1' = ld `unsafeShiftL` (4*(maxBoundWordHexLen-1)) + n1
                  writePrimArray ma i n1'
                go i n1 (WCons n2 ns2) = do
                  writePrimArray ma i n1
                  go (i+1) n2 ns2
            writePrimArray ma 0 acc0
            go 1 n ns1
          else do
            let go i prv n1 WNil = do
                  let n1' = ld `unsafeShiftL` (4*(maxBoundWordHexLen-1)) + n1
                  writePrimArray ma i (prv + n1' `unsafeShiftL` (4*len0))
                  writePrimArray ma (i+1) (n1' `unsafeShiftR` (wsz - 4*len0))
                go i prv n1 (WCons n2 ns2) = do
                  writePrimArray ma i (prv + n1 `unsafeShiftL` (4*len0))
                  go (i+1) (n1 `unsafeShiftR` (wsz - 4*len0)) n2 ns2
            go 0 acc0 n ns1
          pure ma
    in Nat.NB byteArray
-- finishHex does a bunch of unsafe stuff, so make sure things are correct:
-- * Bit shifts are in [0..wsz-1]
-- * Natural invariants:
--   * If the value fits in a word, it must be NS (via naturalFromWord here).
--   * Otherwise, use a ByteArray# with NB. The highest Word must not be 0.

-----------------------------
-- Parsing decimal Naturals
-----------------------------

-- The implementation below is adapted from the bytestring package.
-- https://github.com/haskell/bytestring/blob/7e11412b9bfb13bcd6b8e7c04765b8f5bd90fd34/Data/ByteString/Lazy/ReadNat.hs
--
-- Step 1: Accumulate the digits, packed into Words.
-- Step 2: Combine the packed Words bottom-up into the result. This is what
--         makes it better than foldl (\acc d -> acc * 10 + d)).
--
-- The obvious foldl approach is O(n^2) for n digits. The combine approach
-- performs O(n/2^i) multiplications of size O(2^i), for i in [0..log_2(n)].
-- If multiplication is O(n^k), this is also O(n^k). We have k < 2,
-- thanks to subquadratic multiplication of GMP-backed Naturals:
-- https://gmplib.org/manual/Multiplication-Algorithms.
--
-- For reference, here's how GMP converts any base (including 10) to a natural
-- using broadly the same approach.
-- https://github.com/alisw/GMP/blob/2bbd52703e5af82509773264bfbd20ff8464804f/mpn/generic/set_str.c

stepDec :: NatParseState -> Word -> NatParseState
stepDec (NatParseState acc len ns) d
  | len < safeWordDecLen = NatParseState (10*acc + d) (len+1) ns
  | otherwise = NatParseState d 1 (WCons acc ns)

finishDec
  :: Word          -- ^ Leading digit
  -> NatParseState -- ^ Everything else
  -> Natural
finishDec !ld (NatParseState acc0 len0 ns0) = combine acc0 len0 ns0
  where
    combine !acc !len ns = case ns of
      WNil -> w2n (10^(len-1) * ld + acc)
      WCons n ns1 -> 10^len * combine1 safeBaseDec (go n ns1) + w2n acc
      where
        go n WNil = let !n' = w2n (highMulDec * ld + n) in [n']
        go n (WCons m WNil) =
          let !n' = w2n (highMulDec * ld + m) * safeBaseDec + w2n n in [n']
        go n (WCons m (WCons n1 ns1)) =
          let !n' = w2n m * safeBaseDec + w2n n in n' : go n1 ns1

    combine1 _ [n] = n
    combine1 base ns1 = combine1 base1 (go ns1)
      where
        !base1 = base * base
        go (n:m:ns) = let !n' = m * base1 + n in n' : go ns
        go ns = ns

w2n :: Word -> Natural
w2n = fromIntegral

safeBaseDec :: Natural
safeBaseDec = fromIntegral (pow10 safeWordDecLen)

highMulDec :: Word
highMulDec = pow10 (safeWordDecLen - 1)

---------------------------
-- Common Natural parsing
---------------------------

data WList = WCons {-# UNPACK #-} !Word !WList | WNil

data NatParseState = NatParseState
  {-# UNPACK #-} !Word      -- ^ acc
  {-# UNPACK #-} !Int       -- ^ length of acc in some base
                 !WList     -- ^ accs, little endian

lengthWList :: WList -> Int
lengthWList = go 0
  where
    go !acc WNil = acc
    go acc (WCons _ ns) = go (acc+1) ns

--------------------
-- Low level stuff
--------------------

-- | Length in base 16.
len16 :: Word -> Int
len16 0 = 1
len16 x = maxBoundWordHexLen - (countLeadingZeros x `div` 4)

-- | 16^i. i must not be large enough to overflow a Word.
pow16 :: Int -> Word
pow16 i = 1 `unsafeShiftL` (4*i)

-- | Length in base 10.
len10 :: Word -> Int
len10 x = go 1 1
  where
    x' = x `quot` 10
    go p i | x' < p = i
    go p i = go (p*10) (i+1)

-- | "999..." repeated safeWordDecLen times is guaranteed to fit in a Word.
safeWordDecLen :: Int

-- | Decimal length of (maxBound :: Word)
maxBoundWordDecLen :: Int

-- | Hexadecimal length of (maxBound :: Word)
maxBoundWordHexLen :: Int

-- | 10^i. i must not be large enough to overflow a Word.
pow10 :: Int -> Word

#if WORD_SIZE_IN_BITS == 32 || WORD_SIZE_IN_BITS == 64

#if WORD_SIZE_IN_BITS == 64
safeWordDecLen = 19
maxBoundWordDecLen = 20
maxBoundWordHexLen = 16
#else
safeWordDecLen = 9
maxBoundWordDecLen = 10
maxBoundWordHexLen = 8
#endif

pow10 p = case p of
  0 -> 1
  1 -> 10
  2 -> 100
  3 -> 1000
  4 -> 10000
  5 -> 100000
  6 -> 1000000
  7 -> 10000000
  8 -> 100000000
  9 -> 1000000000
#if WORD_SIZE_IN_BITS == 64
  10 -> 10000000000
  11 -> 100000000000
  12 -> 1000000000000
  13 -> 10000000000000
  14 -> 100000000000000
  15 -> 1000000000000000
  16 -> 10000000000000000
  17 -> 100000000000000000
  18 -> 1000000000000000000
  19 -> 10000000000000000000
#endif
  _ -> errorWithoutStackTrace "Regex.Internal.Int.pow10: p too large"
#else
#error "unsupported word size"
#endif