module Duckling.Types.Document
  ( Document 
  , fromText
  , (!)
  , length
  , byteStringFromPos
  , isAdjacent
  , isRangeValid
  ) where
import Data.Array.Unboxed (UArray)
import Data.ByteString (ByteString)
import Data.List (scanl', foldl', foldr)
import Data.String
import Data.Text (Text)
import Prelude hiding (length)
import qualified Data.Array.Unboxed as Array
import qualified Data.ByteString as BS
import qualified Data.Char as Char
import qualified Data.Text.Unsafe as UText
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Text.Internal.Unsafe.Char as UText
data Document = Document
  { rawInput :: !Text
  , utf8Encoded :: ByteString
  , indexable :: UArray Int Char 
  , firstNonAdjacent :: UArray Int Int
    
    
    
    
    
    
  , tDropToBSDrop :: UArray Int Int
    
    
  , bsDropToTDrop :: UArray Int Int
    
    
    
    
    
    
  , tDropToUtf16Drop :: UArray Int Int
    
  } deriving (Show)
instance IsString Document where
  fromString = fromText . fromString
fromText :: Text -> Document
fromText rawInput = Document{..}
  where
  utf8Encoded = Text.encodeUtf8 rawInput
  rawInputLength = Text.length rawInput
  unpacked = Text.unpack rawInput
  indexable = Array.listArray (0, rawInputLength  1) unpacked
  firstNonAdjacent = Array.listArray (0, rawInputLength  1) $ snd $
   foldr gen (rawInputLength, []) $ zip [0..] unpacked
  
  gen (ix, elem) (best, !acc)
    | isAdjacentSeparator elem = (best, best:acc)
    | otherwise = (ix, ix:acc)
  tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
  tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
  tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
    scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
  bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
    reverse $ snd $ foldl' fun (1, []) $ zip [0..] tDropToBSDropList
  fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem  lastPos) ix ++ acc)
  utf8CharWidth c
    | w <= 0x7F = 1
    | w <= 0x7FF = 2
    | w <= 0xFFFF = 3
    | otherwise = 4
    where
    w = UText.ord c
  utf16CharWidth c
    | w < 0x10000 = 1
    | otherwise = 2
    where
    w = UText.ord c
isRangeValid :: Document -> Int -> Int -> Bool
isRangeValid doc start end =
  (start == 0 ||
      isDifferent (doc ! (start  1)) (doc ! start)) &&
  (end == length doc ||
      isDifferent (doc ! (end  1)) (doc ! end))
  where
    charClass :: Char -> Char
    charClass c
      | Char.isLower c || Char.isUpper c = 'c'
      | Char.isDigit c = 'd'
      | otherwise = c
    isDifferent :: Char -> Char -> Bool
    isDifferent a b = charClass a /= charClass b
isAdjacent :: Document -> Int -> Int -> Bool
isAdjacent Document{..} a b =
  b >= a && (firstNonAdjacent Array.! a >= b)
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator c = elem c [' ', '\t', '-']
(!) :: Document -> Int -> Char
(!) Document { indexable = s } ix = s Array.! ix
length :: Document -> Int
length Document { indexable = s } = Array.rangeSize $ Array.bounds s
byteStringFromPos
  :: Document
  -> Int
  -> ( ByteString
     , (Int, Int) -> Text
     , Int -> Int -> (Int, Int)
     )
byteStringFromPos
  Document { rawInput = rawInput
           , utf8Encoded = utf8Encoded
           , tDropToBSDrop = tDropToBSDrop
           , bsDropToTDrop = bsDropToTDrop
           , tDropToUtf16Drop = tDropToUtf16Drop
           }
  position = (substring, rangeToText, translateRange)
  where
  
  
  utf8Position = tDropToBSDrop Array.! position
  substring :: ByteString
  substring = BS.drop utf8Position utf8Encoded
  
  
  rangeToText :: (Int, Int) -> Text
  rangeToText (1, _) = ""
  
  rangeToText r = UText.takeWord16 (end16Pos  start16Pos) $
    UText.dropWord16 start16Pos rawInput
    where
    start16Pos = tDropToUtf16Drop Array.! startPos
    end16Pos = tDropToUtf16Drop Array.! endPos
    (startPos, endPos) = uncurry translateRange r
  
  translateRange :: Int -> Int -> (Int, Int)
  translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
    where
    res = (startPos, endPos)
    realBsStart = utf8Position + bsStart
    realBsEnd = realBsStart + bsLen
    startPos = bsDropToTDrop Array.! realBsStart
    endPos = bsDropToTDrop Array.! realBsEnd