{-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE RecordWildCards #-} ----------------------------------------------------------------------------- -- | -- Module : System.IO.LineIndexedCursor -- Maintainer : i@ak3n.com -- -- Line-indexed file reader. -- -- Lazily builds the index with the line numbers while reading the file -- making it possible to rewind to them quickly later. ----------------------------------------------------------------------------- module System.IO.LineIndexedCursor ( LineIndexedCursor(..) , mkLineIndexedCursor , mkLineIndexedCursorWithCapacity ) where import Data.Maybe (fromMaybe) import qualified Data.Array as A import Data.ByteString (ByteString, hGetLine) import Control.Concurrent.MVar (MVar, newMVar, readMVar, modifyMVar, modifyMVar_) import System.IO (Handle, hTell, hSeek, SeekMode(..), hIsEOF) defaultListCapacity :: Integer defaultListCapacity = 16384 -- | ADT with methods, hiding the internal state. -- -- 'LineIndexedCursor.getCurrentLine', 'LineIndexedCursor.getCurrentLineUnsafe', -- 'LineIndexedCursor.doFullScan', and 'LineIndexedCursor.goToLine', all throw 'System.IO.IOError'. data LineIndexedCursor = LineIndexedCursor { -- | Same as 'LineIndexedCursor.getCurrentLineUnsafe' but safely handles 'System.IO.EOF'. getCurrentLine :: IO (Maybe ByteString) -- | A wrapper around 'hGetLine'. Throws the same exceptions. , getCurrentLineUnsafe :: IO ByteString -- | Returns the current line number. , getCurrentLineNumber :: IO Integer -- | Reads from the latest known line until EOF to build the full index. , doFullScan :: IO () -- | Rewinds the file handle to the requsted line number. Stops at the EOF if it's too big, -- returning the reached line number. , goToLine :: Integer -> IO Integer -- | Returns the file 'Handle'. , getHandle :: Handle -- | Returns the current state of the cursor — all known line indexes. , getCursorState :: IO [Integer] } data CursorHandle = CursorHandle { fileHandle :: Handle , cursorState :: MVar CursorState , listCapacity :: Integer } data CursorState = CursorState { cursorLinesIdx :: ![Integer] , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) -- uses Maybe since can't be empty , cursorIdxSize :: !Integer , cursorCurrentLineNumber :: !Integer } {- | Builds 'LineIndexedCursor'. Resets the file handle's ofsset to the beginning. Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'. -} mkLineIndexedCursor :: Handle -> IO LineIndexedCursor mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity -- | Same as 'mkLineIndexedCursor' but allows to configure the list's capacity. mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor mkLineIndexedCursorWithCapacity fileHandle listCapacity = do -- reset the handle's offset to the beginning hSeek fileHandle AbsoluteSeek 0 cursorState <- newMVar $ CursorState [0] Nothing 0 0 let cursorHandle = CursorHandle fileHandle cursorState listCapacity pure $ LineIndexedCursor { getCurrentLine = getCurrentLine' cursorHandle , getCurrentLineUnsafe = getCurrentLineUnsafe' cursorHandle , getCurrentLineNumber = getCurrentLineNumber' cursorHandle , doFullScan = doFullScan' cursorHandle , goToLine = goToLine' cursorHandle , getHandle = fileHandle , getCursorState = getCursorState' cursorHandle } getCurrentLine' :: CursorHandle -> IO (Maybe ByteString) getCurrentLine' CursorHandle{..} = hIsEOF fileHandle >>= \isEOF -> if isEOF then pure Nothing else do line <- hGetLine fileHandle offset <- hTell fileHandle modifyMVar_ cursorState $ \cs@(CursorState idx arr size cln) -> let latestIdx = getLatestIdx cs in pure $ if (offset <= latestIdx) -- we already know this offset, so just increment the current line number then cs { cursorCurrentLineNumber = cln + 1 } -- otherwise we need to add the offset else let (newIdx, newArr) = -- if we have exceed the list capacity if length (offset : idx) > fromIntegral listCapacity -- move the list content to the array and empty the list then let res = (offset : idx) ++ maybe [] A.elems arr in ([], Just $ A.listArray (0, toInteger $ length res - 1) res) -- otherwise keep the offset in the list else (offset : idx, arr) in CursorState { cursorLinesIdx = newIdx , cursorLinesArrIdx = newArr , cursorIdxSize = size + 1 , cursorCurrentLineNumber = cln + 1 } pure $ Just line getCurrentLineUnsafe' :: CursorHandle -> IO ByteString getCurrentLineUnsafe' ch = do cl <- getCurrentLine' ch pure $ fromMaybe (error "getCurrentLineUnsafe: couldn't get the current line") cl doFullScan' :: CursorHandle -> IO () doFullScan' CursorHandle{..} = do modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do -- go to the end of the index hSeek fileHandle AbsoluteSeek (getLatestIdx cs) -- try to read until the EOF idxTail <- readUntilEOF [] let newSize = size + (fromIntegral $ length idxTail) newState = CursorState { cursorLinesIdx = idxTail ++ idx , cursorLinesArrIdx = arr , cursorIdxSize = newSize , cursorCurrentLineNumber = newSize } pure newState where readUntilEOF idx = hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do _ <- hGetLine fileHandle offset <- hTell fileHandle readUntilEOF (fromInteger offset : idx) getCurrentLineNumber' :: CursorHandle -> IO Integer getCurrentLineNumber' CursorHandle{..} = do CursorState _ _ _ cln <- readMVar cursorState pure cln goToLine' :: CursorHandle -> Integer -> IO Integer goToLine' ch@CursorHandle{..} ln = -- handle negative input if (ln < 0) then getCurrentLineNumber' ch else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do -- if the requested line number is out of the index's scope if ln > size then do -- go to the end of the index hSeek fileHandle AbsoluteSeek (getLatestIdx cs) -- try to read until the requested line number idxTail <- readUntil (ln - size) [] let newSize = size + (fromIntegral $ length idxTail) (newIdx, newArr) = -- if we have exceed the list capacity if newSize > listCapacity -- move the list content to the array and empty the list then let res = (idxTail ++ idx) ++ maybe [] A.elems arr in ([], Just $ A.listArray (0, toInteger $ length res - 1) res) -- otherwise add offsets to the list else (idxTail ++ idx, arr) newState = CursorState { cursorLinesIdx = newIdx , cursorLinesArrIdx = newArr , cursorIdxSize = newSize , cursorCurrentLineNumber = newSize } pure (newState, newSize) -- otherwise access the offset in the cache (list + array) else do let nextSeekIndex = size - ln -- if the seek index is bigger than the current list size if nextSeekIndex >= fromIntegral (length idx) -- try to access the array then case arr of Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx))) Nothing -> error "goToLine: there is no array" -- otherwise take the offset from the list else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex) pure (cs { cursorCurrentLineNumber = ln } , ln) where readUntil 0 idx = pure idx readUntil counter idx = hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do _ <- hGetLine fileHandle offset <- hTell fileHandle readUntil (counter - 1) (fromInteger offset : idx) getCursorState' :: CursorHandle -> IO [Integer] getCursorState' CursorHandle{..} = do CursorState l arr _ _ <- readMVar cursorState pure $ reverse $ l ++ maybe [] A.elems arr -- Utils getLatestIdx :: CursorState -> Integer getLatestIdx (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0 getLatestIdx (CursorState idx Nothing _ _) = idx !! 0