--
-- Line-indexed file reader.
--
--- Lazily builds the index of line numbers while reading the file
+-- Lazily builds the index with the line numbers while reading the file
-- making it possible to rewind to them quickly later.
-----------------------------------------------------------------------------
-module System.IO.LineIndexedCursor (
- LineIndexedCursor(..), mkLineIndexedCursor, mkLineIndexedCursorWithCapacity
+module System.IO.LineIndexedCursor
+ ( LineIndexedCursor(..)
+ , mkLineIndexedCursor
+ , mkLineIndexedCursorWithCapacity
) where
+import Data.Maybe (fromMaybe)
import qualified Data.Array as A
import Data.ByteString (ByteString, hGetLine)
import Control.Concurrent.MVar
defaultListCapacity = 16384
-- | ADT with methods, hiding the internal state.
+--
+-- 'LineIndexedCursor.getCurrentLine', 'LineIndexedCursor.getCurrentLineUnsafe',
+-- 'LineIndexedCursor.doFullScan', and 'LineIndexedCursor.goToLine', all throw 'System.IO.IOError'.
data LineIndexedCursor = LineIndexedCursor
{
- -- | Same as 'hGetLine' but safe.
+ -- | Same as 'LineIndexedCursor.getCurrentLineUnsafe' but safely handles 'System.IO.EOF'.
getCurrentLine :: IO (Maybe ByteString)
- -- | Returns current line number.
+ -- | A wrapper around 'hGetLine'. Throws the same exceptions.
+ , getCurrentLineUnsafe :: IO ByteString
+
+ -- | Returns the current line number.
, getCurrentLineNumber :: IO Integer
- -- | Reads from the latest line from index until EOF to build the full index.
+ -- | Reads from the latest known line until EOF to build the full index.
, doFullScan :: IO ()
- -- | Rewinds to the requsted line number. Stops at EOF if it's too big.
- -- Returns the reached line number.
+ -- | Rewinds the file handle to the requsted line number. Stops at the EOF if it's too big,
+ -- returning the reached line number.
, goToLine :: Integer -> IO Integer
-- | Returns the file 'Handle'.
, getHandle :: Handle
+
+ -- | Returns the current state of the cursor — all known line indexes.
+ , getCursorState :: IO [Integer]
}
data CursorHandle = CursorHandle
data CursorState = CursorState
{ cursorLinesIdx :: ![Integer]
- , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer))
+ , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) -- uses Maybe since can't be empty
, cursorIdxSize :: !Integer
, cursorCurrentLineNumber :: !Integer
}
-mElems :: (Maybe (A.Array Integer Integer)) -> [Integer]
-mElems = maybe [] A.elems
-
{- |
-Builds 'LineIndexedCursor'.
-
-Resets the file handle's ofsset to the beginning.
+Builds 'LineIndexedCursor'. Resets the file handle's ofsset to the beginning.
Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'.
mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
+-- | Same as 'mkLineIndexedCursor' but allows to configure the list's capacity.
mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
-- reset the handle's offset to the beginning
let cursorHandle = CursorHandle fileHandle cursorState listCapacity
pure $ LineIndexedCursor
{ getCurrentLine = getCurrentLine' cursorHandle
+ , getCurrentLineUnsafe = getCurrentLineUnsafe' cursorHandle
, getCurrentLineNumber = getCurrentLineNumber' cursorHandle
, doFullScan = doFullScan' cursorHandle
, goToLine = goToLine' cursorHandle
, getHandle = fileHandle
+ , getCursorState = getCursorState' cursorHandle
}
getCurrentLine' :: CursorHandle -> IO (Maybe ByteString)
line <- hGetLine fileHandle
offset <- hTell fileHandle
- modifyMVar_ cursorState $ \(CursorState idx arr size cln) -> pure $
- if (not $ offset `elem` idx)
- then
- let
- (newIdx, newArr) =
- if length (offset : idx) > fromIntegral listCapacity
- then
- let res = (offset : idx) ++ mElems arr
- in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
- else (offset : idx, arr)
- in CursorState
- { cursorLinesIdx = newIdx
- , cursorLinesArrIdx = newArr
- , cursorIdxSize = size + 1
- , cursorCurrentLineNumber = cln + 1
- }
- else CursorState
- { cursorLinesIdx = idx
- , cursorLinesArrIdx = arr
- , cursorIdxSize = size
- , cursorCurrentLineNumber = cln + 1
- }
+ modifyMVar_ cursorState $ \cs@(CursorState idx arr size cln) ->
+ let latestIdx = getLatestIdx cs
+ in pure $
+ if (offset <= latestIdx)
+ -- we already know this offset, so just increment the current line number
+ then cs { cursorCurrentLineNumber = cln + 1 }
+ -- otherwise we need to add the offset
+ else
+ let
+ (newIdx, newArr) =
+ -- if we have exceed the list capacity
+ if length (offset : idx) > fromIntegral listCapacity
+ -- move the list content to the array and empty the list
+ then
+ let res = (offset : idx) ++ maybe [] A.elems arr
+ in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
+ -- otherwise keep the offset in the list
+ else (offset : idx, arr)
+ in CursorState
+ { cursorLinesIdx = newIdx
+ , cursorLinesArrIdx = newArr
+ , cursorIdxSize = size + 1
+ , cursorCurrentLineNumber = cln + 1
+ }
pure $ Just line
+getCurrentLineUnsafe' :: CursorHandle -> IO ByteString
+getCurrentLineUnsafe' ch = do
+ cl <- getCurrentLine' ch
+ pure $ fromMaybe (error "getCurrentLineUnsafe: couldn't get the current line") cl
+
doFullScan' :: CursorHandle -> IO ()
doFullScan' CursorHandle{..} = do
modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
-- go to the end of the index
- hSeek fileHandle AbsoluteSeek (getFirst cs)
+ hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
-- try to read until the EOF
idxTail <- readUntilEOF []
let
goToLine' :: CursorHandle -> Integer -> IO Integer
goToLine' ch@CursorHandle{..} ln =
+ -- handle negative input
if (ln < 0) then getCurrentLineNumber' ch
else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
+ -- if the requested line number is out of the index's scope
if ln > size then do
- hSeek fileHandle AbsoluteSeek (getFirst cs)
+ -- go to the end of the index
+ hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
-- try to read until the requested line number
idxTail <- readUntil (ln - size) []
let
newSize = size + (fromIntegral $ length idxTail)
(newIdx, newArr) =
+ -- if we have exceed the list capacity
if newSize > listCapacity
+ -- move the list content to the array and empty the list
then
- let res = (idxTail ++ idx) ++ mElems arr
+ let res = (idxTail ++ idx) ++ maybe [] A.elems arr
in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
+ -- otherwise add offsets to the list
else (idxTail ++ idx, arr)
newState = CursorState
{ cursorLinesIdx = newIdx
, cursorCurrentLineNumber = newSize
}
pure (newState, newSize)
+ -- otherwise access the offset in the cache (list + array)
else do
let nextSeekIndex = size - ln
-
+ -- if the seek index is bigger than the current list size
if nextSeekIndex >= fromIntegral (length idx)
+ -- try to access the array
then case arr of
Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
Nothing -> error "goToLine: there is no array"
+ -- otherwise take the offset from the list
else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
- let
- newState = CursorState
- { cursorLinesIdx = idx
- , cursorLinesArrIdx = arr
- , cursorIdxSize = size
- , cursorCurrentLineNumber = ln
- }
- pure (newState, ln)
+ pure (cs { cursorCurrentLineNumber = ln } , ln)
where
readUntil 0 idx = pure idx
readUntil counter idx =
offset <- hTell fileHandle
readUntil (counter - 1) (fromInteger offset : idx)
-getFirst :: CursorState -> Integer
-getFirst (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
-getFirst (CursorState idx Nothing _ _) = idx !! 0
\ No newline at end of file
+getCursorState' :: CursorHandle -> IO [Integer]
+getCursorState' CursorHandle{..} = do
+ CursorState l arr _ _ <- readMVar cursorState
+ pure $ reverse $ l ++ maybe [] A.elems arr
+
+-- Utils
+
+getLatestIdx :: CursorState -> Integer
+getLatestIdx (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
+getLatestIdx (CursorState idx Nothing _ _) = idx !! 0