From 526b2feec59a97d75cef90e068ff463c01f7039c Mon Sep 17 00:00:00 2001 From: Evgenii Akentev Date: Fri, 21 Jul 2023 07:17:31 +0400 Subject: [PATCH] Add benchmark + array based implementation --- bench/Bench.hs | 54 +++++++++++ line-indexed-cursor.cabal | 18 +++- src/System/IO/LineIndexedCursor.hs | 128 ++++++++++++++++++++++---- test/Main.hs | 143 ++++++++++++++++++----------- 4 files changed, 271 insertions(+), 72 deletions(-) create mode 100644 bench/Bench.hs diff --git a/bench/Bench.hs b/bench/Bench.hs new file mode 100644 index 0000000..a6d2cdc --- /dev/null +++ b/bench/Bench.hs @@ -0,0 +1,54 @@ +module Main (main) where + +import Criterion.Main +import System.IO +import System.Random (randomRs, mkStdGen) + +import System.IO.LineIndexedCursor + +main :: IO () +main = do + -- benchmarked against http://mattmahoney.net/dc/enwik9.zip + h <- openFile "test/enwik9" ReadMode + c <- mkLineIndexedCursor h + defaultMain + [ bgroup "LineIndexedCursor" + [ bench "" (nfIO $ goToLinesBench c) + ] + ] + +goToLinesBench :: LineIndexedCursor -> IO () +goToLinesBench c = do + mapM_ (goToLine c) randomOffsets + +randomOffsets :: [Integer] +randomOffsets = take 100 $ + randomRs (0, 13147025) (mkStdGen 343) +{-# NOINLINE randomOffsets #-} + + +{- +Array based results + +benchmarking LineIndexedCursor/ +time 115.8 μs (115.3 μs .. 116.4 μs) + 1.000 R² (1.000 R² .. 1.000 R²) +mean 116.4 μs (115.9 μs .. 117.0 μs) +std dev 1.782 μs (1.510 μs .. 2.271 μs) + +Benchmark bench: FINISH +-} + + +{- +List based results + +benchmarking LineIndexedCursor/ +time 15.13 s (9.490 s .. 22.09 s) + 0.976 R² (0.926 R² .. 1.000 R²) +mean 20.05 s (17.70 s .. 23.47 s) +std dev 3.576 s (624.5 ms .. 4.685 s) +variance introduced by outliers: 47% (moderately inflated) + +Benchmark bench: FINISH +-} \ No newline at end of file diff --git a/line-indexed-cursor.cabal b/line-indexed-cursor.cabal index 3aaec61..0cbf8fd 100644 --- a/line-indexed-cursor.cabal +++ b/line-indexed-cursor.cabal @@ -19,7 +19,7 @@ common warnings library import: warnings exposed-modules: System.IO.LineIndexedCursor - build-depends: base ^>=4.18.0.0, bytestring ^>= 0.11 + build-depends: base >= 4.7 && < 5, bytestring ^>= 0.11, array hs-source-dirs: src default-language: Haskell2010 @@ -33,3 +33,19 @@ test-suite line-indexed-cursor-test base >= 4.7 && < 5, hspec >= 2.10 && < 3, line-indexed-cursor + +benchmark bench + import: warnings + ghc-options: + -threaded + -rtsopts + -with-rtsopts=-N + default-language: Haskell2010 + type: exitcode-stdio-1.0 + hs-source-dirs: bench + main-is: Bench.hs + build-depends: + base >= 4.7 && < 5, + criterion, + random, + line-indexed-cursor diff --git a/src/System/IO/LineIndexedCursor.hs b/src/System/IO/LineIndexedCursor.hs index 10e2819..5d550eb 100644 --- a/src/System/IO/LineIndexedCursor.hs +++ b/src/System/IO/LineIndexedCursor.hs @@ -13,13 +13,17 @@ ----------------------------------------------------------------------------- module System.IO.LineIndexedCursor ( - LineIndexedCursor(..), mkLineIndexedCursor + LineIndexedCursor(..), mkLineIndexedCursor, mkLineIndexedCursorWithCapacity ) where +import qualified Data.Array as A import Data.ByteString (ByteString, hGetLine) import Control.Concurrent.MVar import System.IO (Handle, hTell, hSeek, SeekMode(..), hIsEOF) +defaultListCapacity :: Integer +defaultListCapacity = 16384 + -- | ADT with methods, hiding the internal state. data LineIndexedCursor = LineIndexedCursor { @@ -29,6 +33,9 @@ data LineIndexedCursor = LineIndexedCursor -- | Returns current line number. , getCurrentLineNumber :: IO Integer + -- | Reads from the latest line from index until EOF to build the full index. + , doFullScan :: IO () + -- | Rewinds to the requsted line number. Stops at EOF if it's too big. -- Returns the reached line number. , goToLine :: Integer -> IO Integer @@ -39,9 +46,20 @@ data LineIndexedCursor = LineIndexedCursor data CursorHandle = CursorHandle { fileHandle :: Handle - , linesIdx :: MVar ([Integer], Integer, Integer) + , cursorState :: MVar CursorState + , listCapacity :: Integer + } + +data CursorState = CursorState + { cursorLinesIdx :: ![Integer] + , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) + , cursorIdxSize :: !Integer + , cursorCurrentLineNumber :: !Integer } +mElems :: (Maybe (A.Array Integer Integer)) -> [Integer] +mElems = maybe [] A.elems + {- | Builds 'LineIndexedCursor'. @@ -52,16 +70,20 @@ Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode' -} mkLineIndexedCursor :: Handle -> IO LineIndexedCursor -mkLineIndexedCursor fileHandle = do +mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity + +mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor +mkLineIndexedCursorWithCapacity fileHandle listCapacity = do -- reset the handle's offset to the beginning hSeek fileHandle AbsoluteSeek 0 - linesIdx <- newMVar ([0], 0, 0) + cursorState <- newMVar $ CursorState [0] Nothing 0 0 - let cursorHandle = CursorHandle fileHandle linesIdx + let cursorHandle = CursorHandle fileHandle cursorState listCapacity pure $ LineIndexedCursor { getCurrentLine = getCurrentLine' cursorHandle , getCurrentLineNumber = getCurrentLineNumber' cursorHandle + , doFullScan = doFullScan' cursorHandle , goToLine = goToLine' cursorHandle , getHandle = fileHandle } @@ -71,35 +93,107 @@ getCurrentLine' CursorHandle{..} = hIsEOF fileHandle >>= \isEOF -> if isEOF then pure Nothing else do line <- hGetLine fileHandle offset <- hTell fileHandle - modifyMVar_ linesIdx $ \(idx, size, cln) -> pure $ + + modifyMVar_ cursorState $ \(CursorState idx arr size cln) -> pure $ if (not $ offset `elem` idx) - then (offset : idx, size + 1, cln + 1) - else (idx, size, cln + 1) + then + let + (newIdx, newArr) = + if length (offset : idx) > fromIntegral listCapacity + then + let res = (offset : idx) ++ mElems arr + in ([], Just $ A.listArray (0, toInteger $ length res - 1) res) + else (offset : idx, arr) + in CursorState + { cursorLinesIdx = newIdx + , cursorLinesArrIdx = newArr + , cursorIdxSize = size + 1 + , cursorCurrentLineNumber = cln + 1 + } + else CursorState + { cursorLinesIdx = idx + , cursorLinesArrIdx = arr + , cursorIdxSize = size + , cursorCurrentLineNumber = cln + 1 + } pure $ Just line +doFullScan' :: CursorHandle -> IO () +doFullScan' CursorHandle{..} = do + modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do + -- go to the end of the index + hSeek fileHandle AbsoluteSeek (getFirst cs) + -- try to read until the EOF + idxTail <- readUntilEOF [] + let + newSize = size + (fromIntegral $ length idxTail) + newState = CursorState + { cursorLinesIdx = idxTail ++ idx + , cursorLinesArrIdx = arr + , cursorIdxSize = newSize + , cursorCurrentLineNumber = newSize + } + pure newState + where + readUntilEOF idx = + hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do + _ <- hGetLine fileHandle + offset <- hTell fileHandle + readUntilEOF (fromInteger offset : idx) + getCurrentLineNumber' :: CursorHandle -> IO Integer getCurrentLineNumber' CursorHandle{..} = do - (_, _, cln) <- readMVar linesIdx + CursorState _ _ _ cln <- readMVar cursorState pure cln goToLine' :: CursorHandle -> Integer -> IO Integer goToLine' ch@CursorHandle{..} ln = if (ln < 0) then getCurrentLineNumber' ch - else modifyMVar linesIdx $ \(idx, size, _) -> do + else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do if ln > size then do - hSeek fileHandle AbsoluteSeek (idx !! 0) + hSeek fileHandle AbsoluteSeek (getFirst cs) -- try to read until the requested line number idxTail <- readUntil (ln - size) [] - let newSize = size + (fromIntegral $ length idxTail) - pure ((idxTail ++ idx, newSize, newSize), newSize) + let + newSize = size + (fromIntegral $ length idxTail) + (newIdx, newArr) = + if newSize > listCapacity + then + let res = (idxTail ++ idx) ++ mElems arr + in ([], Just $ A.listArray (0, toInteger $ length res - 1) res) + else (idxTail ++ idx, arr) + newState = CursorState + { cursorLinesIdx = newIdx + , cursorLinesArrIdx = newArr + , cursorIdxSize = newSize + , cursorCurrentLineNumber = newSize + } + pure (newState, newSize) else do - let nextSeekIndex = fromIntegral $ size - ln - hSeek fileHandle AbsoluteSeek (idx !! nextSeekIndex) - pure ((idx, size, ln), ln) + let nextSeekIndex = size - ln + + if nextSeekIndex >= fromIntegral (length idx) + then case arr of + Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx))) + Nothing -> error "goToLine: there is no array" + else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex) + + let + newState = CursorState + { cursorLinesIdx = idx + , cursorLinesArrIdx = arr + , cursorIdxSize = size + , cursorCurrentLineNumber = ln + } + pure (newState, ln) where readUntil 0 idx = pure idx readUntil counter idx = hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do _ <- hGetLine fileHandle offset <- hTell fileHandle - readUntil (counter - 1) (fromInteger offset : idx) \ No newline at end of file + readUntil (counter - 1) (fromInteger offset : idx) + +getFirst :: CursorState -> Integer +getFirst (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0 +getFirst (CursorState idx Nothing _ _) = idx !! 0 \ No newline at end of file diff --git a/test/Main.hs b/test/Main.hs index 9e74031..20cbf3a 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -4,87 +4,122 @@ module Main (main) where import System.IO import System.IO.LineIndexedCursor +import Data.Foldable import Test.Hspec main :: IO () main = hspec $ do let - mkCursor = do + mkCursor capacity = do h <- openFile "test/testdata" ReadMode - c <- mkLineIndexedCursor h + c <- mkLineIndexedCursorWithCapacity h capacity pure (h, c) - before mkCursor . after (\(h, _) -> hClose h) - $ describe "System.IO.LineIndexedCursor" $ do + forM_ [0 :: Integer ..20] $ \capacity -> do + before (mkCursor capacity) . after (\(h, _) -> hClose h) + $ describe ("System.IO.LineIndexedCursor with list capacity " ++ show capacity) $ do - it "getCurrentLine works" $ \(_, c) -> do - l <- getCurrentLine c - l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + it "getCurrentLine works" $ \(_, c) -> do + l <- getCurrentLine c + l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." - it "goToLine works" $ \(_, c) -> do - ln <- goToLine c 10 - ln `shouldBe` 10 + ln <- goToLine c 3 + ln `shouldBe` 3 - l <- getCurrentLine c - l `shouldBe` Just "Sed elementum velit sit amet orci mollis tincidunt." + l' <- getCurrentLine c + l' `shouldBe` Just "Curabitur nec mi sit amet justo condimentum gravida." - it "goToLine is negative" $ \(_, c) -> do - ln <- goToLine c (-10) - ln `shouldBe` 0 + l'' <- getCurrentLine c + l'' `shouldBe` Just "Pellentesque accumsan dolor at nisl pulvinar, ut bibendum diam egestas." - l <- getCurrentLine c - l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + ln' <- goToLine c 3 + ln' `shouldBe` 3 - it "goToLine is too big" $ \(_, c) -> do - l <- getCurrentLine c - l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + ln'' <- goToLine c 2 + ln'' `shouldBe` 2 - ln <- goToLine c 30 - ln `shouldBe` 20 + ln''' <- goToLine c 1 + ln''' `shouldBe` 1 - l' <- getCurrentLine c - l' `shouldBe` Nothing + ln'''' <- goToLine c 0 + ln'''' `shouldBe` 0 - it "read line, then go to beginning and forth" $ \(_, c) -> do - cln <- getCurrentLineNumber c - cln `shouldBe` 0 + it "goToLine works" $ \(_, c) -> do + ln <- goToLine c 10 + ln `shouldBe` 10 - l <- getCurrentLine c - l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + l <- getCurrentLine c + l `shouldBe` Just "Sed elementum velit sit amet orci mollis tincidunt." - cln' <- getCurrentLineNumber c - cln' `shouldBe` 1 + it "goToLine is negative" $ \(_, c) -> do + ln <- goToLine c (-10) + ln `shouldBe` 0 - _ <- getCurrentLine c - _ <- getCurrentLine c - _ <- getCurrentLine c - _ <- getCurrentLine c - _ <- getCurrentLine c + l <- getCurrentLine c + l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." - cln'' <- getCurrentLineNumber c - cln'' `shouldBe` 6 + it "goToLine is too big" $ \(_, c) -> do + l <- getCurrentLine c + l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." - ln <- goToLine c 0 - ln `shouldBe` 0 + ln <- goToLine c 30 + ln `shouldBe` 20 - cln''' <- getCurrentLineNumber c - cln''' `shouldBe` 0 + l' <- getCurrentLine c + l' `shouldBe` Nothing - l' <- getCurrentLine c - l' `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + it "fullScan works" $ \(_, c) -> do + l <- getCurrentLine c + l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." - ln' <- goToLine c 5 - ln' `shouldBe` 5 + doFullScan c - l'' <- getCurrentLine c - l'' `shouldBe` Just "Curabitur nec dui posuere, tincidunt turpis vitae, tincidunt magna." + cln <- getCurrentLineNumber c + cln `shouldBe` 20 - ln'' <- goToLine c 6 - ln'' `shouldBe` 6 + l' <- getCurrentLine c + l' `shouldBe` Nothing - ln''' <- goToLine c 7 - ln''' `shouldBe` 7 + it "read line, then go to beginning and forth" $ \(_, c) -> do + cln <- getCurrentLineNumber c + cln `shouldBe` 0 - ln'''' <- goToLine c 10 - ln'''' `shouldBe` 10 + l <- getCurrentLine c + l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + + cln' <- getCurrentLineNumber c + cln' `shouldBe` 1 + + _ <- getCurrentLine c + _ <- getCurrentLine c + _ <- getCurrentLine c + _ <- getCurrentLine c + _ <- getCurrentLine c + + cln'' <- getCurrentLineNumber c + cln'' `shouldBe` 6 + + ln <- goToLine c 0 + ln `shouldBe` 0 + + cln''' <- getCurrentLineNumber c + cln''' `shouldBe` 0 + + l' <- getCurrentLine c + l' `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + + ln' <- goToLine c 5 + ln' `shouldBe` 5 + + l'' <- getCurrentLine c + l'' `shouldBe` Just "Curabitur nec dui posuere, tincidunt turpis vitae, tincidunt magna." + + ln'' <- goToLine c 6 + ln'' `shouldBe` 6 + + ln''' <- goToLine c 7 + ln''' `shouldBe` 7 + + ln'''' <- goToLine c 10 + ln'''' `shouldBe` 10 -- 2.34.1