Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data.ByteString.Lazy.dropEnd: Use two-pointers technique #629

Merged
merged 8 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 80 additions & 49 deletions Data/ByteString/Lazy.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
{-# OPTIONS_HADDOCK prune #-}
{-# LANGUAGE Trustworthy #-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- |
-- Module : Data.ByteString.Lazy
-- Copyright : (c) Don Stewart 2006
Expand Down Expand Up @@ -237,9 +240,9 @@ import qualified Data.ByteString as P (ByteString) -- type name only
import qualified Data.ByteString as S -- S for strict (hmm...)
import qualified Data.ByteString.Internal.Type as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.ByteString.Lazy.Internal.Deque as D
import Data.ByteString.Lazy.Internal

import Control.Exception (assert)
import Control.Monad (mplus)
import Data.Word (Word8)
import Data.Int (Int64)
Expand Down Expand Up @@ -790,15 +793,75 @@ take i cs0 = take' i cs0
--
-- @since 0.11.2.0
takeEnd :: Int64 -> ByteString -> ByteString
takeEnd i _ | i <= 0 = Empty
takeEnd i cs0 = takeEnd' i cs0
where takeEnd' 0 _ = Empty
takeEnd' n cs =
snd $ foldrChunks takeTuple (n,Empty) cs
takeTuple _ (0, cs) = (0, cs)
takeTuple c (n, cs)
| n > fromIntegral (S.length c) = (n - fromIntegral (S.length c), Chunk c cs)
| otherwise = (0, Chunk (S.takeEnd (fromIntegral n) c) cs)
takeEnd i bs
| i <= 0 = Empty
| otherwise = splitAtEndFold (\_ res -> res) id i bs

-- | Helper function for implementing 'takeEnd' and 'dropEnd'
splitAtEndFold
:: forall result
. (S.StrictByteString -> result -> result)
-- ^ What to do when one chunk of output is ready
-- (The StrictByteString will not be empty.)
-> (ByteString -> result)
-- ^ What to do when the split-point is reached
-> Int64
-- ^ Number of bytes to leave at the end (must be strictly positive)
-> ByteString -- ^ Input ByteString
-> result
{-# INLINE splitAtEndFold #-}
splitAtEndFold step end len bs0 = assert (len > 0) $ case bs0 of
Empty -> end Empty
Chunk c t -> goR len c t t
where
-- Idea: Keep two references into the input ByteString:
-- "toSplit" tracks the current split point,
-- "toScan" tracks the yet-unprocessed tail.
-- When they are closer than "len" bytes apart, process more input. ("goR")
-- When they are at least "len" bytes apart, produce more output. ("goL")
-- We always have that "toScan" is a suffix of "toSplit",
-- and "toSplit" is a suffix of the original input (bs0).
goR :: Int64 -> S.StrictByteString -> ByteString -> ByteString -> result
goR !undershoot nextOutput@(S.BS noFp noLen) toSplit toScan =
assert (undershoot > 0) $
-- INVARIANT: length toSplit == length toScan + len - undershoot
-- (not 'assert'ed because that would break our laziness properties)
case toScan of
Empty
| undershoot >= intToInt64 noLen
-> end (Chunk nextOutput toSplit)
| undershootW <- fromIntegral @Int64 @Int undershoot
-- conversion Int64->Int is OK because 0 < undershoot < noLen
, splitIndex <- noLen - undershootW
, beforeSplit <- S.BS noFp splitIndex
, afterSplit <- S.BS (noFp `S.plusForeignPtr` splitIndex) undershootW
-> step beforeSplit $ end (Chunk afterSplit toSplit)

Chunk (S.BS _ cLen) newBsR
| cLen64 <- intToInt64 cLen
, undershoot > cLen64
-> goR (undershoot - cLen64) nextOutput toSplit newBsR
| undershootW <- fromIntegral @Int64 @Int undershoot
-> step nextOutput $ goL (cLen - undershootW) toSplit newBsR

goL :: Int -> ByteString -> ByteString -> result
goL !overshoot toSplit toScan =
assert (overshoot >= 0) $
-- INVARIANT: length toSplit == length toScan + len + intToInt64 overshoot
-- (not 'assert'ed because that would break our laziness properties)
case toSplit of
Empty -> splitAtEndFoldInvariantFailed
Chunk c@(S.BS _ cLen) newBsL
| overshoot >= cLen
-> step c $ goL (overshoot - cLen) newBsL toScan
| otherwise
-> goR (intToInt64 $ cLen - overshoot) c newBsL toScan

splitAtEndFoldInvariantFailed :: a
-- See Note [Float error calls out of INLINABLE things] in D.B.Internal.Type
splitAtEndFoldInvariantFailed =
moduleError "splitAtEndFold"
"internal error: toSplit not longer than toScan"

-- | /O(n\/c)/ 'drop' @n xs@ returns the suffix of @xs@ after the first @n@
-- elements, or 'empty' if @n > 'length' xs@.
Expand All @@ -824,44 +887,9 @@ drop i cs0 = drop' i cs0
--
-- @since 0.11.2.0
dropEnd :: Int64 -> ByteString -> ByteString
dropEnd i p | i <= 0 = p
dropEnd i p = go D.empty p
where go :: D.Deque -> ByteString -> ByteString
go deque (Chunk c cs)
| D.byteLength deque < i = go (D.snoc c deque) cs
| otherwise =
let (output, deque') = getOutput empty (D.snoc c deque)
in foldrChunks Chunk (go deque' cs) output
go deque Empty = fromDeque $ dropEndBytes deque i

len c = fromIntegral (S.length c)

-- get a `ByteString` from all the front chunks of the accumulating deque
-- for which we know they won't be dropped
getOutput :: ByteString -> D.Deque -> (ByteString, D.Deque)
getOutput out deque = case D.popFront deque of
Nothing -> (reverseChunks out, deque)
Just (x, deque') | D.byteLength deque' >= i ->
getOutput (Chunk x out) deque'
_ -> (reverseChunks out, deque)

-- reverse a `ByteString`s chunks, keeping all internal `S.StrictByteString`s
-- unchanged
reverseChunks = foldlChunks (flip Chunk) empty

-- drop n elements from the rear of the accumulating `deque`
dropEndBytes :: D.Deque -> Int64 -> D.Deque
dropEndBytes deque n = case D.popRear deque of
Nothing -> deque
Just (deque', x) | len x <= n -> dropEndBytes deque' (n - len x)
| otherwise ->
D.snoc (S.dropEnd (fromIntegral n) x) deque'

-- build a lazy ByteString from an accumulating `deque`
fromDeque :: D.Deque -> ByteString
fromDeque deque =
List.foldr chunk Empty (D.front deque) `append`
List.foldl' (flip chunk) Empty (D.rear deque)
dropEnd i p
| i <= 0 = p
| otherwise = splitAtEndFold Chunk (const Empty) i p

-- | /O(n\/c)/ 'splitAt' @n xs@ is equivalent to @('take' n xs, 'drop' n xs)@.
splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
Expand Down Expand Up @@ -1688,6 +1716,9 @@ revNonEmptyChunks = List.foldl' (flip Chunk) Empty
revChunks :: [P.ByteString] -> ByteString
revChunks = List.foldl' (flip chunk) Empty

intToInt64 :: Int -> Int64
intToInt64 = fromIntegral @Int @Int64

-- $IOChunk
--
-- ⚠ Using lazy I\/O functions like 'readFile' or 'hGetContents'
Expand Down
65 changes: 0 additions & 65 deletions Data/ByteString/Lazy/Internal/Deque.hs

This file was deleted.

24 changes: 21 additions & 3 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import Data.Semigroup
import Data.String
import Test.Tasty.Bench
import Prelude hiding (words)
import qualified Data.List as List
import Control.DeepSeq

import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
Expand Down Expand Up @@ -99,9 +101,12 @@ lazyByteStringData = case S.splitAt (nRepl `div` 2) byteStringData of

{-# NOINLINE smallChunksData #-}
smallChunksData :: L.ByteString
smallChunksData
= L.fromChunks [S.take sz (S.drop n byteStringData)
| let sz = 48, n <- [0, sz .. S.length byteStringData]]
smallChunksData = L.fromChunks $ List.unfoldr step (byteStringData, 1)
where
step (!s, !i)
| S.null s = Nothing
| otherwise = case S.splitAt i s of
(!s1, !s2) -> Just (s1, (s2, i * 71 `mod` 97))

{-# NOINLINE byteStringChunksData #-}
byteStringChunksData :: [S.ByteString]
Expand Down Expand Up @@ -419,6 +424,19 @@ main = do
[ bench "strict" $ nf S.tails byteStringData
, bench "lazy" $ nf L.tails lazyByteStringData
]
, bgroup "splitAtEnd (lazy)" $ let
testSAE op = \bs -> [op i bs | i <- [0,5..L.length bs]] `deepseq` ()
{-# INLINE testSAE #-}
in
[ bench "takeEnd" $
nf (testSAE L.takeEnd) lazyByteStringData
, bench "takeEnd (small chunks)" $
nf (testSAE L.takeEnd) smallChunksData
, bench "dropEnd" $
nf (testSAE L.dropEnd) lazyByteStringData
, bench "dropEnd (small chunks)" $
nf (testSAE L.dropEnd) smallChunksData
]
, bgroup "sort" $ map (\s -> bench (S8.unpack s) $ nf S.sort s) sortInputs
, bgroup "stimes" $ let st = stimes :: Int -> S.ByteString -> S.ByteString
in
Expand Down
1 change: 0 additions & 1 deletion bytestring.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ library
Data.ByteString.Builder.RealFloat.Internal
Data.ByteString.Builder.RealFloat.TableGenerator
Data.ByteString.Internal.Type
Data.ByteString.Lazy.Internal.Deque
Data.ByteString.Lazy.ReadInt
Data.ByteString.Lazy.ReadNat
Data.ByteString.ReadInt
Expand Down
38 changes: 26 additions & 12 deletions tests/Properties/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ import Test.Tasty
import Test.Tasty.QuickCheck
import QuickCheckUtils

#ifdef BYTESTRING_LAZY
import Data.Int
#endif

#ifndef BYTESTRING_CHAR8
toElem :: Word8 -> Word8
toElem = id
Expand Down Expand Up @@ -114,16 +118,25 @@ instance Arbitrary Natural where

testRdInt :: forall a. (Arbitrary a, RdInt a) => String -> TestTree
testRdInt s = testGroup s $
[ testProperty "from string" $ \ prefix value suffix ->
[ testProperty "from string" $ int64OK $ \value prefix suffix ->
let si = show @a value
b = prefix <> B.pack si <> suffix
in fmap (second B.unpack) (bread @a b)
=== sread @a (B.unpack prefix ++ si ++ B.unpack suffix)
, testProperty "from number" $ \n ->
, testProperty "from number" $ int64OK $ \n ->
bread @a (B.pack (show n)) === Just (n, B.empty)
]
#endif

intToIndexTy :: Int -> IndexTy
#ifdef BYTESTRING_LAZY
type IndexTy = Int64
intToIndexTy = fromIntegral @Int @Int64
#else
type IndexTy = Int
intToIndexTy = id
#endif

tests :: [TestTree]
tests =
[ testProperty "pack . unpack" $
Expand Down Expand Up @@ -308,7 +321,7 @@ tests =
#endif

, testProperty "drop" $
\n x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 10" $
\x -> let n = 10 in B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 2^31" $
Expand All @@ -325,7 +338,7 @@ tests =
#endif

, testProperty "take" $
\n x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 10" $
\x -> let n = 10 in B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 2^31" $
Expand All @@ -342,11 +355,11 @@ tests =
#endif

, testProperty "dropEnd" $
\n x -> B.dropEnd n x === B.take (B.length x - n) x
\(intToIndexTy -> n) x -> B.dropEnd n x === B.take (B.length x - n) x
, testProperty "dropWhileEnd" $
\f x -> B.dropWhileEnd f x === B.reverse (B.dropWhile f (B.reverse x))
, testProperty "takeEnd" $
\n x -> B.takeEnd n x === B.drop (B.length x - n) x
\(intToIndexTy -> n) x -> B.takeEnd n x === B.drop (B.length x - n) x
, testProperty "takeWhileEnd" $
\f x -> B.takeWhileEnd f x === B.reverse (B.takeWhile f (B.reverse x))

Expand All @@ -366,7 +379,7 @@ tests =
, testProperty "compareLength 4" $
\x (toElem -> c) -> B.compareLength (B.snoc x c <> undefined) (B.length x) === GT
, testProperty "compareLength 5" $
\x n -> B.compareLength x n === compare (B.length x) n
\x (intToIndexTy -> n) -> B.compareLength x n === compare (B.length x) n
, testProperty "dropEnd lazy" $
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c
, testProperty "dropWhileEnd lazy" $
Expand Down Expand Up @@ -470,7 +483,8 @@ tests =
(l1 == l2 || l1 == l2 + 1) && sum (map B.length splits) + l2 == B.length x

, testProperty "splitAt" $
\n x -> (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
\(intToIndexTy -> n) x -> (B.unpack *** B.unpack) (B.splitAt n x)
=== List.genericSplitAt n (B.unpack x)
, testProperty "splitAt 10" $
\x -> let n = 10 in (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
, testProperty "splitAt (2^31)" $
Expand Down Expand Up @@ -594,13 +608,13 @@ tests =
#endif

, testProperty "index" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.index x (fromIntegral n) === B.unpack x !! n
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.index x (intToIndexTy n) === B.unpack x !! n
, testProperty "indexMaybe" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.indexMaybe x (fromIntegral n) === Just (B.unpack x !! n)
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.indexMaybe x (intToIndexTy n) === Just (B.unpack x !! n)
, testProperty "indexMaybe Nothing" $
\n x -> (n :: Int) < 0 || fromIntegral n >= B.length x ==> B.indexMaybe x (fromIntegral n) === Nothing
\n x -> n < 0 || intToIndexTy n >= B.length x ==> B.indexMaybe x (intToIndexTy n) === Nothing
, testProperty "!?" $
\n x -> B.indexMaybe x (fromIntegral (n :: Int)) === x B.!? (fromIntegral n)
\(intToIndexTy -> n) x -> B.indexMaybe x n === x B.!? n

#ifdef BYTESTRING_CHAR8
, testProperty "isString" $
Expand Down
Loading
Loading