feat: add support for big values in SeederV2 #4222

Merged · 15 commits · Dec 5, 2024
tests/dragonfly/seeder/__init__.py (13 additions, 2 deletions)
@@ -19,6 +19,7 @@ class SeederBase:
     UID_COUNTER = 1  # multiple generators should not conflict on keys
     CACHED_SCRIPTS = {}
     DEFAULT_TYPES = ["STRING", "LIST", "SET", "HASH", "ZSET", "JSON"]
+    BIG_VALUE_TYPES = ["LIST", "SET", "HASH", "ZSET"]
 
     def __init__(self, types: typing.Optional[typing.List[str]] = None):
         self.uid = SeederBase.UID_COUNTER
@@ -137,6 +138,8 @@ def __init__(
         data_size=100,
         collection_size=None,
         types: typing.Optional[typing.List[str]] = None,
+        huge_value_percentage=5,
+        huge_value_size=16384,
     ):
         SeederBase.__init__(self, types)
         self.key_target = key_target
@@ -146,6 +149,9 @@ def __init__
         else:
             self.collection_size = collection_size
 
+        self.huge_value_percentage = huge_value_percentage
+        self.huge_value_size = huge_value_size
+
         self.units = [
             Seeder.Unit(
                 prefix=f"k-s{self.uid}u{i}-",
@@ -166,6 +172,8 @@ async def run(self, client: aioredis.Redis, target_ops=None, target_deviation=None):
             target_deviation if target_deviation is not None else -1,
             self.data_size,
             self.collection_size,
+            self.huge_value_percentage,
+            self.huge_value_size,
         ]
 
         sha = await client.script_load(Seeder._load_script("generate"))
@@ -196,8 +204,11 @@ async def _run_unit(client: aioredis.Redis, sha: str, unit: Unit, using_stopkey, args):
             unit.stop_key if using_stopkey else "",
         ] + args
 
-        unit.counter = await client.evalsha(sha, 0, *args)
+        result = await client.evalsha(sha, 0, *args)
+        result = result.split()
+        unit.counter = int(result[0])
+        huge_entries = int(result[1])
 
         logging.debug(
-            f"running unit {unit.prefix}/{unit.type} took {time.time() - s}, target {args[4+0]}"
+            f"running unit {unit.prefix}/{unit.type} took {time.time() - s}, target {args[4+0]}, huge entries {huge_entries}"
         )
tests/dragonfly/seeder/script-generate.lua (8 additions, 3 deletions)
@@ -18,14 +18,18 @@ local total_ops = tonumber(ARGV[6])
 local min_dev = tonumber(ARGV[7])
 local data_size = tonumber(ARGV[8])
 local collection_size = tonumber(ARGV[9])
+-- Probability of each key in key_target to be a big value
+local huge_value_percentage = tonumber(ARGV[10])
+local huge_value_size = tonumber(ARGV[11])
 
 -- collect all keys belonging to this script
 -- assumes exclusive ownership
 local keys = LU_collect_keys(prefix, type)
 
-LG_funcs.init(data_size, collection_size)
+LG_funcs.init(data_size, collection_size, huge_value_percentage, huge_value_size)
 local addfunc = LG_funcs['add_' .. string.lower(type)]
 local modfunc = LG_funcs['mod_' .. string.lower(type)]
+local huge_entries = LG_funcs["get_huge_entries"]
 
 local function action_add()
     local key = prefix .. tostring(key_counter)
@@ -84,7 +88,8 @@ while true do
     -- update probability only every 10 iterations
     if counter % 10 == 0 then
         -- calculate intensity (not normalized probabilities)
-        -- please see attached plots in PR to undertand convergence
+        -- please see attached plots in PR to understand convergence
+        -- https://github.com/dragonflydb/dragonfly/pull/2556
 
         -- the add intensity is monotonically decreasing with keycount growing,
         -- the delete intensity is monotonically increasing with keycount growing,
@@ -121,4 +126,4 @@ if stop_key ~= '' then
     redis.call('DEL', stop_key)
 end
 
-return key_counter
+return tostring(key_counter) .. " " .. tostring(huge_entries())
tests/dragonfly/seeder/script-genlib.lua (58 additions, 14 deletions)
@@ -1,9 +1,43 @@
 local LG_funcs = {}
 
-function LG_funcs.init(dsize, csize)
+function LG_funcs.init(dsize, csize, large_val_perc, large_val_sz)
     LG_funcs.dsize = dsize
     LG_funcs.csize = csize
     LG_funcs.esize = math.ceil(dsize / csize)
+    LG_funcs.huge_value_percentage = large_val_perc
+    LG_funcs.huge_value_size = large_val_sz
 end
 
+local huge_entries = 0
+
+local function huge_entry()

Review thread on huge_entry():

Contributor Author:
I would like to expose this as a metric so that once the seeder finishes it will report how many big values it created. However, since this code is a script, I don't see a "smart" way.
Maybe the seeder can create a key in dragonfly (SET big_values number_of_big_values) which the test can then poll?
@chakaz any ideas/thoughts?

Collaborator:
We can simply iterate over all db keys in this lua script. That shouldn't be too hard, nor slow.

Collaborator:
(we can use SCAN, TYPE and MEMORY USAGE in the script to get all the info we seek)
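For illustration, a minimal sketch of that SCAN-based alternative as it could look inside the script. This is a sketch only, not part of the PR; it assumes MEMORY USAGE is callable from script context and takes the huge-value size as the byte threshold:

local function count_huge_by_scan(threshold)
    local huge = 0
    local cursor = '0'
    repeat
        -- SCAN replies with { next_cursor, { key1, key2, ... } }
        local reply = redis.call('SCAN', cursor, 'COUNT', 100)
        cursor = reply[1]
        for _, key in ipairs(reply[2]) do
            -- approximate footprint of the key in bytes (false if the key is gone)
            local bytes = redis.call('MEMORY', 'USAGE', key)
            if bytes and bytes > threshold then
                huge = huge + 1
            end
        end
    until cursor == '0'
    return huge
end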

Contributor Author:
I thought about this, and we don't really need SCAN. In fact, I baked this metric into the lua script's return value -- works perfectly.

Collaborator:
Don't you want huge_entry() to depend on the key, such that some keys are huge while others aren't, based on (say) their hash?
The reason I say this is that the seeder uses many operations to generate the values, like many LPUSH, HSET, etc. If we do 100 operations per key (just throwing numbers here), doing 5% huge will make them all roughly the same size...


+    local ratio = LG_funcs.huge_value_percentage / 100
+    -- math.random() is uniform in [0, 1]
+    local rand = math.random()
+    local huge_entry = (ratio > rand)
+    return huge_entry
+end
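For reference, a sketch of the key-dependent variant suggested in the thread above: decide hugeness from a hash of the key name instead of per operation, so a roughly fixed subset of keys grows huge while the rest stay at esize. The hash below is illustrative and not part of this PR:

local function huge_key(key)
    -- cheap deterministic hash of the key name, reduced mod 100
    local h = 0
    for i = 1, #key do
        h = (h * 31 + string.byte(key, i)) % 100
    end
    -- the same key always gets the same answer, so every operation
    -- against a "huge" key uses the huge element size
    return h < LG_funcs.huge_value_percentage
end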

+local function randstr()
+    local str
+    if huge_entry() then
+        str = dragonfly.randstr(LG_funcs.huge_value_size)
+        huge_entries = huge_entries + 1
+    else
+        str = dragonfly.randstr(LG_funcs.esize)
+    end
+    return str
+end
+
+local function randstr_sequence()
+    local strs
+    if huge_entry() then
+        strs = dragonfly.randstr(LG_funcs.huge_value_size, LG_funcs.csize)
+        huge_entries = huge_entries + 1
+    else
+        strs = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize)
+    end
+    return strs
+end
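As a quick sanity check of the probability logic (standalone Lua, not part of the PR): with huge_value_percentage = 5, roughly 5% of randstr()/randstr_sequence() calls should take the huge branch.

local pct = 5
local trials = 100000
local hits = 0
math.randomseed(42)
for _ = 1, trials do
    -- same comparison as huge_entry() above
    if (pct / 100) > math.random() then
        hits = hits + 1
    end
end
-- prints a ratio near 0.050
print(string.format("huge ratio: %.3f (expected %.3f)", hits / trials, pct / 100))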

 -- strings
@@ -27,12 +61,11 @@ end
 -- lists
 -- store list of random blobs of default container/element sizes
 
-function LG_funcs.add_list(key)
-    local elements = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize)
-    redis.apcall('LPUSH', key, unpack(elements))
+function LG_funcs.add_list(key, huge_value)
+    redis.apcall('LPUSH', key, unpack(randstr_sequence()))
 end
 
-function LG_funcs.mod_list(key)
+function LG_funcs.mod_list(key, huge_value)
     -- equally likely pops and pushes, we rely on the list size being large enough
     -- to "highly likely" not get emptied out by consecutive pops
     local action = math.random(1, 4)
@@ -41,9 +74,9 @@ function LG_funcs.mod_list(key)
     elseif action == 2 then
         redis.apcall('LPOP', key)
     elseif action == 3 then
-        redis.apcall('LPUSH', key, dragonfly.randstr(LG_funcs.esize))
+        redis.apcall('LPUSH', key, randstr())
     else
-        redis.apcall('RPUSH', key, dragonfly.randstr(LG_funcs.esize))
+        redis.apcall('RPUSH', key, randstr())
     end
 end
 
@@ -62,8 +95,7 @@ function LG_funcs.add_set(key, keys)
         end
         redis.apcall('SDIFFSTORE', key, keys[i1], keys[i2])
     else
-        local elements = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize)
-        redis.apcall('SADD', key, unpack(elements))
+        redis.apcall('SADD', key, unpack(randstr_sequence()))
     end
 end
 
@@ -72,7 +104,7 @@ function LG_funcs.mod_set(key)
     if math.random() < 0.5 then
         redis.apcall('SPOP', key)
     else
-        redis.apcall('SADD', key, dragonfly.randstr(LG_funcs.esize))
+        redis.apcall('SADD', key, randstr())
     end
 end
 
@@ -82,7 +114,14 @@ end
 -- where `value` is a random string for even indices and a number for odd indices
 
 function LG_funcs.add_hash(key)
-    local blobs = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize / 2)
+    local blobs
+    if huge_entry() then
+        blobs = dragonfly.randstr(LG_funcs.huge_value_size, LG_funcs.csize / 2)
+        huge_entries = huge_entries + 1
+    else
+        blobs = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize / 2)
+    end
+
     local htable = {}
     for i = 1, LG_funcs.csize, 2 do
         htable[i * 2 - 1] = tostring(i)
@@ -100,15 +139,16 @@ function LG_funcs.mod_hash(key)
     if idx % 2 == 1 then
         redis.apcall('HINCRBY', key, tostring(idx), 1)
     else
-        redis.apcall('HSET', key, tostring(idx), dragonfly.randstr(LG_funcs.esize))
+        redis.apcall('HSET', key, tostring(idx), randstr())
     end
 end
 
 -- sorted sets
 
 function LG_funcs.add_zset(key, keys)
     -- TODO: We don't support ZDIFFSTORE
-    local blobs = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize)
+    local blobs = randstr_sequence()
+
     local ztable = {}
     for i = 1, LG_funcs.csize do
         ztable[i * 2 - 1] = tostring(i)
@@ -120,7 +160,7 @@ end
 function LG_funcs.mod_zset(key, dbsize)
     local action = math.random(1, 4)
     if action <= 2 then
-        redis.apcall('ZADD', key, math.random(0, LG_funcs.csize * 2), dragonfly.randstr(LG_funcs.esize))
+        redis.apcall('ZADD', key, math.random(0, LG_funcs.csize * 2), randstr())
     elseif action == 3 then
         redis.apcall('ZPOPMAX', key)
     else
@@ -153,3 +193,7 @@ function LG_funcs.mod_json(key, dbsize)
         redis.apcall('JSON.NUMINCRBY', key, '$.counters[' .. math.random(LG_funcs.csize) .. ']', 1)
     end
 end
+
+function LG_funcs.get_huge_entries()
+    return huge_entries
+end
tests/dragonfly/seeder_test.py (9 additions, 1 deletion)
@@ -29,7 +29,15 @@ async def check_list():
 
     await async_client.flushall()
 
-    s = Seeder(units=1, key_target=10, data_size=10_000, collection_size=1, types=["LIST"])
+    s = Seeder(
+        units=1,
+        key_target=10,
+        data_size=10_000,
+        collection_size=1,
+        types=["LIST"],
+        huge_value_percentage=0,
+        huge_value_size=0,
+    )
Comment on lines +38 to +39

Collaborator:
why is this needed?

Contributor Author:
Because otherwise we get really big containers, which causes memory to grow really fast. That's why I'd rather have two specific parameters: one for the size of each element in the container and one for the total number of elements per container.

     await s.run(async_client)
     await check_list()
