
Commit d68689f
chore: Enable masked out list, struct and array elements in parametric tests (#20365)
coastalwhite authored Dec 20, 2024
1 parent d309fd0 commit d68689f
Showing 4 changed files with 50 additions and 11 deletions.
11 changes: 4 additions & 7 deletions crates/polars-parquet/src/arrow/write/binview/basic.rs
@@ -16,24 +16,21 @@ pub(crate) fn encode_plain(
buffer: &mut Vec<u8>,
) {
if options.is_optional() && array.validity().is_some() {
// @NOTE: This capacity might overestimate the amount of bytes since the buffers might
// still contain data that is not referenced by any value.
let capacity =
array.total_bytes_len() + (array.len() - array.null_count()) * size_of::<u32>();

let len_before = buffer.len();
buffer.reserve(capacity);

encode_non_null_values(array.non_null_values_iter(), buffer);
// Append the non-null values.
debug_assert_eq!(buffer.len() - len_before, capacity);
} else {
// @NOTE: This capacity might overestimate the amount of bytes since the buffers might
// still contain data that is not referenced by any value.
let capacity = array.total_bytes_len() + array.len() * size_of::<u32>();

let len_before = buffer.len();
buffer.reserve(capacity);

encode_non_null_values(array.values_iter(), buffer);
// Append the non-null values.
debug_assert_eq!(buffer.len() - len_before, capacity);
}
}

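The @NOTE comments above point out that the reserved capacity is only an upper bound: once elements are masked out, the view buffers may still hold bytes that no surviving value references. A minimal Python sketch of how such a column can arise, using the same `when`/`then` masking as the strategy below (the column name and values here are illustrative):

```python
import polars as pl

s = pl.Series("strs", ["short", "a much longer string value", "tail"])
mask = pl.Series(name=None, values=[True, False, True], dtype=pl.Boolean)

# Mask out the middle element: it becomes null in the validity mask, but the
# bytes it referenced can remain in the backing buffers, so a size estimate
# based on total buffer contents is only an upper bound.
masked = pl.select(pl.when(mask).then(s).alias(s.name)).to_series()
print(masked.to_list())     # ['short', None, 'tail']
print(masked.null_count())  # 1
```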
35 changes: 33 additions & 2 deletions py-polars/polars/testing/parametric/strategies/core.py
@@ -7,9 +7,10 @@
import hypothesis.strategies as st
from hypothesis.errors import InvalidArgument

from polars import select, when
from polars._utils.deprecation import issue_deprecation_warning
from polars.dataframe import DataFrame
from polars.datatypes import DataType, DataTypeClass, Null
from polars.datatypes import Array, Boolean, DataType, DataTypeClass, List, Null, Struct
from polars.series import Series
from polars.string_cache import StringCache
from polars.testing.parametric.strategies._utils import flexhash
@@ -42,6 +43,7 @@ def series(
strategy: SearchStrategy[Any] | None = None,
allow_null: bool = True,
allow_chunks: bool = True,
allow_masked_out: bool = True,
unique: bool = False,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
@@ -74,6 +76,8 @@ def series(
Allow nulls as possible values and allow the `Null` data type by default.
allow_chunks : bool
Allow the Series to contain multiple chunks.
allow_masked_out : bool
Allow null entries to be backed by masked-out elements (relevant for nested types such as List, Array, and Struct).
unique : bool, optional
indicate whether Series values should all be distinct.
allowed_dtypes : {list,set}, optional
@@ -200,14 +204,21 @@ def series(
if isinstance(name, st.SearchStrategy):
name = draw(name)

do_mask_out = (
allow_masked_out
and allow_null
and isinstance(dtype, (List, Array, Struct))
and draw(st.booleans())
)

if size == 0:
values = []
else:
# Create series using dtype-specific strategy to generate values
if strategy is None:
strategy = data(
dtype, # type: ignore[arg-type]
allow_null=allow_null,
allow_null=allow_null and not do_mask_out,
**kwargs,
)

@@ -222,6 +233,20 @@

s = Series(name=name, values=values, dtype=dtype)

# Apply masking out of values
if do_mask_out:
values = draw(
st.lists(
st.booleans(),
min_size=size,
max_size=size,
unique_by=(flexhash if unique else None),
)
)

mask = Series(name=None, values=values, dtype=Boolean)
s = select(when(mask).then(s).alias(s.name)).to_series()

# Apply chunking
if allow_chunks and size > 1 and draw(st.booleans()):
split_at = size // 2
@@ -242,6 +267,7 @@ def dataframes(
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allow_masked_out: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
allow_time_zones: bool = True,
@@ -261,6 +287,7 @@ def dataframes(
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allow_masked_out: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
allow_time_zones: bool = True,
@@ -282,6 +309,7 @@ def dataframes(
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allow_masked_out: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
allow_time_zones: bool = True,
@@ -321,6 +349,8 @@ def dataframes(
Accepts either a boolean or a mapping of column names to booleans.
allow_chunks : bool
Allow the DataFrame to contain multiple chunks.
allow_masked_out : bool
Allow null entries to be backed by masked-out elements (relevant for nested types such as List, Array, and Struct).
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
@@ -475,6 +505,7 @@ def dataframes(
strategy=c.strategy,
allow_null=c.allow_null, # type: ignore[arg-type]
allow_chunks=allow_series_chunks,
allow_masked_out=allow_masked_out,
unique=c.unique,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
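A hedged sketch of how the new allow_masked_out knob can be used from a Hypothesis test, mirroring the series/dataframes signatures above (the test names, dtypes, and assertions are illustrative, not part of the commit):

```python
from hypothesis import given

import polars as pl
from polars.testing.parametric import dataframes, series


@given(s=series(dtype=pl.List(pl.Int64), allow_masked_out=True))
def test_handles_masked_out_lists(s: pl.Series) -> None:
    # With masking enabled, null entries of nested Series may be backed by
    # masked-out child elements rather than empty slots.
    assert s.dtype == pl.List(pl.Int64)


@given(df=dataframes(min_size=1, allow_masked_out=False))
def test_consumer_without_masked_out_support(df: pl.DataFrame) -> None:
    # Opt out when a downstream consumer (e.g. PyArrow in the Parquet tests
    # below) does not support masked-out nested elements.
    assert df.height >= 1
```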
10 changes: 9 additions & 1 deletion py-polars/tests/unit/io/test_parquet.py
@@ -1374,7 +1374,13 @@ def test_struct_plain_encoded_statistics() -> None:
test_scan_round_trip(df)


@given(df=dataframes(min_size=5, excluded_dtypes=[pl.Decimal, pl.Categorical]))
@given(
df=dataframes(
min_size=5,
excluded_dtypes=[pl.Decimal, pl.Categorical],
allow_masked_out=False, # PyArrow does not support this
)
)
def test_scan_round_trip_parametric(df: pl.DataFrame) -> None:
test_scan_round_trip(df)

@@ -1435,6 +1441,7 @@ def test_null_array_dict_pages_18085() -> None:
pl.UInt32,
pl.UInt64,
],
allow_masked_out=False, # PyArrow does not support this
),
row_group_size=st.integers(min_value=10, max_value=1000),
)
@@ -1570,6 +1577,7 @@ def test_predicate_filtering(
pl.Enum,
pl.Struct, # See #19612.
],
allow_masked_out=False, # PyArrow does not support this
),
offset=st.integers(0, 10),
length=st.integers(0, 10),
5 changes: 4 additions & 1 deletion py-polars/tests/unit/operations/test_sort.py
@@ -34,7 +34,10 @@ def test_series_sort_idempotent(s: pl.Series) -> None:
pl.Object, # Unsortable type
pl.Null, # Bug, see: https://github.com/pola-rs/polars/issues/17007
pl.Decimal, # Bug, see: https://github.com/pola-rs/polars/issues/17009
]
pl.Categorical(
ordering="lexical"
), # Bug, see: https://github.com/pola-rs/polars/issues/20364
],
)
)
def test_df_sort_idempotent(df: pl.DataFrame) -> None:
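The parametric test above checks that sorting is idempotent. A minimal standalone version of that property, using a hypothetical toy frame:

```python
import polars as pl

# Sorting an already-sorted frame should leave it unchanged.
df = pl.DataFrame({"a": [3, 1, 2], "b": ["x", "y", "z"]})
once = df.sort("a")
twice = once.sort("a")
assert once.equals(twice)
```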
