Skip to content

Commit

Permalink
fix: Don't try to load non-existent List/FSL statistics (#20388)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Dec 20, 2024
1 parent 29fa3f7 commit e074e1a
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 250 deletions.
71 changes: 0 additions & 71 deletions crates/polars-parquet/src/arrow/read/statistics/dictionary.rs

This file was deleted.

94 changes: 0 additions & 94 deletions crates/polars-parquet/src/arrow/read/statistics/list.rs

This file was deleted.

110 changes: 25 additions & 85 deletions crates/polars-parquet/src/arrow/read/statistics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@ use crate::read::ColumnChunkMetadata;
mod binary;
mod binview;
mod boolean;
mod dictionary;
mod fixlen;
mod list;
mod map;
mod null;
mod primitive;
mod struct_;
mod utf8;

use self::list::DynMutableListArray;
use super::PrimitiveLogicalType;

/// Arrow-deserialized parquet Statistics of a file
Expand Down Expand Up @@ -67,22 +64,6 @@ impl From<MutableStatistics> for Statistics {
.unwrap()
.clone()
.boxed(),
PhysicalType::List => s
.null_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.clone()
.boxed(),
PhysicalType::LargeList => s
.null_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i64>>()
.unwrap()
.clone()
.boxed(),
_ => s
.null_count
.as_box()
Expand All @@ -102,22 +83,6 @@ impl From<MutableStatistics> for Statistics {
.unwrap()
.clone()
.boxed(),
PhysicalType::List => s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.clone()
.boxed(),
PhysicalType::LargeList => s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i64>>()
.unwrap()
.clone()
.boxed(),
_ => s
.distinct_count
.as_box()
Expand Down Expand Up @@ -162,13 +127,16 @@ fn make_mutable(dtype: &ArrowDataType, capacity: usize) -> PolarsResult<Box<dyn
Box::new(MutableFixedSizeBinaryArray::try_new(dtype.clone(), vec![], None).unwrap())
as _
},
PhysicalType::LargeList | PhysicalType::List | PhysicalType::FixedSizeList => Box::new(
DynMutableListArray::try_with_capacity(dtype.clone(), capacity)?,
)
as Box<dyn MutableArray>,
PhysicalType::Dictionary(_) => Box::new(
dictionary::DynMutableDictionary::try_with_capacity(dtype.clone(), capacity)?,
),
PhysicalType::LargeList | PhysicalType::List | PhysicalType::FixedSizeList => {
make_mutable(dtype.inner_dtype().unwrap(), capacity)?
},
PhysicalType::Dictionary(_) => {
let ArrowDataType::Dictionary(_, dtype, _) = &dtype else {
unreachable!();
};

make_mutable(dtype, capacity)?
},
PhysicalType::Struct => Box::new(struct_::DynMutableStructArray::try_with_capacity(
dtype.clone(),
capacity,
Expand Down Expand Up @@ -277,49 +245,21 @@ fn push(
distinct_count: &mut dyn MutableArray,
null_count: &mut dyn MutableArray,
) -> PolarsResult<()> {
match min.dtype().to_logical_type() {
List(_) | LargeList(_) | FixedSizeList(_, _) => {
let min = min
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let max = max
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let distinct_count = distinct_count
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let null_count = null_count
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
return push(
stats,
min.inner.as_mut(),
max.inner.as_mut(),
distinct_count.inner.as_mut(),
null_count.inner.as_mut(),
);
},
Dictionary(_, _, _) => {
let min = min
.as_mut_any()
.downcast_mut::<dictionary::DynMutableDictionary>()
.unwrap();
let max = max
.as_mut_any()
.downcast_mut::<dictionary::DynMutableDictionary>()
.unwrap();
return push(
stats,
min.inner.as_mut(),
max.inner.as_mut(),
distinct_count,
null_count,
);
},
let mut logical_type = min.dtype().to_logical_type();

loop {
if let List(field) | LargeList(field) | FixedSizeList(field, _) = logical_type {
logical_type = field.dtype().to_logical_type();
continue;
}
if let Dictionary(_, dt, _) = logical_type {
logical_type = dt.to_logical_type();
}

break;
}

match logical_type {
Struct(fields) => {
if fields.is_empty() {
return Ok(());
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2710,3 +2710,17 @@ def test_boolean_slice_pushdown_20314() -> None:

f.seek(0)
assert pl.scan_parquet(f).slice(2, 1).collect().item()


def test_load_pred_pushdown_fsl_19241() -> None:
    """Round-trip a nested fixed-size-list column through a filtered parquet scan.

    A one-row frame holding an ``Array(Array(Int8, 2), 1)`` column and an
    integer column is written to an in-memory parquet buffer. It is then
    scanned with ``parallel="prefiltered"`` and a predicate on the integer
    column that keeps the single row, and the collected result must equal
    the frame that was written.
    """
    # Build the frame once; it is both what gets written and what is expected back.
    nested = pl.Series("a", [[[1, 2]]], pl.Array(pl.Array(pl.Int8, 2), 1))
    flag = pl.Series("f", [1])
    expected = pl.DataFrame([nested, flag])

    buffer = io.BytesIO()
    expected.write_parquet(buffer)
    buffer.seek(0)

    # The predicate only touches "f"; the nested column is loaded alongside it.
    lazy = pl.scan_parquet(buffer, parallel="prefiltered").filter(pl.col.f != 4)

    assert_frame_equal(lazy.collect(), expected)

0 comments on commit e074e1a

Please sign in to comment.