Skip to content

Commit

Permalink
fix: Properly handle to_physical_repr of nested types (#20413)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Dec 23, 2024
1 parent f5d8903 commit d4145fd
Show file tree
Hide file tree
Showing 13 changed files with 194 additions and 196 deletions.
37 changes: 37 additions & 0 deletions crates/polars-core/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
mod iterator;

use std::borrow::Cow;

use crate::prelude::*;

impl ArrayChunked {
Expand Down Expand Up @@ -29,6 +31,41 @@ impl ArrayChunked {
fld.coerce(DataType::Array(Box::new(inner_dtype), width))
}

/// Convert the datatype of the array into the physical datatype.
pub fn to_physical_repr(&self) -> Cow<ArrayChunked> {
let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else {
return Cow::Borrowed(self);
};

assert_eq!(self.chunks().len(), physical_repr.chunks().len());

let width = self.width();
let chunks: Vec<_> = self
.downcast_iter()
.zip(physical_repr.into_chunks())
.map(|(chunk, values)| {
FixedSizeListArray::new(
ArrowDataType::FixedSizeList(
Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
values.dtype().clone(),
true,
)),
width,
),
chunk.len(),
values,
chunk.validity().cloned(),
)
.to_boxed()
})
.collect();

let name = self.name().clone();
let dtype = DataType::Array(Box::new(self.inner_dtype().to_physical()), width);
Cow::Owned(unsafe { ArrayChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}

/// Convert a non-logical [`ArrayChunked`] back into a logical [`ArrayChunked`] without casting.
///
/// # Safety
Expand Down
33 changes: 33 additions & 0 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
//! Special list utility methods
pub(super) mod iterator;

use std::borrow::Cow;

use crate::prelude::*;

impl ListChunked {
Expand Down Expand Up @@ -36,6 +38,37 @@ impl ListChunked {
fld.coerce(DataType::List(Box::new(inner_dtype)))
}

/// Convert the datatype of the list into the physical datatype.
pub fn to_physical_repr(&self) -> Cow<ListChunked> {
let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else {
return Cow::Borrowed(self);
};

assert_eq!(self.chunks().len(), physical_repr.chunks().len());

let chunks: Vec<_> = self
.downcast_iter()
.zip(physical_repr.into_chunks())
.map(|(chunk, values)| {
LargeListArray::new(
ArrowDataType::LargeList(Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
values.dtype().clone(),
true,
))),
chunk.offsets().clone(),
values,
chunk.validity().cloned(),
)
.to_boxed()
})
.collect();

let name = self.name().clone();
let dtype = DataType::List(Box::new(self.inner_dtype().to_physical()));
Cow::Owned(unsafe { ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}

/// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting.
///
/// # Safety
Expand Down
46 changes: 25 additions & 21 deletions crates/polars-core/src/chunked_array/ops/row_encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls(
))
}

pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContext> {
/// Get the [`RowEncodingContext`] for a certain [`DataType`].
///
/// This should be given the logical type in order to communicate Polars datatype information down
/// into the row encoding / decoding.
pub fn get_row_encoding_context(dtype: &DataType) -> Option<RowEncodingContext> {
match dtype {
DataType::Boolean
| DataType::UInt8
Expand Down Expand Up @@ -104,8 +108,8 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContex
},

#[cfg(feature = "dtype-array")]
DataType::Array(dtype, _) => get_row_encoding_dictionary(dtype),
DataType::List(dtype) => get_row_encoding_dictionary(dtype),
DataType::Array(dtype, _) => get_row_encoding_context(dtype),
DataType::List(dtype) => get_row_encoding_context(dtype),
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(revmap, ordering) | DataType::Enum(revmap, ordering) => {
let revmap = revmap.as_ref().unwrap();
Expand Down Expand Up @@ -161,28 +165,28 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContex
},
#[cfg(feature = "dtype-struct")]
DataType::Struct(fs) => {
let mut out = Vec::new();
let mut ctxts = Vec::new();

for (i, f) in fs.iter().enumerate() {
if let Some(dict) = get_row_encoding_dictionary(f.dtype()) {
out.reserve(fs.len());
out.extend(std::iter::repeat_n(None, i));
out.push(Some(dict));
if let Some(ctxt) = get_row_encoding_context(f.dtype()) {
ctxts.reserve(fs.len());
ctxts.extend(std::iter::repeat_n(None, i));
ctxts.push(Some(ctxt));
break;
}
}

if out.is_empty() {
if ctxts.is_empty() {
return None;
}

out.extend(
fs[out.len()..]
ctxts.extend(
fs[ctxts.len()..]
.iter()
.map(|f| get_row_encoding_dictionary(f.dtype())),
.map(|f| get_row_encoding_context(f.dtype())),
);

Some(RowEncodingContext::Struct(out))
Some(RowEncodingContext::Struct(ctxts))
},
}
}
Expand All @@ -198,7 +202,7 @@ pub fn encode_rows_unordered(by: &[Column]) -> PolarsResult<BinaryOffsetChunked>
pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
let mut cols = Vec::with_capacity(by.len());
let mut opts = Vec::with_capacity(by.len());
let mut dicts = Vec::with_capacity(by.len());
let mut ctxts = Vec::with_capacity(by.len());

// Since ZFS exists, we might not actually have any arrays and need to get the length from the
// columns.
Expand All @@ -210,13 +214,13 @@ pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
let by = by.as_materialized_series();
let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed();
let opt = RowEncodingOptions::new_unsorted();
let dict = get_row_encoding_dictionary(by.dtype());
let ctxt = get_row_encoding_context(by.dtype());

cols.push(arr);
opts.push(opt);
dicts.push(dict);
ctxts.push(ctxt);
}
Ok(convert_columns(num_rows, &cols, &opts, &dicts))
Ok(convert_columns(num_rows, &cols, &opts, &ctxts))
}

pub fn _get_rows_encoded(
Expand All @@ -229,7 +233,7 @@ pub fn _get_rows_encoded(

let mut cols = Vec::with_capacity(by.len());
let mut opts = Vec::with_capacity(by.len());
let mut dicts = Vec::with_capacity(by.len());
let mut ctxts = Vec::with_capacity(by.len());

// Since ZFS exists, we might not actually have any arrays and need to get the length from the
// columns.
Expand All @@ -241,13 +245,13 @@ pub fn _get_rows_encoded(
let by = by.as_materialized_series();
let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed();
let opt = RowEncodingOptions::new_sorted(*desc, *null_last);
let dict = get_row_encoding_dictionary(by.dtype());
let ctxt = get_row_encoding_context(by.dtype());

cols.push(arr);
opts.push(opt);
dicts.push(dict);
ctxts.push(ctxt);
}
Ok(convert_columns(num_rows, &cols, &opts, &dicts))
Ok(convert_columns(num_rows, &cols, &opts, &ctxts))
}

pub fn _get_rows_encoded_ca(
Expand Down
Loading

0 comments on commit d4145fd

Please sign in to comment.