From d4145fd12322fd51e490b8621d0f9afc6d8da41c Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Mon, 23 Dec 2024 15:04:10 +0100 Subject: [PATCH] fix: Properly handle `to_physical_repr` of nested types (#20413) --- .../src/chunked_array/array/mod.rs | 37 ++++++ .../polars-core/src/chunked_array/list/mod.rs | 33 +++++ .../src/chunked_array/ops/row_encode.rs | 46 +++---- .../src/chunked_array/struct_/mod.rs | 117 ++++++++++-------- crates/polars-core/src/series/mod.rs | 107 ++-------------- crates/polars-expr/src/groups/row_encoded.rs | 10 +- .../src/chunked_array/strings/json_path.rs | 4 +- .../executors/sinks/group_by/generic/eval.rs | 4 +- .../sinks/group_by/generic/hash_table.rs | 4 +- .../executors/sinks/joins/generic_build.rs | 8 +- .../src/executors/sinks/joins/row_values.rs | 8 +- .../src/executors/sinks/sort/sink_multiple.rs | 8 +- crates/polars-python/src/series/general.rs | 4 +- 13 files changed, 194 insertions(+), 196 deletions(-) diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs index 291e51ec1532..26ef1d00ac32 100644 --- a/crates/polars-core/src/chunked_array/array/mod.rs +++ b/crates/polars-core/src/chunked_array/array/mod.rs @@ -2,6 +2,8 @@ mod iterator; +use std::borrow::Cow; + use crate::prelude::*; impl ArrayChunked { @@ -29,6 +31,41 @@ impl ArrayChunked { fld.coerce(DataType::Array(Box::new(inner_dtype), width)) } + /// Convert the datatype of the array into the physical datatype. + pub fn to_physical_repr(&self) -> Cow { + let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else { + return Cow::Borrowed(self); + }; + + assert_eq!(self.chunks().len(), physical_repr.chunks().len()); + + let width = self.width(); + let chunks: Vec<_> = self + .downcast_iter() + .zip(physical_repr.into_chunks()) + .map(|(chunk, values)| { + FixedSizeListArray::new( + ArrowDataType::FixedSizeList( + Box::new(ArrowField::new( + PlSmallStr::from_static("item"), + values.dtype().clone(), + true, + )), + width, + ), + chunk.len(), + values, + chunk.validity().cloned(), + ) + .to_boxed() + }) + .collect(); + + let name = self.name().clone(); + let dtype = DataType::Array(Box::new(self.inner_dtype().to_physical()), width); + Cow::Owned(unsafe { ArrayChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) }) + } + /// Convert a non-logical [`ArrayChunked`] back into a logical [`ArrayChunked`] without casting. /// /// # Safety diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs index 8e3a67b348b7..f097726933a7 100644 --- a/crates/polars-core/src/chunked_array/list/mod.rs +++ b/crates/polars-core/src/chunked_array/list/mod.rs @@ -1,6 +1,8 @@ //! Special list utility methods pub(super) mod iterator; +use std::borrow::Cow; + use crate::prelude::*; impl ListChunked { @@ -36,6 +38,37 @@ impl ListChunked { fld.coerce(DataType::List(Box::new(inner_dtype))) } + /// Convert the datatype of the list into the physical datatype. + pub fn to_physical_repr(&self) -> Cow { + let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else { + return Cow::Borrowed(self); + }; + + assert_eq!(self.chunks().len(), physical_repr.chunks().len()); + + let chunks: Vec<_> = self + .downcast_iter() + .zip(physical_repr.into_chunks()) + .map(|(chunk, values)| { + LargeListArray::new( + ArrowDataType::LargeList(Box::new(ArrowField::new( + PlSmallStr::from_static("item"), + values.dtype().clone(), + true, + ))), + chunk.offsets().clone(), + values, + chunk.validity().cloned(), + ) + .to_boxed() + }) + .collect(); + + let name = self.name().clone(); + let dtype = DataType::List(Box::new(self.inner_dtype().to_physical())); + Cow::Owned(unsafe { ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) }) + } + /// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting. /// /// # Safety diff --git a/crates/polars-core/src/chunked_array/ops/row_encode.rs b/crates/polars-core/src/chunked_array/ops/row_encode.rs index e60cab0de122..c5a36afa7aac 100644 --- a/crates/polars-core/src/chunked_array/ops/row_encode.rs +++ b/crates/polars-core/src/chunked_array/ops/row_encode.rs @@ -70,7 +70,11 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( )) } -pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option { +/// Get the [`RowEncodingContext`] for a certain [`DataType`]. +/// +/// This should be given the logical type in order to communicate Polars datatype information down +/// into the row encoding / decoding. +pub fn get_row_encoding_context(dtype: &DataType) -> Option { match dtype { DataType::Boolean | DataType::UInt8 @@ -104,8 +108,8 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option get_row_encoding_dictionary(dtype), - DataType::List(dtype) => get_row_encoding_dictionary(dtype), + DataType::Array(dtype, _) => get_row_encoding_context(dtype), + DataType::List(dtype) => get_row_encoding_context(dtype), #[cfg(feature = "dtype-categorical")] DataType::Categorical(revmap, ordering) | DataType::Enum(revmap, ordering) => { let revmap = revmap.as_ref().unwrap(); @@ -161,28 +165,28 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option { - let mut out = Vec::new(); + let mut ctxts = Vec::new(); for (i, f) in fs.iter().enumerate() { - if let Some(dict) = get_row_encoding_dictionary(f.dtype()) { - out.reserve(fs.len()); - out.extend(std::iter::repeat_n(None, i)); - out.push(Some(dict)); + if let Some(ctxt) = get_row_encoding_context(f.dtype()) { + ctxts.reserve(fs.len()); + ctxts.extend(std::iter::repeat_n(None, i)); + ctxts.push(Some(ctxt)); break; } } - if out.is_empty() { + if ctxts.is_empty() { return None; } - out.extend( - fs[out.len()..] + ctxts.extend( + fs[ctxts.len()..] .iter() - .map(|f| get_row_encoding_dictionary(f.dtype())), + .map(|f| get_row_encoding_context(f.dtype())), ); - Some(RowEncodingContext::Struct(out)) + Some(RowEncodingContext::Struct(ctxts)) }, } } @@ -198,7 +202,7 @@ pub fn encode_rows_unordered(by: &[Column]) -> PolarsResult pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult { let mut cols = Vec::with_capacity(by.len()); let mut opts = Vec::with_capacity(by.len()); - let mut dicts = Vec::with_capacity(by.len()); + let mut ctxts = Vec::with_capacity(by.len()); // Since ZFS exists, we might not actually have any arrays and need to get the length from the // columns. @@ -210,13 +214,13 @@ pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult { let by = by.as_materialized_series(); let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed(); let opt = RowEncodingOptions::new_unsorted(); - let dict = get_row_encoding_dictionary(by.dtype()); + let ctxt = get_row_encoding_context(by.dtype()); cols.push(arr); opts.push(opt); - dicts.push(dict); + ctxts.push(ctxt); } - Ok(convert_columns(num_rows, &cols, &opts, &dicts)) + Ok(convert_columns(num_rows, &cols, &opts, &ctxts)) } pub fn _get_rows_encoded( @@ -229,7 +233,7 @@ pub fn _get_rows_encoded( let mut cols = Vec::with_capacity(by.len()); let mut opts = Vec::with_capacity(by.len()); - let mut dicts = Vec::with_capacity(by.len()); + let mut ctxts = Vec::with_capacity(by.len()); // Since ZFS exists, we might not actually have any arrays and need to get the length from the // columns. @@ -241,13 +245,13 @@ pub fn _get_rows_encoded( let by = by.as_materialized_series(); let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed(); let opt = RowEncodingOptions::new_sorted(*desc, *null_last); - let dict = get_row_encoding_dictionary(by.dtype()); + let ctxt = get_row_encoding_context(by.dtype()); cols.push(arr); opts.push(opt); - dicts.push(dict); + ctxts.push(ctxt); } - Ok(convert_columns(num_rows, &cols, &opts, &dicts)) + Ok(convert_columns(num_rows, &cols, &opts, &ctxts)) } pub fn _get_rows_encoded_ca( diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 2dc07984d8af..a9676531c3cf 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -1,5 +1,6 @@ mod frame; +use std::borrow::Cow; use std::fmt::Write; use arrow::array::StructArray; @@ -22,14 +23,14 @@ fn constructor<'a, I: ExactSizeIterator + Clone>( name: PlSmallStr, length: usize, fields: I, -) -> PolarsResult { +) -> StructChunked { if fields.len() == 0 { let dtype = DataType::Struct(Vec::new()); let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest()); let chunks = vec![StructArray::new(arrow_dtype, length, Vec::new(), None).boxed()]; // SAFETY: We construct each chunk above to have the `Struct` data type. - return Ok(unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) }); + return unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) }; } // Different chunk lengths: rechunk and recurse. @@ -48,26 +49,23 @@ fn constructor<'a, I: ExactSizeIterator + Clone>( .clone() .map(|field| field.chunks()[c_i].clone()) .collect::>(); + let chunk_length = fields[0].len(); - if !fields.iter().all(|arr| length == arr.len()) { - return Err(()); + if fields[1..].iter().any(|arr| chunk_length != arr.len()) { + return None; } - Ok(StructArray::new(arrow_dtype.clone(), length, fields, None).boxed()) + Some(StructArray::new(arrow_dtype.clone(), chunk_length, fields, None).boxed()) }) - .collect::, ()>>(); + .collect::>>(); match chunks { - Ok(chunks) => { + Some(chunks) => { // SAFETY: invariants checked above. - unsafe { - Ok(StructChunked::from_chunks_and_dtype_unchecked( - name, chunks, dtype, - )) - } + unsafe { StructChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) } }, - // Different chunk lengths: rechunk and recurse. - Err(_) => { + // Different chunks: rechunk and recurse. + None => { let fields = fields.map(|s| s.rechunk()).collect::>(); constructor(name, length, fields.iter()) }, @@ -117,14 +115,14 @@ impl StructChunked { } if !needs_to_broadcast { - return constructor(name, length, fields); + return Ok(constructor(name, length, fields)); } if length == 0 { // @NOTE: There are columns that are being broadcasted so we need to clear those. let new_fields = fields.map(|s| s.clear()).collect::>(); - return constructor(name, length, new_fields.iter()); + return Ok(constructor(name, length, new_fields.iter())); } let new_fields = fields @@ -136,7 +134,39 @@ impl StructChunked { } }) .collect::>(); - constructor(name, length, new_fields.iter()) + Ok(constructor(name, length, new_fields.iter())) + } + + /// Convert a struct to the underlying physical datatype. + pub fn to_physical_repr(&self) -> Cow { + let mut physicals = Vec::new(); + + let field_series = self.fields_as_series(); + for (i, s) in field_series.iter().enumerate() { + if let Cow::Owned(physical) = s.to_physical_repr() { + physicals.reserve(field_series.len()); + physicals.extend(field_series[..i].iter().cloned()); + physicals.push(physical); + break; + } + } + + if physicals.is_empty() { + return Cow::Borrowed(self); + } + + physicals.extend( + field_series[physicals.len()..] + .iter() + .map(|s| s.to_physical_repr().into_owned()), + ); + + let mut ca = constructor(self.name().clone(), self.length, physicals.iter()); + if self.null_count() > 0 { + ca.zip_outer_validity(self); + } + + Cow::Owned(ca) } /// Convert a non-logical [`StructChunked`] back into a logical [`StructChunked`] without casting. @@ -370,28 +400,31 @@ impl StructChunked { /// Combine the validities of two structs. pub fn zip_outer_validity(&mut self, other: &StructChunked) { + if other.null_count() == 0 { + return; + } + if self.chunks.len() != other.chunks.len() - || !self + || self .chunks .iter() - .zip(other.chunks.iter()) - .map(|(a, b)| a.len() == b.len()) - .all_equal() + .zip(other.chunks()) + .any(|(a, b)| a.len() != b.len()) { *self = self.rechunk(); let other = other.rechunk(); return self.zip_outer_validity(&other); } - if other.null_count > 0 { - // SAFETY: - // We keep length and dtypes the same. - unsafe { - for (a, b) in self.downcast_iter_mut().zip(other.downcast_iter()) { - let new = combine_validities_and(a.validity(), b.validity()); - a.set_validity(new) - } + + // SAFETY: + // We keep length and dtypes the same. + unsafe { + for (a, b) in self.downcast_iter_mut().zip(other.downcast_iter()) { + let new = combine_validities_and(a.validity(), b.validity()); + a.set_validity(new) } } + self.compute_len(); self.propagate_nulls(); } @@ -429,30 +462,4 @@ impl StructChunked { self.set_outer_validity(validity); self } - - pub fn with_outer_validity_chunked(mut self, validity: BooleanChunked) -> Self { - assert_eq!(self.len(), validity.len()); - if !self - .chunks - .iter() - .zip(validity.chunks.iter()) - .map(|(a, b)| a.len() == b.len()) - .all_equal() - || self.chunks.len() != validity.chunks().len() - { - let ca = self.rechunk(); - let validity = validity.rechunk(); - ca.with_outer_validity_chunked(validity) - } else { - unsafe { - for (arr, valid) in self.chunks_mut().iter_mut().zip(validity.downcast_iter()) { - assert!(valid.validity().is_none()); - *arr = arr.with_validity(Some(valid.values().clone())) - } - } - self.compute_len(); - self.propagate_nulls(); - self - } - } } diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 5a3aacf81453..01dbcf33db33 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -35,7 +35,6 @@ pub use from::*; pub use iterator::{SeriesIter, SeriesPhysIter}; use num_traits::NumCast; use polars_error::feature_gated; -use polars_utils::itertools::Itertools; pub use series_trait::{IsSorted, *}; use crate::chunked_array::cast::CastOptions; @@ -687,105 +686,21 @@ impl Series { }, #[cfg(feature = "dtype-decimal")] Decimal(_, _) => Cow::Owned(self.decimal().unwrap().0.clone().into_series()), - List(inner) => Cow::Owned(self.cast(&List(Box::new(inner.to_physical()))).unwrap()), - #[cfg(feature = "dtype-array")] - Array(inner, size) => Cow::Owned( - self.cast(&Array(Box::new(inner.to_physical()), *size)) - .unwrap(), - ), - #[cfg(feature = "dtype-struct")] - Struct(_) => { - let arr = self.struct_().unwrap(); - let fields: Vec<_> = arr - .fields_as_series() - .iter() - .map(|s| s.to_physical_repr().into_owned()) - .collect(); - let mut ca = - StructChunked::from_series(self.name().clone(), arr.len(), fields.iter()) - .unwrap(); - - if arr.null_count() > 0 { - ca.zip_outer_validity(arr); - } - Cow::Owned(ca.into_series()) + List(_) => match self.list().unwrap().to_physical_repr() { + Cow::Borrowed(_) => Cow::Borrowed(self), + Cow::Owned(ca) => Cow::Owned(ca.into_series()), }, - _ => Cow::Borrowed(self), - } - } - - /// Attempts to convert a Series to dtype, only allowing conversions from - /// physical to logical dtypes--the inverse of to_physical_repr(). - /// - /// # Safety - /// When converting from UInt32 to Categorical it is not checked that the - /// values are in-bound for the categorical mapping. - pub unsafe fn to_logical_repr_unchecked(&self, dtype: &DataType) -> PolarsResult { - use DataType::*; - - let err = || { - Err( - polars_err!(ComputeError: "can't cast from {} to {} in to_logical_repr_unchecked", self.dtype(), dtype), - ) - }; - - match dtype { - dt if self.dtype() == dt => Ok(self.clone()), - #[cfg(feature = "dtype-date")] - Date => Ok(self.i32()?.clone().into_date().into_series()), - #[cfg(feature = "dtype-datetime")] - Datetime(u, z) => Ok(self - .i64()? - .clone() - .into_datetime(*u, z.clone()) - .into_series()), - #[cfg(feature = "dtype-duration")] - Duration(u) => Ok(self.i64()?.clone().into_duration(*u).into_series()), - #[cfg(feature = "dtype-time")] - Time => Ok(self.i64()?.clone().into_time().into_series()), - #[cfg(feature = "dtype-decimal")] - Decimal(precision, scale) => Ok(self - .i128()? - .clone() - .into_decimal(*precision, scale.unwrap())? - .into_series()), - #[cfg(feature = "dtype-categorical")] - Categorical { .. } | Enum { .. } => { - Ok(CategoricalChunked::from_cats_and_dtype_unchecked( - self.u32()?.clone(), - dtype.clone(), - ) - .into_series()) - }, - List(inner) => { - if let List(self_inner) = self.dtype() { - if inner.to_physical() == **self_inner { - return self.cast(dtype); - } - } - err() + #[cfg(feature = "dtype-array")] + Array(_, _) => match self.array().unwrap().to_physical_repr() { + Cow::Borrowed(_) => Cow::Borrowed(self), + Cow::Owned(ca) => Cow::Owned(ca.into_series()), }, #[cfg(feature = "dtype-struct")] - Struct(target_fields) => { - let ca = self.struct_().unwrap(); - if ca.struct_fields().len() != target_fields.len() { - return err(); - } - let fields = ca - .fields_as_series() - .iter() - .zip(target_fields) - .map(|(s, tf)| s.to_logical_repr_unchecked(tf.dtype())) - .try_collect_vec()?; - let mut result = - StructChunked::from_series(self.name().clone(), ca.len(), fields.iter())?; - if ca.null_count() > 0 { - result.zip_outer_validity(ca); - } - Ok(result.into_series()) + Struct(_) => match self.struct_().unwrap().to_physical_repr() { + Cow::Borrowed(_) => Cow::Borrowed(self), + Cow::Owned(ca) => Cow::Owned(ca.into_series()), }, - - _ => err(), + _ => Cow::Borrowed(self), } } diff --git a/crates/polars-expr/src/groups/row_encoded.rs b/crates/polars-expr/src/groups/row_encoded.rs index 1b90a8254bbe..885f8c6114e7 100644 --- a/crates/polars-expr/src/groups/row_encoded.rs +++ b/crates/polars-expr/src/groups/row_encoded.rs @@ -4,7 +4,7 @@ use polars_utils::cardinality_sketch::CardinalitySketch; use polars_utils::idx_map::bytes_idx_map::{BytesIndexMap, Entry}; use polars_utils::vec::PushUnchecked; -use self::row_encode::get_row_encoding_dictionary; +use self::row_encode::get_row_encoding_context; use super::*; use crate::hash_keys::HashKeys; @@ -39,14 +39,14 @@ impl RowEncodedHashGrouper { .iter() .map(|(_name, dt)| dt.to_physical().to_arrow(CompatLevel::newest())) .collect::>(); - let dicts = self + let ctxts = self .key_schema .iter() - .map(|(_, dt)| get_row_encoding_dictionary(dt)) + .map(|(_, dt)| get_row_encoding_context(dt)) .collect::>(); let fields = vec![RowEncodingOptions::new_unsorted(); key_dtypes.len()]; let key_columns = - unsafe { polars_row::decode::decode_rows(&mut key_rows, &fields, &dicts, &key_dtypes) }; + unsafe { polars_row::decode::decode_rows(&mut key_rows, &fields, &ctxts, &key_dtypes) }; let cols = self .key_schema @@ -54,7 +54,7 @@ impl RowEncodedHashGrouper { .zip(key_columns) .map(|((name, dt), col)| { let s = Series::try_from((name.clone(), col)).unwrap(); - unsafe { s.to_logical_repr_unchecked(dt) } + unsafe { s.from_physical_unchecked(dt) } .unwrap() .into_column() }) diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index 32bd8d8e7f2e..7f52f983136e 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -141,6 +141,8 @@ impl Utf8JsonPathImpl for StringChunked {} #[cfg(test)] mod tests { + use arrow::bitmap::Bitmap; + use super::*; #[test] @@ -215,7 +217,7 @@ mod tests { .iter(), ) .unwrap() - .with_outer_validity_chunked(BooleanChunked::new("".into(), [false, true, true, false])) + .with_outer_validity(Some(Bitmap::from_iter([false, true, true, false]))) .into_series(); let expected_dtype = expected_series.dtype().clone(); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs index e5e506c5ac04..07ebb4f2a5ce 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs @@ -2,7 +2,7 @@ use std::cell::UnsafeCell; use polars_row::{RowEncodingOptions, RowsEncoded}; -use self::row_encode::get_row_encoding_dictionary; +use self::row_encode::get_row_encoding_context; use super::*; use crate::executors::sinks::group_by::utils::prepare_key; use crate::executors::sinks::utils::hash_rows; @@ -77,7 +77,7 @@ impl Eval { let mut dicts = Vec::with_capacity(self.key_columns_expr.len()); for phys_e in self.key_columns_expr.iter() { let s = phys_e.evaluate(chunk, &context.execution_state)?; - dicts.push(get_row_encoding_dictionary(s.dtype())); + dicts.push(get_row_encoding_context(s.dtype())); let s = s.to_physical_repr().into_owned(); let s = prepare_key(&s, chunk); keys_columns.push(s.to_arrow(0, CompatLevel::newest())); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 5c17243c4475..63d093f61827 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -1,7 +1,7 @@ use arrow::legacy::trusted_len::TrustedLenPush; use polars_utils::hashing::hash_to_partition; -use self::row_encode::get_row_encoding_dictionary; +use self::row_encode::get_row_encoding_context; use super::*; use crate::pipeline::PARTITION_SIZE; @@ -262,7 +262,7 @@ impl AggHashTable { .output_schema .iter_values() .take(self.num_keys) - .map(get_row_encoding_dictionary) + .map(get_row_encoding_context) .collect::>(); let fields = vec![Default::default(); self.num_keys]; let key_columns = diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs index f9a3e901b1a0..4096c8083f0d 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs @@ -9,7 +9,7 @@ use polars_utils::arena::Node; use polars_utils::pl_str::PlSmallStr; use polars_utils::unitvec; -use self::row_encode::get_row_encoding_dictionary; +use self::row_encode::get_row_encoding_context; use super::*; use crate::executors::operators::PlaceHolder; use crate::executors::sinks::joins::generic_probe_inner_left::GenericJoinProbe; @@ -137,17 +137,17 @@ impl GenericBuild { chunk: &DataChunk, ) -> PolarsResult<&BinaryArray> { debug_assert!(self.join_columns.is_empty()); - let mut dicts = Vec::with_capacity(self.join_columns_left.len()); + let mut ctxts = Vec::with_capacity(self.join_columns_left.len()); for phys_e in self.join_columns_left.iter() { let s = phys_e.evaluate(chunk, &context.execution_state)?; let arr = s.to_physical_repr().rechunk().array_ref(0).clone(); self.join_columns.push(arr); - dicts.push(get_row_encoding_dictionary(s.dtype())); + ctxts.push(get_row_encoding_context(s.dtype())); } let rows_encoded = polars_row::convert_columns_no_order( self.join_columns[0].len(), // @NOTE: does not work for ZFS &self.join_columns, - &dicts, + &ctxts, ) .into_array(); self.materialized_join_cols.push(rows_encoded); diff --git a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs index 73287f518555..b4e0e8337bcc 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, BinaryArray, StaticArray}; use arrow::compute::utils::combine_validities_and_many; use polars_core::error::PolarsResult; -use polars_core::prelude::row_encode::get_row_encoding_dictionary; +use polars_core::prelude::row_encode::get_row_encoding_context; use polars_row::RowsEncoded; use crate::expressions::PhysicalPipedExpr; @@ -49,7 +49,7 @@ impl RowValues { let determine_idx = self.det_join_idx && self.join_column_idx.is_none(); let mut names = vec![]; - let mut dicts = Vec::with_capacity(self.join_column_eval.len()); + let mut ctxts = Vec::with_capacity(self.join_column_eval.len()); for phys_e in self.join_column_eval.iter() { let s = phys_e.evaluate(chunk, &context.execution_state)?; let mut s = s.to_physical_repr().rechunk(); @@ -60,7 +60,7 @@ impl RowValues { names.push(s.name().to_string()); } self.join_columns_material.push(s.array_ref(0).clone()); - dicts.push(get_row_encoding_dictionary(s.dtype())); + ctxts.push(get_row_encoding_context(s.dtype())); } // We determine the indices of the columns that have to be removed @@ -79,7 +79,7 @@ impl RowValues { polars_row::convert_columns_amortized_no_order( self.join_columns_material[0].len(), // @NOTE: does not work for ZFS &self.join_columns_material, - &dicts, + &ctxts, &mut self.current_rows, ); diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index 175a6b707430..024dc8522503 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -7,7 +7,7 @@ use polars_core::series::IsSorted; use polars_row::decode::decode_rows_from_binary; use polars_row::{RowEncodingContext, RowEncodingOptions}; -use self::row_encode::get_row_encoding_dictionary; +use self::row_encode::get_row_encoding_context; use super::*; use crate::operators::{ DataChunk, FinalizedSink, PExecutionContext, Sink, SinkResult, Source, SourceResult, @@ -137,11 +137,11 @@ impl SortSinkMultiple { ) -> PolarsResult { let mut schema = (*output_schema).clone(); - let sort_dicts = sort_idx + let sort_ctxts = sort_idx .iter() .map(|i| { let (_, dtype) = schema.get_at_index(*i).unwrap(); - get_row_encoding_dictionary(dtype) + get_row_encoding_context(dtype) }) .collect::>(); @@ -182,7 +182,7 @@ impl SortSinkMultiple { sort_options, sort_idx: Arc::from(sort_idx), sort_opts: Arc::from(sort_fields), - sort_dicts: Arc::from(sort_dicts), + sort_dicts: Arc::from(sort_ctxts), sort_dtypes, sort_column: vec![], output_schema, diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index a3a80505596f..24ef7a752fa6 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -8,7 +8,7 @@ use pyo3::prelude::*; use pyo3::types::PyBytes; use pyo3::{IntoPyObjectExt, Python}; -use self::row_encode::get_row_encoding_dictionary; +use self::row_encode::get_row_encoding_context; use super::PySeries; use crate::dataframe::PyDataFrame; use crate::error::PyPolarsErr; @@ -549,7 +549,7 @@ impl PySeries { let dicts = dtypes .iter() - .map(|(_, dtype)| get_row_encoding_dictionary(&dtype.0)) + .map(|(_, dtype)| get_row_encoding_context(&dtype.0)) .collect::>(); // Get the BinaryOffset array.