You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Tested on a parquet file with a single Dictionary(UInt32, Utf8) column and bloom filters, datafusion-cli won't use them to prune.
To Reproduce
use arrow::array::RecordBatch;use arrow_schema::DataType;use bytes::{BufMut,BytesMut};use datafusion::datasource::schema_adapter::DefaultSchemaAdapterFactory;use parquet::arrow::ArrowWriter;use parquet::file::properties::WriterProperties;use std::sync::Arc;asyncfnwrite_record_batch(data_type:&DataType,suffix:&str){let schema = Arc::new(arrow::datatypes::Schema::new(vec![arrow::datatypes::Field::new("column",
data_type.clone(),
false,
)]));let batch = RecordBatch::try_new(Arc::new(arrow::datatypes::Schema::new(vec![arrow::datatypes::Field::new("column",
DataType::Utf8,
false,
)])),vec![Arc::new(arrow::array::StringArray::from(vec!["Hello, World!"]))],).unwrap();let batch = DefaultSchemaAdapterFactory::from_schema(schema).map_schema(batch.schema().as_ref()).unwrap().0.map_batch(batch).unwrap();letmut buf = BytesMut::new().writer();let schema = batch.schema();let props = WriterProperties::builder().set_bloom_filter_enabled(true).set_statistics_enabled(parquet::file::properties::EnabledStatistics::None).build();{letmut writer = ArrowWriter::try_new(&mut buf, schema,Some(props)).unwrap();
writer.write(&batch).unwrap();
writer.finish().unwrap();}
tokio::fs::write(format!("hello_world{}.parquet", suffix), buf.into_inner().freeze()).await.unwrap();}#[tokio::main]asyncfnmain(){// Create a RecordBatch with a single string column with a single row containing "Hello, World!"let data_type = arrow::datatypes::DataType::Utf8;write_record_batch(&data_type,"_plain").await;let data_type = arrow::datatypes::DataType::Dictionary(Box::new(arrow::datatypes::DataType::Int32),Box::new(arrow::datatypes::DataType::Utf8),);write_record_batch(&data_type,"_dict").await;let data_type = arrow::datatypes::DataType::Utf8View;write_record_batch(&data_type,"_view").await;}
You can now verify with `parquet bloom-filter -c column -v 'Not Hello, World!' hello_world_dict.parquet` that the value can be pruned via bloom filters.
But querying it with datafusion-cli using explain analyze select * from 'hello_world_dict.parquet' where column = 'Not Hello, World!'; confirms that bloom filters aren't used. They are used for _plain or _view.
For _dict:
ParquetExec: file_groups={1 group: [[Users/adriangb/GitHub/platform/hello_world_dict.parquet]]}, projection=[column], predicate=column@0 = Not Hellow, World!, pruning_predicate=CASE WHEN column_null_count@2 = column_row_count@3 THEN false ELSE column_min@0 <= Not Hellow, World! AND Not Hellow, World! <= column_max@1 END, required_guarantees=[column in (Not Hellow, World!)], metrics=[output_rows=1, elapsed_compute=1ns, bytes_scanned=1048657, file_open_errors=0, file_scan_errors=0, num_predicate_creation_errors=0, page_index_rows_matched=1, page_index_rows_pruned=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, row_groups_matched_bloom_filter=1, row_groups_matched_statistics=1, row_groups_pruned_bloom_filter=0, row_groups_pruned_statistics=0, bloom_filter_eval_time=367.251µs, metadata_load_time=713.793µs, page_index_eval_time=32.959µs, row_pushdown_eval_time=2ns, statistics_eval_time=406.126µs, time_elapsed_opening=1.826417ms, time_elapsed_processing=1.704624ms, time_elapsed_scanning_total=181.125µs, time_elapsed_scanning_until_data=157.083µs]
For _plain:
ParquetExec: file_groups={1 group: [[Users/adriangb/GitHub/platform/hello_world_plain.parquet]]}, projection=[column], predicate=column@0 = Not Hellow, World!, pruning_predicate=CASE WHEN column_null_count@2 = column_row_count@3 THEN false ELSE column_min@0 <= Not Hellow, World! AND Not Hellow, World! <= column_max@1 END, required_guarantees=[column in (Not Hellow, World!)], metrics=[output_rows=0, elapsed_compute=1ns, bytes_scanned=1048607, file_open_errors=0, file_scan_errors=0, num_predicate_creation_errors=0, page_index_rows_matched=0, page_index_rows_pruned=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, row_groups_matched_bloom_filter=0, row_groups_matched_statistics=1, row_groups_pruned_bloom_filter=1, row_groups_pruned_statistics=0, bloom_filter_eval_time=733.792µs, metadata_load_time=294.293µs, page_index_eval_time=876ns, row_pushdown_eval_time=2ns, statistics_eval_time=221.335µs, time_elapsed_opening=1.301542ms, time_elapsed_processing=722µs, time_elapsed_scanning_total=2.75µs, time_elapsed_scanning_until_data=2.75µs]
The text was updated successfully, but these errors were encountered:
Describe the bug
Tested on a parquet file with a single
Dictionary(UInt32, Utf8)
column and bloom filters, datafusion-cli won't use them to prune.
To Reproduce
You can now verify with
parquet bloom-filter -c column -v 'Not Hello, World!' hello_world_dict.parquet
that the value can be pruned via bloom filters. But querying it with datafusion-cli using
explain analyze select * from 'hello_world_dict.parquet' where column = 'Not Hello, World!';
confirms that bloom filters aren't used. They are used for _plain or _view.
For _dict:
For _plain:
The text was updated successfully, but these errors were encountered: