Skip to content

Commit

Permalink
Improve CommonSubexprEliminate identifier management (10% faster pl…
Browse files Browse the repository at this point in the history
…anning) (#10473)

* implement hash based CSE identifier

* move `is_volatile()` check out of visitor

* fix comments

* add transformed asserts when `rewrite_expr` is called

* update MSRV to 1.76

* cleanup

* better comments regarding volatile and short circuiting expressions

* address review comments
  • Loading branch information
peter-toth authored Jun 24, 2024
1 parent f0ef0e6 commit ede5598
Show file tree
Hide file tree
Showing 11 changed files with 723 additions and 358 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.75"
rust-version = "1.76"
version = "39.0.0"

[workspace.dependencies]
Expand Down
2 changes: 1 addition & 1 deletion datafusion-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ license = "Apache-2.0"
homepage = "https://datafusion.apache.org"
repository = "https://github.com/apache/datafusion"
# Specify MSRV here as `cargo msrv` doesn't support workspace version
rust-version = "1.75"
rust-version = "1.76"
readme = "README.md"

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion datafusion/common/src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use crate::error::{Result, _internal_err};

// Combines two hashes into one hash
#[inline]
fn combine_hashes(l: u64, r: u64) -> u64 {
pub fn combine_hashes(l: u64, r: u64) -> u64 {
let hash = (17 * 37u64).wrapping_add(l);
hash.wrapping_mul(37).wrapping_add(r)
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ authors = { workspace = true }
# Specify MSRV here as `cargo msrv` doesn't support workspace version and fails with
# "Unable to find key 'package.rust-version' (or 'package.metadata.msrv') in 'arrow-datafusion/Cargo.toml'"
# https://github.com/foresterre/cargo-msrv/issues/590
rust-version = "1.75"
rust-version = "1.76"

[lints]
workspace = true
Expand Down
173 changes: 172 additions & 1 deletion datafusion/expr/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
use std::collections::HashSet;
use std::fmt::{self, Display, Formatter, Write};
use std::hash::Hash;
use std::hash::{Hash, Hasher};
use std::mem;
use std::str::FromStr;
use std::sync::Arc;

Expand Down Expand Up @@ -1461,6 +1462,176 @@ impl Expr {
| Expr::Placeholder(..) => false,
}
}

/// Hashes the direct content of an `Expr` without recursing into its children.
///
/// This method is useful to incrementally compute hashes, such as in
/// `CommonSubexprEliminate` which builds a deep hash of a node and its descendants
/// during the bottom-up phase of the first traversal and so avoid computing the hash
/// of the node and then the hash of its descendants separately.
///
/// If a node doesn't have any children then this method is similar to `.hash()`, but
/// not necessarily returns the same value.
///
/// As it is pretty easy to forget changing this method when `Expr` changes the
/// implementation doesn't use wildcard patterns (`..`, `_`) to catch changes
/// compile time.
pub fn hash_node<H: Hasher>(&self, hasher: &mut H) {
mem::discriminant(self).hash(hasher);
match self {
Expr::Alias(Alias {
expr: _expr,
relation,
name,
}) => {
relation.hash(hasher);
name.hash(hasher);
}
Expr::Column(column) => {
column.hash(hasher);
}
Expr::ScalarVariable(data_type, name) => {
data_type.hash(hasher);
name.hash(hasher);
}
Expr::Literal(scalar_value) => {
scalar_value.hash(hasher);
}
Expr::BinaryExpr(BinaryExpr {
left: _left,
op,
right: _right,
}) => {
op.hash(hasher);
}
Expr::Like(Like {
negated,
expr: _expr,
pattern: _pattern,
escape_char,
case_insensitive,
})
| Expr::SimilarTo(Like {
negated,
expr: _expr,
pattern: _pattern,
escape_char,
case_insensitive,
}) => {
negated.hash(hasher);
escape_char.hash(hasher);
case_insensitive.hash(hasher);
}
Expr::Not(_expr)
| Expr::IsNotNull(_expr)
| Expr::IsNull(_expr)
| Expr::IsTrue(_expr)
| Expr::IsFalse(_expr)
| Expr::IsUnknown(_expr)
| Expr::IsNotTrue(_expr)
| Expr::IsNotFalse(_expr)
| Expr::IsNotUnknown(_expr)
| Expr::Negative(_expr) => {}
Expr::Between(Between {
expr: _expr,
negated,
low: _low,
high: _high,
}) => {
negated.hash(hasher);
}
Expr::Case(Case {
expr: _expr,
when_then_expr: _when_then_expr,
else_expr: _else_expr,
}) => {}
Expr::Cast(Cast {
expr: _expr,
data_type,
})
| Expr::TryCast(TryCast {
expr: _expr,
data_type,
}) => {
data_type.hash(hasher);
}
Expr::Sort(Sort {
expr: _expr,
asc,
nulls_first,
}) => {
asc.hash(hasher);
nulls_first.hash(hasher);
}
Expr::ScalarFunction(ScalarFunction { func, args: _args }) => {
func.hash(hasher);
}
Expr::AggregateFunction(AggregateFunction {
func_def,
args: _args,
distinct,
filter: _filter,
order_by: _order_by,
null_treatment,
}) => {
func_def.hash(hasher);
distinct.hash(hasher);
null_treatment.hash(hasher);
}
Expr::WindowFunction(WindowFunction {
fun,
args: _args,
partition_by: _partition_by,
order_by: _order_by,
window_frame,
null_treatment,
}) => {
fun.hash(hasher);
window_frame.hash(hasher);
null_treatment.hash(hasher);
}
Expr::InList(InList {
expr: _expr,
list: _list,
negated,
}) => {
negated.hash(hasher);
}
Expr::Exists(Exists { subquery, negated }) => {
subquery.hash(hasher);
negated.hash(hasher);
}
Expr::InSubquery(InSubquery {
expr: _expr,
subquery,
negated,
}) => {
subquery.hash(hasher);
negated.hash(hasher);
}
Expr::ScalarSubquery(subquery) => {
subquery.hash(hasher);
}
Expr::Wildcard { qualifier } => {
qualifier.hash(hasher);
}
Expr::GroupingSet(grouping_set) => {
mem::discriminant(grouping_set).hash(hasher);
match grouping_set {
GroupingSet::Rollup(_exprs) | GroupingSet::Cube(_exprs) => {}
GroupingSet::GroupingSets(_exprs) => {}
}
}
Expr::Placeholder(place_holder) => {
place_holder.hash(hasher);
}
Expr::OuterReferenceColumn(data_type, column) => {
data_type.hash(hasher);
column.hash(hasher);
}
Expr::Unnest(Unnest { expr: _expr }) => {}
};
}
}

// modifies expr if it is a placeholder with datatype of right
Expand Down
Loading

0 comments on commit ede5598

Please sign in to comment.