Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve CommonSubexprEliminate identifier management (10% faster planning) #10473

Merged
merged 10 commits into from
Jun 24, 2024
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.75"
rust-version = "1.76"
version = "39.0.0"

[workspace.dependencies]
Expand Down
2 changes: 1 addition & 1 deletion datafusion-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ license = "Apache-2.0"
homepage = "https://datafusion.apache.org"
repository = "https://github.com/apache/datafusion"
# Specify MSRV here as `cargo msrv` doesn't support workspace version
rust-version = "1.75"
rust-version = "1.76"
readme = "README.md"

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion datafusion/common/src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use crate::error::{Result, _internal_err};

// Combines two hashes into one hash
#[inline]
fn combine_hashes(l: u64, r: u64) -> u64 {
pub fn combine_hashes(l: u64, r: u64) -> u64 {
let hash = (17 * 37u64).wrapping_add(l);
hash.wrapping_mul(37).wrapping_add(r)
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ authors = { workspace = true }
# Specify MSRV here as `cargo msrv` doesn't support workspace version and fails with
# "Unable to find key 'package.rust-version' (or 'package.metadata.msrv') in 'arrow-datafusion/Cargo.toml'"
# https://github.com/foresterre/cargo-msrv/issues/590
rust-version = "1.75"
rust-version = "1.76"

[lints]
workspace = true
Expand Down
173 changes: 172 additions & 1 deletion datafusion/expr/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@

use std::collections::HashSet;
use std::fmt::{self, Display, Formatter, Write};
use std::hash::Hash;
use std::hash::{Hash, Hasher};
use std::mem;
use std::str::FromStr;
use std::sync::Arc;

Expand Down Expand Up @@ -1461,6 +1462,176 @@ impl Expr {
| Expr::Placeholder(..) => false,
}
}

/// This method hashes the direct content of an expression node without recursing into
/// its children. This is useful because in `CommonSubexprEliminate` we can build up
/// the deep hash of a node and its descendants during the bottom-up phase of the
/// first traversal and so avoid computing the hash of the node and then the hash of
/// its descendants separately.
///
/// As it is pretty easy to forget changing this method when `Expr` changes the
/// implementation doesn't use wildcard patterns (`..`, `_`) to catch changes
/// compile time.
peter-toth marked this conversation as resolved.
Show resolved Hide resolved
pub fn hash_node<H: Hasher>(&self, hasher: &mut H) {
mem::discriminant(self).hash(hasher);
match self {
Expr::Alias(Alias {
expr: _expr,
relation,
name,
}) => {
relation.hash(hasher);
name.hash(hasher);
}
Expr::Column(column) => {
column.hash(hasher);
}
Expr::ScalarVariable(data_type, name) => {
data_type.hash(hasher);
name.hash(hasher);
}
Expr::Literal(scalar_value) => {
scalar_value.hash(hasher);
}
Expr::BinaryExpr(BinaryExpr {
left: _left,
op,
right: _right,
}) => {
op.hash(hasher);
}
Expr::Like(Like {
negated,
expr: _expr,
pattern: _pattern,
escape_char,
case_insensitive,
})
| Expr::SimilarTo(Like {
negated,
expr: _expr,
pattern: _pattern,
escape_char,
case_insensitive,
}) => {
negated.hash(hasher);
escape_char.hash(hasher);
case_insensitive.hash(hasher);
}
Expr::Not(_expr)
| Expr::IsNotNull(_expr)
| Expr::IsNull(_expr)
| Expr::IsTrue(_expr)
| Expr::IsFalse(_expr)
| Expr::IsUnknown(_expr)
| Expr::IsNotTrue(_expr)
| Expr::IsNotFalse(_expr)
| Expr::IsNotUnknown(_expr)
| Expr::Negative(_expr) => {}
Expr::Between(Between {
expr: _expr,
negated,
low: _low,
high: _high,
}) => {
negated.hash(hasher);
}
Expr::Case(Case {
expr: _expr,
when_then_expr: _when_then_expr,
else_expr: _else_expr,
}) => {}
Expr::Cast(Cast {
expr: _expr,
data_type,
})
| Expr::TryCast(TryCast {
expr: _expr,
data_type,
}) => {
data_type.hash(hasher);
}
Expr::Sort(Sort {
expr: _expr,
asc,
nulls_first,
}) => {
asc.hash(hasher);
nulls_first.hash(hasher);
}
Expr::ScalarFunction(ScalarFunction { func, args: _args }) => {
func.hash(hasher);
}
Expr::AggregateFunction(AggregateFunction {
func_def,
args: _args,
distinct,
filter: _filter,
order_by: _order_by,
null_treatment,
}) => {
func_def.hash(hasher);
distinct.hash(hasher);
null_treatment.hash(hasher);
}
Expr::WindowFunction(WindowFunction {
fun,
args: _args,
partition_by: _partition_by,
order_by: _order_by,
window_frame,
null_treatment,
}) => {
fun.hash(hasher);
window_frame.hash(hasher);
null_treatment.hash(hasher);
}
Expr::InList(InList {
expr: _expr,
list: _list,
negated,
}) => {
negated.hash(hasher);
}
Expr::Exists(Exists { subquery, negated }) => {
subquery.hash(hasher);
negated.hash(hasher);
}
Expr::InSubquery(InSubquery {
expr: _expr,
subquery,
negated,
}) => {
subquery.hash(hasher);
negated.hash(hasher);
}
Expr::ScalarSubquery(subquery) => {
subquery.hash(hasher);
}
Expr::Wildcard { qualifier } => {
qualifier.hash(hasher);
}
Expr::GroupingSet(grouping_set) => match grouping_set {
GroupingSet::Rollup(_exprs) => {
peter-toth marked this conversation as resolved.
Show resolved Hide resolved
hasher.write_u8(0);
}
GroupingSet::Cube(_exprs) => {
hasher.write_u8(1);
}
GroupingSet::GroupingSets(_exprs) => {
hasher.write_u8(2);
}
},
Expr::Placeholder(place_holder) => {
place_holder.hash(hasher);
}
Expr::OuterReferenceColumn(data_type, column) => {
data_type.hash(hasher);
column.hash(hasher);
}
Expr::Unnest(Unnest { expr: _expr }) => {}
};
}
}

// modifies expr if it is a placeholder with datatype of right
Expand Down
Loading