Skip to content

Commit

Permalink
implement hash based CSE identifier
Browse files Browse the repository at this point in the history
  • Loading branch information
peter-toth committed Jun 20, 2024
1 parent 0f80b92 commit 14644c3
Show file tree
Hide file tree
Showing 3 changed files with 698 additions and 317 deletions.
2 changes: 1 addition & 1 deletion datafusion/common/src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use crate::error::{Result, _internal_err};

// Combines two hashes into one hash
#[inline]
fn combine_hashes(l: u64, r: u64) -> u64 {
pub fn combine_hashes(l: u64, r: u64) -> u64 {
let hash = (17 * 37u64).wrapping_add(l);
hash.wrapping_mul(37).wrapping_add(r)
}
Expand Down
176 changes: 175 additions & 1 deletion datafusion/expr/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
use std::collections::HashSet;
use std::fmt::{self, Display, Formatter, Write};
use std::hash::Hash;
use std::hash::{Hash, Hasher};
use std::mem;
use std::str::FromStr;
use std::sync::Arc;

Expand Down Expand Up @@ -1419,6 +1420,179 @@ impl Expr {
| Expr::Placeholder(..) => false,
}
}

/// This method hashes the direct content of an expression node without recursing into
/// its children. This is useful because in `CommonSubexprEliminate` we can build up
/// the deep hash of a node and its descendants during the bottom-up phase of the
/// first traversal and so avoid computing the hash of the node and then the hash of
/// its descendants separately.
///
/// As it is pretty easy to forget changing this method when `Expr` changes the
/// implementation doesn't use wildcard patterns (`..`, `_`) to catch changes
/// compile time.
pub fn hash_node<H: Hasher>(&self, hasher: &mut H) {
mem::discriminant(self).hash(hasher);
match self {
Expr::Alias(Alias {
expr: _expr,
relation,
name,
}) => {
relation.hash(hasher);
name.hash(hasher);
}
Expr::Column(column) => {
column.hash(hasher);
}
Expr::ScalarVariable(data_type, name) => {
data_type.hash(hasher);
name.hash(hasher);
}
Expr::Literal(scalar_value) => {
scalar_value.hash(hasher);
}
Expr::BinaryExpr(BinaryExpr {
left: _left,
op,
right: _right,
}) => {
op.hash(hasher);
}
Expr::Like(Like {
negated,
expr: _expr,
pattern: _pattern,
escape_char,
case_insensitive,
})
| Expr::SimilarTo(Like {
negated,
expr: _expr,
pattern: _pattern,
escape_char,
case_insensitive,
}) => {
negated.hash(hasher);
escape_char.hash(hasher);
case_insensitive.hash(hasher);
}
Expr::Not(_expr)
| Expr::IsNotNull(_expr)
| Expr::IsNull(_expr)
| Expr::IsTrue(_expr)
| Expr::IsFalse(_expr)
| Expr::IsUnknown(_expr)
| Expr::IsNotTrue(_expr)
| Expr::IsNotFalse(_expr)
| Expr::IsNotUnknown(_expr)
| Expr::Negative(_expr) => {}
Expr::Between(Between {
expr: _expr,
negated,
low: _low,
high: _high,
}) => {
negated.hash(hasher);
}
Expr::Case(Case {
expr: _expr,
when_then_expr: _when_then_expr,
else_expr: _else_expr,
}) => {}
Expr::Cast(Cast {
expr: _expr,
data_type,
})
| Expr::TryCast(TryCast {
expr: _expr,
data_type,
}) => {
data_type.hash(hasher);
}
Expr::Sort(Sort {
expr: _expr,
asc,
nulls_first,
}) => {
asc.hash(hasher);
nulls_first.hash(hasher);
}
Expr::ScalarFunction(ScalarFunction { func, args: _args }) => {
func.hash(hasher);
}
Expr::AggregateFunction(AggregateFunction {
func_def,
args: _args,
distinct,
filter: _filter,
order_by: _order_by,
null_treatment,
}) => {
func_def.hash(hasher);
distinct.hash(hasher);
null_treatment.hash(hasher);
}
Expr::WindowFunction(WindowFunction {
fun,
args: _args,
partition_by: _partition_by,
order_by: _order_by,
window_frame,
null_treatment,
}) => {
fun.hash(hasher);
window_frame.hash(hasher);
null_treatment.hash(hasher);
}
Expr::InList(InList {
expr: _expr,
list: _list,
negated,
}) => {
negated.hash(hasher);
}
Expr::Exists(Exists { subquery, negated }) => {
// TODO: do we need to hash the whole subquery?
subquery.hash(hasher);
negated.hash(hasher);
}
Expr::InSubquery(InSubquery {
expr: _expr,
subquery,
negated,
}) => {
// TODO: do we need to hash the whole subquery?
subquery.hash(hasher);
negated.hash(hasher);
}
Expr::ScalarSubquery(subquery) => {
// TODO: do we need to hash the whole subquery?
subquery.hash(hasher);
}
Expr::Wildcard { qualifier } => {
qualifier.hash(hasher);
}
Expr::GroupingSet(grouping_set) => match grouping_set {
GroupingSet::Rollup(_exprs) => {
hasher.write_u8(0);
}
GroupingSet::Cube(_exprs) => {
hasher.write_u8(1);
}
GroupingSet::GroupingSets(_exprs) => {
hasher.write_u8(2);
}
},
Expr::Placeholder(place_holder) => {
place_holder.hash(hasher);
}
Expr::OuterReferenceColumn(data_type, column) => {
data_type.hash(hasher);
column.hash(hasher);
}
Expr::Unnest(Unnest { expr: _expr }) => {}
};
}
}

// modifies expr if it is a placeholder with datatype of right
Expand Down
Loading

0 comments on commit 14644c3

Please sign in to comment.