From 5334e777f57b8050d926f0cecd92ce14a46ce13e Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Wed, 13 Nov 2024 20:56:02 -0500 Subject: [PATCH 1/3] fix implicit casting --- .../expr/src/type_coercion/functions.rs | 27 ++-- datafusion/functions/src/regex/regexpmatch.rs | 6 +- datafusion/sqllogictest/test_files/regexp.slt | 130 ++++++++++++++++++ 3 files changed, 150 insertions(+), 13 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 6836713d8016..8983a2304776 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -424,15 +424,24 @@ fn get_valid_types( let mut new_types = Vec::with_capacity(current_types.len()); for data_type in current_types.iter() { let logical_data_type: NativeType = data_type.into(); - if logical_data_type == NativeType::String { - new_types.push(data_type.to_owned()); - } else if logical_data_type == NativeType::Null { - // TODO: Switch to Utf8View if all the string functions supports Utf8View - new_types.push(DataType::Utf8); - } else { - return plan_err!( - "The signature expected NativeType::String but received {logical_data_type}" - ); + match logical_data_type { + NativeType::String => { + new_types.push(data_type.to_owned()); + } + // Allow implicit casting + NativeType::Null + | NativeType::Int32 + | NativeType::Int64 + | NativeType::Float32 + | NativeType::Float64 + | NativeType::Boolean => { + new_types.push(DataType::Utf8); + } + _ => { + return plan_err!( + "The signature expected NativeType::String but received {logical_data_type}" + ); + } } } diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 019666bd7b2d..e58e17e44b7d 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -53,10 +53,8 @@ impl RegexpMatchFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`. // If that fails, it proceeds to `(LargeUtf8, Utf8)`. // TODO: Native support Utf8View for regexp_match. - TypeSignature::Exact(vec![Utf8, Utf8]), - TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), - TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), - TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), + TypeSignature::String(2), + TypeSignature::String(3), ], Volatility::Immutable, ), diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index 800026dd766d..0da2c94e5bb9 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -269,6 +269,136 @@ SELECT regexp_match('(?<=[A-Z]\w )Smith', 'John Smith', 'i'); ---- NULL +query ? +SELECT regexp_match(12345, '(\d{5})'); +---- +[12345] + +query ? +SELECT regexp_match(arrow_cast(12345, 'Int64'), '(\d{5})'); +---- +[12345] + +query ? +SELECT regexp_match(arrow_cast(-12345, 'Int32'), '^-\d{5}$'); +---- +[-12345] + +query ? +SELECT regexp_match(arrow_cast(1234567890123, 'Int64'), '(\d{13})'); +---- +[1234567890123] + +query ? +SELECT regexp_match(arrow_cast(12345, 'Int64'), '(\d{6})'); +---- +NULL + +query ? +SELECT regexp_match(arrow_cast(123.45, 'Float32'), '(\d+\.\d+)'); +---- +[123.45] + +query ? +SELECT regexp_match(arrow_cast(100.0, 'Float32'), '^100\.0$'); +---- +[100.0] + +query ? +SELECT regexp_match(arrow_cast(-456.78, 'Float32'), '^-\d+\.\d+$'); +---- +[-456.78] + +query ? +SELECT regexp_match(arrow_cast(123456.789012, 'Float64'), '(\d+\.\d{6})'); +---- +[123456.789012] + +query ? +SELECT regexp_match(arrow_cast(789.0, 'Float64'), '789\.00'); +---- +NULL + +query ? +SELECT regexp_match(arrow_cast(TRUE, 'Boolean'), '(true|false)'); +---- +[true] + +query ? +SELECT regexp_match(arrow_cast(FALSE, 'Boolean'), '(true|false)'); +---- +[false] + +query ? +SELECT regexp_match(arrow_cast(TRUE, 'Boolean'), '^true$'); +---- +[true] + +query ? +SELECT regexp_match(arrow_cast(FALSE, 'Boolean'), '^false$'); +---- +[false] + +query ? +SELECT regexp_match('abc-123', arrow_cast(123, 'Int32')); +---- +[123] + +query ? +SELECT regexp_match('abc-true', arrow_cast(TRUE, 'Boolean')); +---- +[true] + +query ? +SELECT regexp_match('value-456.78', arrow_cast(456.78, 'Float64')); +---- +[456.78] + +query ? +SELECT regexp_match(arrow_cast(789, 'Int32'), arrow_cast(789.0, 'Float64')); +---- +NULL + +query ? +SELECT regexp_match(arrow_cast(456.0, 'Float32'), arrow_cast(FALSE, 'Boolean')); +---- +NULL + +query ? +SELECT regexp_match(arrow_cast(NULL, 'Int32'), '.*-(\d)'); +---- +NULL + +query ? +SELECT regexp_match('aaa-0', arrow_cast(NULL, 'Boolean')); +---- +NULL + +query ? +SELECT regexp_match(arrow_cast(NULL, 'Float32'), arrow_cast(NULL, 'Boolean')); +---- +NULL + +query ? +SELECT regexp_match(arrow_cast(0, 'Int32'), '^0$'); +---- +[0] + +query ? +SELECT regexp_match(arrow_cast(FALSE, 'Boolean'), '^false$'); +---- +[false] + +query ? +SELECT regexp_match(arrow_cast(123.456, 'Float64'), '^123\.456$'); +---- +[123.456] + +query ? +SELECT regexp_match(arrow_cast(-789.012, 'Float64'), '^-\d+\.\d{3}$'); +---- +[-789.012] + # ported test query ? SELECT regexp_match('aaa-555', '.*-(\d*)'); From 7c83e8c9750ad5fb81540cd4b4b427c03b4bef9d Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Wed, 13 Nov 2024 20:59:15 -0500 Subject: [PATCH 2/3] put back todo --- datafusion/expr/src/type_coercion/functions.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 8983a2304776..c81d59267c7f 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -435,6 +435,7 @@ fn get_valid_types( | NativeType::Float32 | NativeType::Float64 | NativeType::Boolean => { + // TODO: Switch to Utf8View if all the string functions supports Utf8View new_types.push(DataType::Utf8); } _ => { From eaa6208113366edf0f329e32b80395bed3b13103 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Thu, 14 Nov 2024 13:49:05 -0500 Subject: [PATCH 3/3] fixes --- datafusion/functions/src/regex/regexpmatch.rs | 6 +- datafusion/sqllogictest/test_files/expr.slt | 36 +++++ datafusion/sqllogictest/test_files/regexp.slt | 130 ------------------ datafusion/sqllogictest/test_files/scalar.slt | 3 - 4 files changed, 40 insertions(+), 135 deletions(-) diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index e58e17e44b7d..019666bd7b2d 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -53,8 +53,10 @@ impl RegexpMatchFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`. // If that fails, it proceeds to `(LargeUtf8, Utf8)`. // TODO: Native support Utf8View for regexp_match. - TypeSignature::String(2), - TypeSignature::String(3), + TypeSignature::Exact(vec![Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index c653113fd438..b2df1cdb40ba 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -510,6 +510,42 @@ SELECT ltrim(NULL, 'xyz') ---- NULL +# implicit casting with TypeSignature test +query T +SELECT ltrim(NULL, NULL) +---- +NULL + +query T +SELECT ltrim(12345, '1') +---- +2345 + +query T +SELECT ltrim(10020, '0') +---- +10020 + +query T +SELECT ltrim(12.345, '12') +---- +.345 + +query T +SELECT ltrim(0.00123, '0.') +---- +123 + +query T +SELECT ltrim(true, 't') +---- +rue + +query T +SELECT ltrim(false, 'f') +---- +alse + query I SELECT octet_length('') ---- diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index 0da2c94e5bb9..800026dd766d 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -269,136 +269,6 @@ SELECT regexp_match('(?<=[A-Z]\w )Smith', 'John Smith', 'i'); ---- NULL -query ? -SELECT regexp_match(12345, '(\d{5})'); ----- -[12345] - -query ? -SELECT regexp_match(arrow_cast(12345, 'Int64'), '(\d{5})'); ----- -[12345] - -query ? -SELECT regexp_match(arrow_cast(-12345, 'Int32'), '^-\d{5}$'); ----- -[-12345] - -query ? -SELECT regexp_match(arrow_cast(1234567890123, 'Int64'), '(\d{13})'); ----- -[1234567890123] - -query ? -SELECT regexp_match(arrow_cast(12345, 'Int64'), '(\d{6})'); ----- -NULL - -query ? -SELECT regexp_match(arrow_cast(123.45, 'Float32'), '(\d+\.\d+)'); ----- -[123.45] - -query ? -SELECT regexp_match(arrow_cast(100.0, 'Float32'), '^100\.0$'); ----- -[100.0] - -query ? -SELECT regexp_match(arrow_cast(-456.78, 'Float32'), '^-\d+\.\d+$'); ----- -[-456.78] - -query ? -SELECT regexp_match(arrow_cast(123456.789012, 'Float64'), '(\d+\.\d{6})'); ----- -[123456.789012] - -query ? -SELECT regexp_match(arrow_cast(789.0, 'Float64'), '789\.00'); ----- -NULL - -query ? -SELECT regexp_match(arrow_cast(TRUE, 'Boolean'), '(true|false)'); ----- -[true] - -query ? -SELECT regexp_match(arrow_cast(FALSE, 'Boolean'), '(true|false)'); ----- -[false] - -query ? -SELECT regexp_match(arrow_cast(TRUE, 'Boolean'), '^true$'); ----- -[true] - -query ? -SELECT regexp_match(arrow_cast(FALSE, 'Boolean'), '^false$'); ----- -[false] - -query ? -SELECT regexp_match('abc-123', arrow_cast(123, 'Int32')); ----- -[123] - -query ? -SELECT regexp_match('abc-true', arrow_cast(TRUE, 'Boolean')); ----- -[true] - -query ? -SELECT regexp_match('value-456.78', arrow_cast(456.78, 'Float64')); ----- -[456.78] - -query ? -SELECT regexp_match(arrow_cast(789, 'Int32'), arrow_cast(789.0, 'Float64')); ----- -NULL - -query ? -SELECT regexp_match(arrow_cast(456.0, 'Float32'), arrow_cast(FALSE, 'Boolean')); ----- -NULL - -query ? -SELECT regexp_match(arrow_cast(NULL, 'Int32'), '.*-(\d)'); ----- -NULL - -query ? -SELECT regexp_match('aaa-0', arrow_cast(NULL, 'Boolean')); ----- -NULL - -query ? -SELECT regexp_match(arrow_cast(NULL, 'Float32'), arrow_cast(NULL, 'Boolean')); ----- -NULL - -query ? -SELECT regexp_match(arrow_cast(0, 'Int32'), '^0$'); ----- -[0] - -query ? -SELECT regexp_match(arrow_cast(FALSE, 'Boolean'), '^false$'); ----- -[false] - -query ? -SELECT regexp_match(arrow_cast(123.456, 'Float64'), '^123\.456$'); ----- -[123.456] - -query ? -SELECT regexp_match(arrow_cast(-789.012, 'Float64'), '^-\d+\.\d{3}$'); ----- -[-789.012] - # ported test query ? SELECT regexp_match('aaa-555', '.*-(\d*)'); diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index fe7d1a90c5bd..1ab5444db1a3 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1940,9 +1940,6 @@ select position('' in '') ---- 1 -query error DataFusion error: Error during planning: Error during planning: The signature expected NativeType::String but received NativeType::Int64 -select position(1 in 1) - query I select strpos('abc', 'c'); ----