Skip to content

Commit

Permalink
Migrate Regex Functions from static docs (apache#12886)
Browse files Browse the repository at this point in the history
* regex migrate

* small fixes

* update docs

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
2 people authored and hailelagi committed Oct 14, 2024
1 parent c29e25b commit 9159245
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 102 deletions.
49 changes: 47 additions & 2 deletions datafusion/functions/src/regex/regexpmatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
use datafusion_expr::{ColumnarValue, TypeSignature};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};

#[derive(Debug)]
pub struct RegexpMatchFunc {
Expand Down Expand Up @@ -106,7 +107,51 @@ impl ScalarUDFImpl for RegexpMatchFunc {
result.map(ColumnarValue::Array)
}
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_regexp_match_doc())
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_regexp_match_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder()
.with_doc_section(DOC_SECTION_REGEX)
.with_description("Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.")
.with_syntax_example("regexp_match(str, regexp[, flags])")
.with_sql_example(r#"```sql
> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#)
.with_standard_argument("str", "String")
.with_argument("regexp","Regular expression to match against.
Can be a constant, column, or function.")
.with_argument("flags",
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
.build()
.unwrap()
})
}

fn regexp_match_func(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8 => regexp_match::<i32>(args),
Expand Down
52 changes: 49 additions & 3 deletions datafusion/functions/src/regex/regexpreplace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,15 @@ use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
use datafusion_expr::function::Hint;
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
use regex::Regex;
use std::any::Any;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::OnceLock;
use std::sync::{Arc, OnceLock};

#[derive(Debug)]
pub struct RegexpReplaceFunc {
signature: Signature,
Expand Down Expand Up @@ -123,6 +124,51 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
result.map(ColumnarValue::Array)
}
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_regexp_replace_doc())
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_regexp_replace_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder()
.with_doc_section(DOC_SECTION_REGEX)
.with_description("Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).")
.with_syntax_example("regexp_replace(str, regexp, replacement[, flags])")
.with_sql_example(r#"```sql
> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
+------------------------------------------------------------------------+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
+------------------------------------------------------------------------+
| fooXarYXazY |
+------------------------------------------------------------------------+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
+-------------------------------------------------------------------+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
+-------------------------------------------------------------------+
| aAbBac |
+-------------------------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#)
.with_standard_argument("str", "String")
.with_argument("regexp","Regular expression to match against.
Can be a constant, column, or function.")
.with_standard_argument("replacement", "Replacement string")
.with_argument("flags",
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **g**: (global) Search globally and don't return after the first match
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
.build()
.unwrap()
})
}

fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
Expand Down
97 changes: 0 additions & 97 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -563,103 +563,6 @@ See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/

See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html)

## Regular Expression Functions

Apache DataFusion uses a [PCRE-like] regular expression [syntax]
(minus support for several features including look-around and backreferences).
The following regular expression functions are supported:

- [regexp_match](#regexp_match)
- [regexp_replace](#regexp_replace)

[pcre-like]: https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions
[syntax]: https://docs.rs/regex/latest/regex/#syntax

### `regexp_match`

Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.

```
regexp_match(str, regexp[, flags])
```

#### Arguments

- **str**: String expression to operate on.
Can be a constant, column, or function, and any combination of string operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **flags**: Optional regular expression flags that control the behavior of the
regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `regexp_replace`

Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).

```
regexp_replace(str, regexp, replacement[, flags])
```

#### Arguments

- **str**: String expression to operate on.
Can be a constant, column, or function, and any combination of string operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **replacement**: Replacement string expression.
Can be a constant, column, or function, and any combination of string operators.
- **flags**: Optional regular expression flags that control the behavior of the
regular expression. The following flags are supported:
- **g**: (global) Search globally and don't return after the first match
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
+------------------------------------------------------------------------+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
+------------------------------------------------------------------------+
| fooXarYXazY |
+------------------------------------------------------------------------+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
+-------------------------------------------------------------------+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
+-------------------------------------------------------------------+
| aAbBac |
+-------------------------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `position`

Returns the position of `substr` in `origstr` (counting from 1). If `substr` does
Expand Down
82 changes: 82 additions & 0 deletions docs/source/user-guide/sql/scalar_functions_new.md
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,8 @@ regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
The following regular expression functions are supported:

- [regexp_like](#regexp_like)
- [regexp_match](#regexp_match)
- [regexp_replace](#regexp_replace)

### `regexp_like`

Expand Down Expand Up @@ -1230,6 +1232,86 @@ SELECT regexp_like('aBc', '(b|d)', 'i');

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `regexp_match`

Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.

```
regexp_match(str, regexp[, flags])
```

#### Arguments

- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `regexp_replace`

Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).

```
regexp_replace(str, regexp, replacement[, flags])
```

#### Arguments

- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **replacement**: Replacement string expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **g**: (global) Search globally and don't return after the first match
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
+------------------------------------------------------------------------+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
+------------------------------------------------------------------------+
| fooXarYXazY |
+------------------------------------------------------------------------+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
+-------------------------------------------------------------------+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
+-------------------------------------------------------------------+
| aAbBac |
+-------------------------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

## Time and Date Functions

- [to_date](#to_date)
Expand Down

0 comments on commit 9159245

Please sign in to comment.