Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate Regex Functions from static docs #12886

Merged
merged 4 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions datafusion/functions/src/regex/regexpmatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
use datafusion_expr::{ColumnarValue, TypeSignature};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};

#[derive(Debug)]
pub struct RegexpMatchFunc {
Expand Down Expand Up @@ -106,7 +107,51 @@ impl ScalarUDFImpl for RegexpMatchFunc {
result.map(ColumnarValue::Array)
}
}

/// Returns the user-facing documentation registered for `regexp_match`.
///
/// Always yields `Some`; the value comes from `get_regexp_match_doc`.
fn documentation(&self) -> Option<&Documentation> {
    let doc: &'static Documentation = get_regexp_match_doc();
    Some(doc)
}
}

/// Cache for the `regexp_match` documentation; filled on the first call to
/// `get_regexp_match_doc` via `OnceLock::get_or_init`.
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the cached [`Documentation`] for the `regexp_match` SQL function.
///
/// The documentation is assembled on the first call and stored in
/// `DOCUMENTATION`; every later call returns the same `'static` reference.
///
/// # Panics
///
/// Panics on first use if the documentation builder fails to build.
fn get_regexp_match_doc() -> &'static Documentation {
    DOCUMENTATION.get_or_init(|| {
        Documentation::builder()
            .with_doc_section(DOC_SECTION_REGEX)
            .with_description("Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.")
            .with_syntax_example("regexp_match(str, regexp[, flags])")
            // Both statements are written without a shell prompt so the two
            // examples are formatted the same way.
            .with_sql_example(r#"```sql
select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#)
            .with_standard_argument("str", "String")
            .with_argument("regexp","Regular expression to match against.
Can be a constant, column, or function.")
            // Unlike `regexp_replace`, no `g` (global) flag is listed here.
            .with_argument("flags",
                r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
            .build()
            // Every input above is a compile-time constant, so a failure here
            // is a programming error rather than a runtime condition.
            .expect("regexp_match documentation is statically defined and must build")
    })
}

fn regexp_match_func(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8 => regexp_match::<i32>(args),
Expand Down
52 changes: 49 additions & 3 deletions datafusion/functions/src/regex/regexpreplace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,15 @@ use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
use datafusion_expr::function::Hint;
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
use regex::Regex;
use std::any::Any;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::OnceLock;
use std::sync::{Arc, OnceLock};

#[derive(Debug)]
pub struct RegexpReplaceFunc {
signature: Signature,
Expand Down Expand Up @@ -123,6 +124,51 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
result.map(ColumnarValue::Array)
}
}

/// Returns the user-facing documentation registered for `regexp_replace`.
///
/// Always yields `Some`; the value comes from `get_regexp_replace_doc`.
fn documentation(&self) -> Option<&Documentation> {
    let doc: &'static Documentation = get_regexp_replace_doc();
    Some(doc)
}
}

/// Cache for the `regexp_replace` documentation; filled on the first call to
/// `get_regexp_replace_doc` via `OnceLock::get_or_init`.
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the cached [`Documentation`] for the `regexp_replace` SQL function.
///
/// The documentation is assembled on the first call and stored in
/// `DOCUMENTATION`; every later call returns the same `'static` reference.
///
/// # Panics
///
/// Panics on first use if the documentation builder fails to build.
fn get_regexp_replace_doc() -> &'static Documentation {
    DOCUMENTATION.get_or_init(|| {
        Documentation::builder()
            .with_doc_section(DOC_SECTION_REGEX)
            .with_description("Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).")
            .with_syntax_example("regexp_replace(str, regexp, replacement[, flags])")
            // Both statements use the same unprompted `SELECT` form so the two
            // examples are formatted the same way.
            .with_sql_example(r#"```sql
SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
+------------------------------------------------------------------------+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
+------------------------------------------------------------------------+
| fooXarYXazY |
+------------------------------------------------------------------------+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
+-------------------------------------------------------------------+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
+-------------------------------------------------------------------+
| aAbBac |
+-------------------------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#)
            .with_standard_argument("str", "String")
            .with_argument("regexp","Regular expression to match against.
Can be a constant, column, or function.")
            // NOTE(review): the standard-argument template renders this as
            // "Replacement string expression to operate on. ...", which reads
            // oddly for a replacement value; consider a hand-written
            // `with_argument` description and regenerate the user guide.
            .with_standard_argument("replacement", "Replacement string")
            .with_argument("flags",
                r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **g**: (global) Search globally and don't return after the first match
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
            .build()
            // Every input above is a compile-time constant, so a failure here
            // is a programming error rather than a runtime condition.
            .expect("regexp_replace documentation is statically defined and must build")
    })
}

fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
Expand Down
97 changes: 0 additions & 97 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -563,103 +563,6 @@ See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/

See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html)

## Regular Expression Functions

Apache DataFusion uses a [PCRE-like] regular expression [syntax]
(minus support for several features including look-around and backreferences).
The following regular expression functions are supported:

- [regexp_match](#regexp_match)
- [regexp_replace](#regexp_replace)

[pcre-like]: https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions
[syntax]: https://docs.rs/regex/latest/regex/#syntax

### `regexp_match`

Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.

```
regexp_match(str, regexp[, flags])
```

#### Arguments

- **str**: String expression to operate on.
Can be a constant, column, or function, and any combination of string operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **flags**: Optional regular expression flags that control the behavior of the
regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `regexp_replace`

Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).

```
regexp_replace(str, regexp, replacement[, flags])
```

#### Arguments

- **str**: String expression to operate on.
Can be a constant, column, or function, and any combination of string operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **replacement**: Replacement string expression.
Can be a constant, column, or function, and any combination of string operators.
- **flags**: Optional regular expression flags that control the behavior of the
regular expression. The following flags are supported:
- **g**: (global) Search globally and don't return after the first match
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
+------------------------------------------------------------------------+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
+------------------------------------------------------------------------+
| fooXarYXazY |
+------------------------------------------------------------------------+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
+-------------------------------------------------------------------+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
+-------------------------------------------------------------------+
| aAbBac |
+-------------------------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `position`

Returns the position of `substr` in `origstr` (counting from 1). If `substr` does
Expand Down
82 changes: 82 additions & 0 deletions docs/source/user-guide/sql/scalar_functions_new.md
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,8 @@ regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
The following regular expression functions are supported:

- [regexp_like](#regexp_like)
- [regexp_match](#regexp_match)
- [regexp_replace](#regexp_replace)

### `regexp_like`

Expand Down Expand Up @@ -1230,6 +1232,86 @@ SELECT regexp_like('aBc', '(b|d)', 'i');

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `regexp_match`

Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.

```
regexp_match(str, regexp[, flags])
```

#### Arguments

- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

### `regexp_replace`

Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).

```
regexp_replace(str, regexp, replacement[, flags])
```

#### Arguments

- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **regexp**: Regular expression to match against.
Can be a constant, column, or function.
- **replacement**: Replacement string expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **g**: (global) Search globally and don't return after the first match
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

#### Example

```sql
SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
+------------------------------------------------------------------------+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
+------------------------------------------------------------------------+
| fooXarYXazY |
+------------------------------------------------------------------------+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
+-------------------------------------------------------------------+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
+-------------------------------------------------------------------+
| aAbBac |
+-------------------------------------------------------------------+
```

Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)

## Time and Date Functions

- [to_date](#to_date)
Expand Down
Loading