Skip to content

Commit

Permalink
Migrate documentation for all core functions from scalar_functions.md…
Browse files Browse the repository at this point in the history
… to code (#12854)

* Migrate documentation for all core functions from scalar_functions.md to code #12801

* Fixed formatting issue, regenerated documentation

* Update docs/source/user-guide/sql/scalar_functions.md

Co-authored-by: Andrew Lamb <[email protected]>

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
Omega359 and alamb authored Oct 11, 2024
1 parent 3bc7714 commit eddade7
Showing 14 changed files with 768 additions and 267 deletions.
2 changes: 1 addition & 1 deletion datafusion/core/src/bin/print_functions_docs.rs
Original file line number Diff line number Diff line change
@@ -108,7 +108,7 @@ fn print_docs(
.collect::<Vec<_>>();

// write out section header
let _ = writeln!(docs, "## {} ", doc_section.label);
let _ = writeln!(docs, "\n## {} \n", doc_section.label);

if let Some(description) = doc_section.description {
let _ = writeln!(docs, "{description}");
2 changes: 1 addition & 1 deletion datafusion/expr/src/udf_docs.rs
Original file line number Diff line number Diff line change
@@ -155,7 +155,7 @@ impl DocumentationBuilder {
///
/// ```text
/// <arg_name>:
/// <expression_type> expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
/// <expression_type> expression to operate on. Can be a constant, column, or function, and any combination of operators.
/// ```
pub fn with_standard_argument(
self,
41 changes: 38 additions & 3 deletions datafusion/functions/src/core/arrow_cast.rs
Original file line number Diff line number Diff line change
@@ -17,17 +17,19 @@

//! [`ArrowCastFunc`]: Implementation of the `arrow_cast`
use std::any::Any;

use arrow::datatypes::DataType;
use datafusion_common::{
arrow_datafusion_err, internal_err, plan_datafusion_err, plan_err, DataFusionError,
ExprSchema, Result, ScalarValue,
};
use std::any::Any;
use std::sync::OnceLock;

use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER;
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
use datafusion_expr::{
ColumnarValue, Expr, ExprSchemable, ScalarUDFImpl, Signature, Volatility,
ColumnarValue, Documentation, Expr, ExprSchemable, ScalarUDFImpl, Signature,
Volatility,
};

/// Implements casting to arbitrary arrow types (rather than SQL types)
@@ -131,6 +133,39 @@ impl ScalarUDFImpl for ArrowCastFunc {
// return the newly written argument to DataFusion
Ok(ExprSimplifyResult::Simplified(new_expr))
}

/// Exposes the user-facing documentation for `arrow_cast`, as produced by
/// `get_arrow_cast_doc`.
fn documentation(&self) -> Option<&Documentation> {
    let doc = get_arrow_cast_doc();
    Some(doc)
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the documentation for `arrow_cast`.
///
/// The value is built the first time this function is called and stored in
/// `DOCUMENTATION`; later calls return the stored value.
///
/// # Panics
///
/// Panics if the documentation builder reports an error.
fn get_arrow_cast_doc() -> &'static Documentation {
    // The example text is kept flush-left because every byte of the raw string
    // is part of the rendered documentation.
    const SQL_EXAMPLE: &str = r#"```sql
> select arrow_cast(-5, 'Int8') as a,
arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b,
arrow_cast('bar', 'LargeUtf8') as c,
arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d
;
+----+-----+-----+---------------------------+
| a | b | c | d |
+----+-----+-----+---------------------------+
| -5 | foo | bar | 2023-01-02T12:53:02+08:00 |
+----+-----+-----+---------------------------+
```"#;
    const EXPRESSION_ARG: &str = "Expression to cast. The expression can be a constant, column, or function, and any combination of operators.";
    const DATATYPE_ARG: &str = "[Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. The format is the same as that returned by [`arrow_typeof`]";

    DOCUMENTATION.get_or_init(|| {
        let builder = Documentation::builder()
            .with_doc_section(DOC_SECTION_OTHER)
            .with_description("Casts a value to a specific Arrow data type.")
            .with_syntax_example("arrow_cast(expression, datatype)")
            .with_sql_example(SQL_EXAMPLE)
            .with_argument("expression", EXPRESSION_ARG)
            .with_argument("datatype", DATATYPE_ARG);
        builder.build().unwrap()
    })
}

/// Returns the requested type from the arguments
35 changes: 34 additions & 1 deletion datafusion/functions/src/core/arrowtypeof.rs
Original file line number Diff line number Diff line change
@@ -17,9 +17,11 @@

use arrow::datatypes::DataType;
use datafusion_common::{exec_err, Result, ScalarValue};
use datafusion_expr::ColumnarValue;
use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER;
use datafusion_expr::{ColumnarValue, Documentation};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
use std::sync::OnceLock;

#[derive(Debug)]
pub struct ArrowTypeOfFunc {
@@ -69,4 +71,35 @@ impl ScalarUDFImpl for ArrowTypeOfFunc {
"{input_data_type}"
))))
}

/// Exposes the user-facing documentation for `arrow_typeof`, as produced by
/// `get_arrowtypeof_doc`.
fn documentation(&self) -> Option<&Documentation> {
    let doc = get_arrowtypeof_doc();
    Some(doc)
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the documentation for `arrow_typeof`.
///
/// The value is built the first time this function is called and stored in
/// `DOCUMENTATION`; later calls return the stored value.
///
/// # Panics
///
/// Panics if the documentation builder reports an error.
fn get_arrowtypeof_doc() -> &'static Documentation {
    const DESCRIPTION: &str = "Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.";
    // Flush-left on purpose: the raw string (including its trailing newline)
    // is emitted verbatim into the documentation.
    const SQL_EXAMPLE: &str = r#"```sql
> select arrow_typeof('foo'), arrow_typeof(1);
+---------------------------+------------------------+
| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) |
+---------------------------+------------------------+
| Utf8 | Int64 |
+---------------------------+------------------------+
```
"#;
    const EXPRESSION_ARG: &str = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators.";

    DOCUMENTATION.get_or_init(|| {
        let builder = Documentation::builder()
            .with_doc_section(DOC_SECTION_OTHER)
            .with_description(DESCRIPTION)
            .with_syntax_example("arrow_typeof(expression)")
            .with_sql_example(SQL_EXAMPLE)
            .with_argument("expression", EXPRESSION_ARG);
        builder.build().unwrap()
    })
}
43 changes: 26 additions & 17 deletions datafusion/functions/src/core/coalesce.rs
Original file line number Diff line number Diff line change
@@ -47,23 +47,6 @@ impl CoalesceFunc {
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the documentation for `coalesce`.
///
/// The value is built the first time this function is called and stored in
/// `DOCUMENTATION`; later calls return the stored value.
///
/// # Panics
///
/// Panics if the documentation builder reports an error.
fn get_coalesce_doc() -> &'static Documentation {
    /// Assembles the `coalesce` documentation entry.
    fn build_doc() -> Documentation {
        const DESCRIPTION: &str = "Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.";
        const EXPRESSIONS_ARG: &str = "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary.";

        let builder = Documentation::builder()
            .with_doc_section(DOC_SECTION_CONDITIONAL)
            .with_description(DESCRIPTION)
            .with_syntax_example("coalesce(expression1[, ..., expression_n])")
            .with_argument("expression1, expression_n", EXPRESSIONS_ARG);
        builder.build().unwrap()
    }

    DOCUMENTATION.get_or_init(build_doc)
}

impl ScalarUDFImpl for CoalesceFunc {
fn as_any(&self) -> &dyn Any {
self
@@ -164,6 +147,32 @@ impl ScalarUDFImpl for CoalesceFunc {
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the documentation for `coalesce`, including a SQL usage example.
///
/// The value is built the first time this function is called and stored in
/// `DOCUMENTATION`; later calls return the stored value.
///
/// # Panics
///
/// Panics if the documentation builder reports an error.
fn get_coalesce_doc() -> &'static Documentation {
    /// Assembles the `coalesce` documentation entry.
    fn build_doc() -> Documentation {
        const DESCRIPTION: &str = "Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.";
        // Flush-left on purpose: the raw string is emitted verbatim.
        const SQL_EXAMPLE: &str = r#"```sql
> select coalesce(null, null, 'datafusion');
+----------------------------------------+
| coalesce(NULL,NULL,Utf8("datafusion")) |
+----------------------------------------+
| datafusion |
+----------------------------------------+
```"#;
        const EXPRESSIONS_ARG: &str = "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary.";

        let builder = Documentation::builder()
            .with_doc_section(DOC_SECTION_CONDITIONAL)
            .with_description(DESCRIPTION)
            .with_syntax_example("coalesce(expression1[, ..., expression_n])")
            .with_sql_example(SQL_EXAMPLE)
            .with_argument("expression1, expression_n", EXPRESSIONS_ARG);
        builder.build().unwrap()
    }

    DOCUMENTATION.get_or_init(build_doc)
}

#[cfg(test)]
mod test {
use arrow::datatypes::DataType;
74 changes: 64 additions & 10 deletions datafusion/functions/src/core/getfield.rs
Original file line number Diff line number Diff line change
@@ -23,10 +23,11 @@ use datafusion_common::cast::{as_map_array, as_struct_array};
use datafusion_common::{
exec_err, plan_datafusion_err, plan_err, ExprSchema, Result, ScalarValue,
};
use datafusion_expr::{ColumnarValue, Expr, ExprSchemable};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER;
use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};

#[derive(Debug)]
pub struct GetFieldFunc {
@@ -133,7 +134,7 @@ impl ScalarUDFImpl for GetFieldFunc {
DataType::Struct(fields) if fields.len() == 2 => {
// Arrow's MapArray is essentially a ListArray of structs with two columns. They are
// often named "key", and "value", but we don't require any specific naming here;
// instead, we assume that the second columnis the "value" column both here and in
// instead, we assume that the second column is the "value" column both here and in
// execution.
let value_field = fields.get(1).expect("fields should have exactly two members");
Ok(value_field.data_type().clone())
@@ -155,7 +156,7 @@ impl ScalarUDFImpl for GetFieldFunc {
"Only UTF8 strings are valid as an indexed field in a struct"
),
(DataType::Null, _) => Ok(DataType::Null),
(other, _) => plan_err!("The expression to get an indexed field is only valid for `List`, `Struct`, `Map` or `Null` types, got {other}"),
(other, _) => plan_err!("The expression to get an indexed field is only valid for `Struct`, `Map` or `Null` types, got {other}"),
}
}

@@ -190,7 +191,7 @@ impl ScalarUDFImpl for GetFieldFunc {
let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?;

// note that this array has more entries than the expected output/input size
// because maparray is flatten
// because map_array is flattened
let original_data = map_array.entries().column(1).to_data();
let capacity = Capacities::Array(original_data.len());
let mut mutable =
@@ -205,7 +206,7 @@ impl ScalarUDFImpl for GetFieldFunc {
keys.slice(start, end-start).
iter().enumerate().
find(|(_, t)| t.unwrap());
if maybe_matched.is_none(){
if maybe_matched.is_none() {
mutable.extend_nulls(1);
continue
}
@@ -224,14 +225,67 @@ impl ScalarUDFImpl for GetFieldFunc {
}
}
(DataType::Struct(_), name) => exec_err!(
"get indexed field is only possible on struct with utf8 indexes. \
Tried with {name:?} index"
"get_field is only possible on struct with utf8 indexes. \
Received with {name:?} index"
),
(DataType::Null, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)),
(dt, name) => exec_err!(
"get indexed field is only possible on lists with int64 indexes or struct \
with utf8 indexes. Tried {dt:?} with {name:?} index"
"get_field is only possible on maps with utf8 indexes or struct \
with utf8 indexes. Received {dt:?} with {name:?} index"
),
}
}

/// Exposes the user-facing documentation for `get_field`, as produced by
/// `get_getfield_doc`.
fn documentation(&self) -> Option<&Documentation> {
    let doc = get_getfield_doc();
    Some(doc)
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the documentation for `get_field`.
///
/// The value is built the first time this function is called and stored in
/// `DOCUMENTATION`; later calls return the stored value.
///
/// # Panics
///
/// Panics if the documentation builder reports an error.
fn get_getfield_doc() -> &'static Documentation {
    // Both raw strings below are kept flush-left because their exact bytes
    // (line breaks included) end up in the rendered documentation.
    const DESCRIPTION: &str = r#"Returns a field within a map or a struct with the given key.
Note: most users invoke `get_field` indirectly via field access
syntax such as `my_struct_col['field_name']` which results in a call to
`get_field(my_struct_col, 'field_name')`."#;
    const SQL_EXAMPLE: &str = r#"```sql
> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow');
> select struct(idx, v) from t as c;
+-------------------------+
| struct(c.idx,c.v) |
+-------------------------+
| {c0: data, c1: fusion} |
| {c0: apache, c1: arrow} |
+-------------------------+
> select get_field((select struct(idx, v) from t), 'c0');
+-----------------------+
| struct(t.idx,t.v)[c0] |
+-----------------------+
| data |
| apache |
+-----------------------+
> select get_field((select struct(idx, v) from t), 'c1');
+-----------------------+
| struct(t.idx,t.v)[c1] |
+-----------------------+
| fusion |
| arrow |
+-----------------------+
```
"#;
    const CONTAINER_ARG: &str = "The map or struct to retrieve a field for.";
    const KEY_ARG: &str = "The field name in the map or struct to retrieve data for. Must evaluate to a string.";

    DOCUMENTATION.get_or_init(|| {
        let builder = Documentation::builder()
            .with_doc_section(DOC_SECTION_OTHER)
            .with_description(DESCRIPTION)
            .with_syntax_example("get_field(expression1, expression2)")
            .with_sql_example(SQL_EXAMPLE)
            .with_argument("expression1", CONTAINER_ARG)
            .with_argument("expression2", KEY_ARG);
        builder.build().unwrap()
    })
}
47 changes: 45 additions & 2 deletions datafusion/functions/src/core/named_struct.rs
Original file line number Diff line number Diff line change
@@ -18,11 +18,12 @@
use arrow::array::StructArray;
use arrow::datatypes::{DataType, Field, Fields};
use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
use datafusion_expr::{ColumnarValue, Expr, ExprSchemable};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRUCT;
use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use hashbrown::HashSet;
use std::any::Any;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};

/// put values in a struct array.
fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
@@ -161,4 +162,46 @@ impl ScalarUDFImpl for NamedStructFunc {
/// Evaluates `named_struct` by delegating directly to `named_struct_expr`
/// with the supplied arguments and returning its result unchanged.
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
named_struct_expr(args)
}

/// Exposes the user-facing documentation for `named_struct`, as produced by
/// `get_named_struct_doc`.
fn documentation(&self) -> Option<&Documentation> {
    let doc = get_named_struct_doc();
    Some(doc)
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the documentation for `named_struct`.
///
/// The value is built the first time this function is called and stored in
/// `DOCUMENTATION`; later calls return the stored value.
///
/// # Panics
///
/// Panics if the documentation builder reports an error.
fn get_named_struct_doc() -> &'static Documentation {
    /// Assembles the `named_struct` documentation entry.
    fn build_doc() -> Documentation {
        const SYNTAX: &str = "named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input])";
        // Flush-left on purpose: the raw string, including its leading and
        // trailing newlines, is emitted verbatim.
        const SQL_EXAMPLE: &str = r#"
For example, this query converts two columns `a` and `b` to a single column with
a struct type of fields `field_a` and `field_b`:
```sql
> select * from t;
+---+---+
| a | b |
+---+---+
| 1 | 2 |
| 3 | 4 |
+---+---+
> select named_struct('field_a', a, 'field_b', b) from t;
+-------------------------------------------------------+
| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) |
+-------------------------------------------------------+
| {field_a: 1, field_b: 2} |
| {field_a: 3, field_b: 4} |
+-------------------------------------------------------+
```
"#;
        const NAME_ARG: &str = "Name of the column field. Must be a constant string.";
        const INPUT_ARG: &str = "Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators.";

        let builder = Documentation::builder()
            .with_doc_section(DOC_SECTION_STRUCT)
            .with_description("Returns an Arrow struct using the specified name and input expressions pairs.")
            .with_syntax_example(SYNTAX)
            .with_sql_example(SQL_EXAMPLE)
            .with_argument("expression_n_name", NAME_ARG)
            .with_argument("expression_n_input", INPUT_ARG);
        builder.build().unwrap()
    }

    DOCUMENTATION.get_or_init(build_doc)
}
Loading

0 comments on commit eddade7

Please sign in to comment.