diff --git a/README.md b/README.md index 07cb004..85e92a5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,14 @@ # metasequoia-sql -SQL 语法解析器,包含词法树解析(`lexical` 模块)、语法树解析(`core` 模块)和语法树分析(`analyzer` 模块)功能。 +metasequoia-sql 是一款注重性能的 SQL 语法的解析和分析器,适用于 SQL 的格式化、执行和分析场景,致力于打造性能最高的 Python 版 SQL 解析器。具有如下 3 个主要特性: -自 0.6.0 版本起,对外暴露的 API 将支持向前兼容。 +- 词法解析器与语法解析器相互独立,支持插件开发 +- 使用单一状态机实现词法解析,避免大量正则表达式的复杂逻辑 +- 除包含并列关系的节点外(例如 `ORDER BY` 多个字段),抽象语法树为完全的、根据计算优先级嵌套的一元和二元表达式结构 + +metasequoia-sql 包含词法树解析(`lexical` 模块)、语法树解析(`core` 模块)和语法树分析(`analyzer` 模块)等主要功能。 + +自 0.6.0 版本起,metasequoia-sql 的 public 方法 API 将尽可能支持向前兼容。 ## 安装方法 @@ -12,28 +18,69 @@ pip install metasequoia-sql ## 使用方法 -### 词法树解析 +### 词法解析 + +单纯的词法解析,可以应用于 SQL 语句格式化等场景。 -将 SQL 语句解析为一个抽象词法树节点的列表: +将 SQL 语句解析为一个词法节点的列表(demo_101),节点中包含对应的源代码(`source` 属性)以及节点标签(`marks` 属性)。 ```python from metasequoia_sql import FSMMachine -FSMMachine.parse("your sql") + +amt_tree = FSMMachine.parse("SELECT column1, '2' FROM table_1") +for node in amt_tree: + print(node) ``` -### 语法树解析 +对于有括号的 SQL 语句,会将括号生成一个 `AMTParenthesis` 类型节点,该节点包含一个 `PARENTHESIS` +标记,括号中的词法节点会被添加到括号节点的 `children` 属性中(demo_102)。 + +```python +from metasequoia_sql import FSMMachine + +amt_tree = FSMMachine.parse("SELECT column1, (column2 + 1) * 2 FROM table_1") +for node in amt_tree: + print(node) +``` -将 SQL 语句解析为一个抽象语法树,支持一次性解析多个 SQL 语句,支持解析 SQL 语句中的某个语法元素: +### 语法解析 + +将 SQL 语句解析为一个抽象语法树,返回抽象语法树的根节点。 + +语法解析支持一次性解析一个语句(demo_201): ```python -from metasequoia_sql import * +from metasequoia_sql import SQLParser -statements = SQLParser.parse_statements("your sql file") +statement = SQLParser.parse_select_statement("SELECT column1, '2' FROM table_1") +print(statement) ``` -### 语法树分析:数据血缘分析 +也支持一次性解析多个语句(demo_202): + +```python +from metasequoia_sql import SQLParser + +statements = SQLParser.parse_statements("SELECT column1 FROM table_1; SELECT column2 FROM table_2") +for statement in statements: + print(statement) +``` -分析 INSERT 语句的数据血缘。数据血缘分析需要依赖元数据,所以需要根据你的数据源继承 
`CreateTableStatementGetter` 类并提供给数据血缘分析器。 +此外,也可以解析语句中的某个部分(demo_203): + +```python +from metasequoia_sql import SQLParser + +expression = SQLParser.parse_logical_and_level_expression("(`column1` > 2) AND (`column2` > 1)") +print(expression) +``` + +### 应用样例:数据血缘分析 + +通过基于语法解析器的数据血缘分析工具,可以实现对 SQL 语句的分析。例如: + +分析 INSERT 语句的数据血缘。数据血缘分析需要依赖元数据,所以需要根据你的数据源继承 `CreateTableStatementGetter` +类并提供给数据血缘分析器(demo_301): ```python from metasequoia_sql import * @@ -46,31 +93,56 @@ for statement in SQLParser.parse_statements("your sql file"): result = table_lineage_analyzer.get_insert_table_lineage(statement) ``` -## 实现原理 +### 插件样例:MyBatis 插件(暂未完善) -将词法分析与句法分析分离,先对解析 SQL 语句生成抽象词法树,然后解析抽象词法树生成抽象语法树。 +通过重写了语法解析器和词法解析器的插件,可以实现对特殊 SQL 语法的解析。例如: -在词法分析中,使用有限状态自动机进行解析。 +对 MyBatis 语法进行解析(demo_302): -在语法分析中,根据语法结构确定可能得元素类型后进行解析。 +```python +from metasequoia_sql.plugins.mybaitis import SQLParserMyBatis -## 参与贡献 +statements = SQLParserMyBatis.parse_statements("SELECT column_1 FROM Shohin " + "WHERE #{column_2} > 500 " + "GROUP BY #{column_3}") +for statement in statements: + print(statement) +``` + +### 工具样例:SQL on OTS(暂未发布) + +基于语法解析器,可以实现一些执行 SQL 的工具。 + +## 性能比较 -### 已知问题 +- 测试样本:4482 个脚本,共 19880057 字节(18.96 MB)的 SQL 语句。 +- 测试 Python 环境:Python 3.10 +- 测试 CPU:Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz -- MySQL 中,使用连续的 `!` 符号的场景 +| | 解析时间 | 平均解析速度 | +|-----------------|----------|--------------| +| metasequoia-sql | 65.28 秒 | 297.4 KB / s | +| sqlglot | 182.74 秒 | 106.2 KB / s | + +## 基本特性 + +- 词法解析器与句法解析器分离 +- 使用单一、独立的状态机实现词法解析 +- 除了逻辑并列的场景外,抽象语法树为完全的嵌套二元表达式 + +## 参与贡献 - +单元测试(当前不会自动检查): -### 提交前自检 +运行 `scripts/test/test_main.py` 脚本。如果有新增功能,也需要新增对应的单元测试。 -pylint 检查:在 Pull Request 时会自动执行检查。 +pylint 代码质量检查(在 Pull Request 时自动检查): ```bash pylint --max-line-length=120 metasequoia_sql ``` -单元测试覆盖率检查: +单元测试覆盖率检查(当前不会自动检查): ```bash # 将 metasequoia-sql 文件夹添加到 PYTHONPATH,并在 metasequoia-sql 文件夹下执行 diff --git a/demo/__init__.py b/demo/__init__.py new file mode 100644 index 0000000..e69de29 diff 
--git a/demo/demo_101.py b/demo/demo_101.py new file mode 100644 index 0000000..6a0ceb4 --- /dev/null +++ b/demo/demo_101.py @@ -0,0 +1,16 @@ +""" +词法解析 Demo + + + + + + + +""" + +from metasequoia_sql import FSMMachine + +amt_tree = FSMMachine.parse("SELECT column1, '2' FROM table_1") +for node in amt_tree: + print(node) diff --git a/demo/demo_102.py b/demo/demo_102.py new file mode 100644 index 0000000..3ba299f --- /dev/null +++ b/demo/demo_102.py @@ -0,0 +1,18 @@ +""" +词法解析 Demo + + + + +, , ] marks=PARENTHESIS> + + + + +""" + +from metasequoia_sql import FSMMachine + +amt_tree = FSMMachine.parse("SELECT column1, (column2 + 1) * 2 FROM table_1") +for node in amt_tree: + print(node) diff --git a/demo/demo_201.py b/demo/demo_201.py new file mode 100644 index 0000000..8fbd969 --- /dev/null +++ b/demo/demo_201.py @@ -0,0 +1,8 @@ +""" +语法解析 Demo +""" + +from metasequoia_sql import SQLParser + +statement = SQLParser.parse_select_statement("SELECT column1, '2' FROM table_1") +print(statement) diff --git a/demo/demo_202.py b/demo/demo_202.py new file mode 100644 index 0000000..12eb2d2 --- /dev/null +++ b/demo/demo_202.py @@ -0,0 +1,9 @@ +""" +语法解析 Demo +""" + +from metasequoia_sql import SQLParser + +statements = SQLParser.parse_statements("SELECT column1 FROM table_1; SELECT column2 FROM table_2") +for statement in statements: + print(statement) diff --git a/demo/demo_203.py b/demo/demo_203.py new file mode 100644 index 0000000..54f7403 --- /dev/null +++ b/demo/demo_203.py @@ -0,0 +1,8 @@ +""" +语法解析 Demo +""" + +from metasequoia_sql import SQLParser + +expression = SQLParser.parse_logical_and_level_expression("(`column1` > 2) AND (`column2` > 1)") +print(expression) diff --git a/demo/demo_301.py b/demo/demo_301.py new file mode 100644 index 0000000..7a35729 --- /dev/null +++ b/demo/demo_301.py @@ -0,0 +1,13 @@ +""" +应用样例:数据血缘分析 +""" + +from metasequoia_sql import * +from metasequoia_sql.analyzer import CreateTableStatementGetter +from 
metasequoia_sql.analyzer.data_linage.table_lineage_analyzer import TableLineageAnalyzer + +table_lineage_analyzer = TableLineageAnalyzer(CreateTableStatementGetter(...)) +for statement in SQLParser.parse_statements("your sql"): + if isinstance(statement, ASTInsertSelectStatement): + result = table_lineage_analyzer.get_insert_table_lineage(statement) + print(result) diff --git a/demo/demo_302.py b/demo/demo_302.py new file mode 100644 index 0000000..1925ced --- /dev/null +++ b/demo/demo_302.py @@ -0,0 +1,11 @@ +""" +MyBatis 插件 Demo +""" + +from metasequoia_sql.plugins.mybaitis import SQLParserMyBatis + +statements = SQLParserMyBatis.parse_statements("SELECT column_1 FROM Shohin " + "WHERE #{column_2} > 500 " + "GROUP BY #{column_3}") +for statement in statements: + print(statement) diff --git "a/docs/\347\211\210\346\234\254\346\233\264\346\226\260\350\256\260\345\275\225.md" "b/docs/\347\211\210\346\234\254\346\233\264\346\226\260\350\256\260\345\275\225.md" index efd7f88..726625d 100644 --- "a/docs/\347\211\210\346\234\254\346\233\264\346\226\260\350\256\260\345\275\225.md" +++ "b/docs/\347\211\210\346\234\254\346\233\264\346\226\260\350\256\260\345\275\225.md" @@ -1,3 +1,16 @@ +## 0.6.0 + +新增功能: + +- [#16 优化一般表达式的解析逻辑](https://github.com/ChangxingJiang/metasequoia-sql/pull/16) +- [#19 优化抽象语法树节点和语法解析器的实现逻辑](https://github.com/ChangxingJiang/metasequoia-sql/pull/19) +- [#23 抽象语法树名称规范化 + 抽象语法树解析性能优化](https://github.com/ChangxingJiang/metasequoia-sql/pull/23) +- [#25 新增 DELETE 语句解析逻辑](https://github.com/ChangxingJiang/metasequoia-sql/pull/25) + +Bugfix: + +- [#27 修复 Hive 建表语句中 DECIMAL、VARCHAR、CHAR 类型没有参数的问题](https://github.com/ChangxingJiang/metasequoia-sql/pull/27) + ## 0.5.0 新增: diff --git a/metasequoia_sql/lexical/amt_node.py b/metasequoia_sql/lexical/amt_node.py index 131814f..55174f1 100644 --- a/metasequoia_sql/lexical/amt_node.py +++ b/metasequoia_sql/lexical/amt_node.py @@ -84,6 +84,14 @@ def source_in_set_use_upper(self, other: Set[str]) -> bool: """判断当前 AMT 
节点的源代码的 **大写形式** 是否等于 token(适用于比较关键字)""" return self.source.upper() in other + def _get_mark_name_list(self) -> List[str]: + """获取状态压缩的 Mark 对应的标签名称列表""" + result = [] + for mark in AMTMark: + if self.marks & mark.value: + result.append(mark.name) + return result + class AMTSingle(AMTBase): """单元素节点""" @@ -94,8 +102,9 @@ def __init__(self, source: str, marks: int = 0): self.children = [] def __repr__(self) -> str: - format_source = self.source.replace("\n", r"\n") - return f"<{self.__class__.__name__} source={format_source}>" + source_str = self.source.replace("\n", r"\n") + mark_str = "|".join(self._get_mark_name_list()) + return f"<{self.__class__.__name__} source=\"{source_str}\" marks={mark_str}>" class AMTParenthesisBase(AMTBase): @@ -124,7 +133,8 @@ def equals(self, other: Union[str, AMTMark]) -> Union[bool, int]: return False # 插入语不尝试匹配源码值 def __repr__(self) -> str: - return f"<{self.__class__.__name__} children={self.children}>" + mark_str = "|".join(self._get_mark_name_list()) + return f"<{self.__class__.__name__} children={self.children} marks={mark_str}>" class AMTParenthesis(AMTParenthesisBase): diff --git a/metasequoia_sql/plugins/mybaitis.py b/metasequoia_sql/plugins/mybaitis.py index 14d41f8..f833880 100644 --- a/metasequoia_sql/plugins/mybaitis.py +++ b/metasequoia_sql/plugins/mybaitis.py @@ -14,7 +14,7 @@ from typing import Union, List from metasequoia_sql import SQLType, ASTBase -from metasequoia_sql.analyzer import AnalyzerRecursionASTToListBase, CurrentUsedQuoteColumn +from metasequoia_sql.analyzer import AnalyzerRecursionASTToListBase from metasequoia_sql.common import TokenScanner from metasequoia_sql.core import SQLParser, ASTSingleSelectStatement from metasequoia_sql.lexical import FSMMachine, FSMStatus, AMTMark, FSMMemory, FSMOperate @@ -108,22 +108,3 @@ def handle(cls, node: ASTBase) -> List[str]: if isinstance(node, ASTSingleSelectStatement): return cls.handle(node.group_by_clause) return cls.default_handle_node(node) - - -if __name__ == 
"__main__": - def test_main(): - """测试主逻辑""" - test_sql = "SELECT shohin_mei FROM Shohin WHERE #{hanbai_tanka} > 500 GROUP BY #{tanka};" - - statements = SQLParserMyBatis.parse_statements(test_sql) - for statement in statements: - if isinstance(statement, ASTSingleSelectStatement): - print(statement) - print(statement.source(SQLType.MYSQL)) - print(CurrentUsedQuoteColumn.handle(statement)) - print(GetAllMybatisParams().handle(statement)) - print(GetMybatisParamInWhereClause().handle(statement)) - print(GetMybatisParamInGroupByClause().handle(statement)) - - - test_main() diff --git a/demo/README.md b/scripts/demo_sql/README.md similarity index 100% rename from demo/README.md rename to scripts/demo_sql/README.md diff --git a/demo/dolphinscheduler_mysql.sql b/scripts/demo_sql/dolphinscheduler_mysql.sql similarity index 100% rename from demo/dolphinscheduler_mysql.sql rename to scripts/demo_sql/dolphinscheduler_mysql.sql diff --git a/scripts/tests/test_dolphinscheduler_mysql.py b/scripts/tests/test_dolphinscheduler_mysql.py index 1b35598..2e08bd8 100644 --- a/scripts/tests/test_dolphinscheduler_mysql.py +++ b/scripts/tests/test_dolphinscheduler_mysql.py @@ -13,7 +13,7 @@ class TestDolphinSchedulerMysql(unittest.TestCase): def test_parse_statements(self): project_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - demo_path = os.path.join(project_path, "demo", "dolphinscheduler_mysql.sql") + demo_path = os.path.join(project_path, "scripts", "demo_sql", "dolphinscheduler_mysql.sql") with open(demo_path, encoding="UTF-8") as file: for statement in core.SQLParser.parse_statements(file.read()): statement.source(core.SQLType.MYSQL)