From b6804252484f1c6ae115bb55aa4aaffbeff10aa4 Mon Sep 17 00:00:00 2001 From: -LAN- Date: Sun, 29 Sep 2024 18:15:22 +0800 Subject: [PATCH] refactor(api): consolidate allowed extensions handling - Unified allowed extensions into a single `DOCUMENT_EXTENSIONS` reference - Adjusted checks and imports in controllers and services to use the new constant - Enhanced text extraction to support additional file types (EPUB, EML, MSG) --- api/constants/__init__.py | 12 +++---- api/controllers/console/datasets/file.py | 6 ++-- .../document_extractor_node.py | 36 +++++++++++++++++++ api/services/file_service.py | 7 ++-- 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/api/constants/__init__.py b/api/constants/__init__.py index d6d260d97ebae0..71943e017ef583 100644 --- a/api/constants/__init__.py +++ b/api/constants/__init__.py @@ -1,3 +1,5 @@ +from configs import dify_config + HIDDEN_VALUE = "[__HIDDEN__]" UUID_NIL = "00000000-0000-0000-0000-000000000000" @@ -13,9 +15,7 @@ DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"] DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) -ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"] -ALLOWED_EXTENSIONS.extend([ext.upper() for ext in ALLOWED_EXTENSIONS]) - -UNSTRUCTURED_ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"] -UNSTRUCTURED_ALLOWED_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub")) -UNSTRUCTURED_ALLOWED_EXTENSIONS.extend([ext.upper() for ext in UNSTRUCTURED_ALLOWED_EXTENSIONS]) +if dify_config.ETL_TYPE == "Unstructured": + DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"] + DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub")) + DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) diff --git a/api/controllers/console/datasets/file.py b/api/controllers/console/datasets/file.py index daa0510d84c6c0..5ed9a615459e64 100644 --- a/api/controllers/console/datasets/file.py +++ b/api/controllers/console/datasets/file.py @@ -6,7 +6,7 @@ import services from configs import dify_config -from constants import ALLOWED_EXTENSIONS, UNSTRUCTURED_ALLOWED_EXTENSIONS +from constants import DOCUMENT_EXTENSIONS from controllers.console import api from controllers.console.datasets.error import ( FileTooLargeError, @@ -79,9 +79,7 @@ class FileSupportTypeApi(Resource): @login_required @account_initialization_required def get(self): - etl_type = dify_config.ETL_TYPE - allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS - return {"allowed_extensions": allowed_extensions} + return {"allowed_extensions": DOCUMENT_EXTENSIONS} class RemoteFileInfoApi(Resource): diff --git a/api/core/workflow/nodes/document_extractor/document_extractor_node.py b/api/core/workflow/nodes/document_extractor/document_extractor_node.py index 2bc49df3fec81a..7ed572c9ae1ef6 100644 --- a/api/core/workflow/nodes/document_extractor/document_extractor_node.py +++ b/api/core/workflow/nodes/document_extractor/document_extractor_node.py @@ -5,6 +5,9 @@ import docx import pandas as pd import pypdfium2 +from unstructured.partition.email import partition_email +from unstructured.partition.epub import partition_epub +from unstructured.partition.msg import partition_msg from unstructured.partition.ppt import partition_ppt from unstructured.partition.pptx import partition_pptx @@ -96,6 +99,12 @@ def _extract_text(*, file_content: bytes, mime_type: str) -> str: return _extract_text_from_ppt(file_content) elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation": return _extract_text_from_pptx(file_content) + elif mime_type == "application/epub+zip": + return _extract_text_from_epub(file_content) + elif mime_type == "message/rfc822": + return _extract_text_from_eml(file_content) + elif mime_type == "application/vnd.ms-outlook": + return _extract_text_from_msg(file_content) else: raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}") @@ -210,3 +219,30 @@ def _extract_text_from_pptx(file_content: bytes) -> str: return "\n".join([getattr(element, "text", "") for element in elements]) except Exception as e: raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e + + +def _extract_text_from_epub(file_content: bytes) -> str: + try: + with io.BytesIO(file_content) as file: + elements = partition_epub(file=file) + return "\n".join([str(element) for element in elements]) + except Exception as e: + raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e + + +def _extract_text_from_eml(file_content: bytes) -> str: + try: + with io.BytesIO(file_content) as file: + elements = partition_email(file=file) + return "\n".join([str(element) for element in elements]) + except Exception as e: + raise TextExtractionError(f"Failed to extract text from EML: {str(e)}") from e + + +def _extract_text_from_msg(file_content: bytes) -> str: + try: + with io.BytesIO(file_content) as file: + elements = partition_msg(file=file) + return "\n".join([str(element) for element in elements]) + except Exception as e: + raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e diff --git a/api/services/file_service.py b/api/services/file_service.py index aeea1b6f3f1041..8772c60aae86e5 100644 --- a/api/services/file_service.py +++ b/api/services/file_service.py @@ -10,10 +10,9 @@ from configs import dify_config from constants import ( - ALLOWED_EXTENSIONS, AUDIO_EXTENSIONS, + DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, - UNSTRUCTURED_ALLOWED_EXTENSIONS, VIDEO_EXTENSIONS, ) from core.file import helpers as file_helpers @@ -147,9 +146,7 @@ def get_file_preview(file_id: str) -> str: # extract text from file extension = upload_file.extension - etl_type = dify_config.ETL_TYPE - allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS - if extension.lower() not in allowed_extensions: + if extension.lower() not in DOCUMENT_EXTENSIONS: raise UnsupportedFileTypeError() text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)