Skip to content

Commit

Permalink
refactor(api): consolidate allowed extensions handling
Browse files Browse the repository at this point in the history
- Unified allowed extensions into a single `DOCUMENT_EXTENSIONS` reference
- Adjusted checks and imports in controllers and services to use the new constant
- Enhanced text extraction to support additional file types (EPUB, EML, MSG)
  • Loading branch information
laipz8200 committed Sep 29, 2024
1 parent d954e0e commit b680425
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 15 deletions.
12 changes: 6 additions & 6 deletions api/constants/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from configs import dify_config

HIDDEN_VALUE = "[__HIDDEN__]"
UUID_NIL = "00000000-0000-0000-0000-000000000000"

Expand All @@ -13,9 +15,7 @@
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])

ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
ALLOWED_EXTENSIONS.extend([ext.upper() for ext in ALLOWED_EXTENSIONS])

UNSTRUCTURED_ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend([ext.upper() for ext in UNSTRUCTURED_ALLOWED_EXTENSIONS])
if dify_config.ETL_TYPE == "Unstructured":
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
6 changes: 2 additions & 4 deletions api/controllers/console/datasets/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import services
from configs import dify_config
from constants import ALLOWED_EXTENSIONS, UNSTRUCTURED_ALLOWED_EXTENSIONS
from constants import DOCUMENT_EXTENSIONS
from controllers.console import api
from controllers.console.datasets.error import (
FileTooLargeError,
Expand Down Expand Up @@ -79,9 +79,7 @@ class FileSupportTypeApi(Resource):
@login_required
@account_initialization_required
def get(self):
etl_type = dify_config.ETL_TYPE
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
return {"allowed_extensions": allowed_extensions}
return {"allowed_extensions": DOCUMENT_EXTENSIONS}


class RemoteFileInfoApi(Resource):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import docx
import pandas as pd
import pypdfium2
from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub
from unstructured.partition.msg import partition_msg
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx

Expand Down Expand Up @@ -96,6 +99,12 @@ def _extract_text(*, file_content: bytes, mime_type: str) -> str:
return _extract_text_from_ppt(file_content)
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
return _extract_text_from_pptx(file_content)
elif mime_type == "application/epub+zip":
return _extract_text_from_epub(file_content)
elif mime_type == "message/rfc822":
return _extract_text_from_eml(file_content)
elif mime_type == "application/vnd.ms-outlook":
return _extract_text_from_msg(file_content)
else:
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")

Expand Down Expand Up @@ -210,3 +219,30 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
return "\n".join([getattr(element, "text", "") for element in elements])
except Exception as e:
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e


def _extract_text_from_epub(file_content: bytes) -> str:
    """Return the text of an EPUB document supplied as raw bytes.

    The bytes are wrapped in an in-memory stream and handed to
    ``partition_epub``; the string form of every returned element is
    joined with newlines.

    Raises:
        TextExtractionError: if anything fails while partitioning or
            joining; the original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as stream:
            parts = partition_epub(file=stream)
        # The stream is no longer needed once the elements are built.
        return "\n".join(map(str, parts))
    except Exception as exc:
        raise TextExtractionError(f"Failed to extract text from EPUB: {exc!s}") from exc


def _extract_text_from_eml(file_content: bytes) -> str:
    """Return the text of an EML message supplied as raw bytes.

    The bytes are wrapped in an in-memory stream and handed to
    ``partition_email``; the string form of every returned element is
    joined with newlines.

    Raises:
        TextExtractionError: if anything fails while partitioning or
            joining; the original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as stream:
            parts = partition_email(file=stream)
        # The stream is no longer needed once the elements are built.
        return "\n".join(map(str, parts))
    except Exception as exc:
        raise TextExtractionError(f"Failed to extract text from EML: {exc!s}") from exc


def _extract_text_from_msg(file_content: bytes) -> str:
    """Return the text of an MSG message supplied as raw bytes.

    The bytes are wrapped in an in-memory stream and handed to
    ``partition_msg``; the string form of every returned element is
    joined with newlines.

    Raises:
        TextExtractionError: if anything fails while partitioning or
            joining; the original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as stream:
            parts = partition_msg(file=stream)
        # The stream is no longer needed once the elements are built.
        return "\n".join(map(str, parts))
    except Exception as exc:
        raise TextExtractionError(f"Failed to extract text from MSG: {exc!s}") from exc
7 changes: 2 additions & 5 deletions api/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,9 @@

from configs import dify_config
from constants import (
ALLOWED_EXTENSIONS,
AUDIO_EXTENSIONS,
DOCUMENT_EXTENSIONS,
IMAGE_EXTENSIONS,
UNSTRUCTURED_ALLOWED_EXTENSIONS,
VIDEO_EXTENSIONS,
)
from core.file import helpers as file_helpers
Expand Down Expand Up @@ -147,9 +146,7 @@ def get_file_preview(file_id: str) -> str:

# extract text from file
extension = upload_file.extension
etl_type = dify_config.ETL_TYPE
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
if extension.lower() not in allowed_extensions:
if extension.lower() not in DOCUMENT_EXTENSIONS:
raise UnsupportedFileTypeError()

text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)
Expand Down

0 comments on commit b680425

Please sign in to comment.