From 5e34f938c18ef6b41da4bd2634274defa3523f6d Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Mon, 18 Dec 2023 23:24:06 +0800 Subject: [PATCH] Feat/add unstructured support (#1780) Co-authored-by: jyong --- api/.env.example | 3 ++ api/config.py | 6 ++- api/controllers/console/datasets/file.py | 15 ++++++ api/core/data_loader/file_extractor.py | 46 ++++++++++++++---- .../loader/unstructured/unstructured_eml.py | 41 ++++++++++++++++ .../unstructured/unstructured_markdown.py | 48 +++++++++++++++++++ .../loader/unstructured/unstructured_msg.py | 40 ++++++++++++++++ .../loader/unstructured/unstructured_ppt.py | 40 ++++++++++++++++ .../loader/unstructured/unstructured_pptx.py | 40 ++++++++++++++++ .../loader/unstructured/unstructured_text.py | 40 ++++++++++++++++ .../loader/unstructured/unstructured_xml.py | 40 ++++++++++++++++ api/core/indexing_runner.py | 2 +- api/models/dataset.py | 2 +- api/requirements.txt | 4 +- api/services/file_service.py | 8 +++- 15 files changed, 361 insertions(+), 14 deletions(-) create mode 100644 api/core/data_loader/loader/unstructured/unstructured_eml.py create mode 100644 api/core/data_loader/loader/unstructured/unstructured_markdown.py create mode 100644 api/core/data_loader/loader/unstructured/unstructured_msg.py create mode 100644 api/core/data_loader/loader/unstructured/unstructured_ppt.py create mode 100644 api/core/data_loader/loader/unstructured/unstructured_pptx.py create mode 100644 api/core/data_loader/loader/unstructured/unstructured_text.py create mode 100644 api/core/data_loader/loader/unstructured/unstructured_xml.py diff --git a/api/.env.example b/api/.env.example index 08c5ecc0df7f50..5681c7f800bfb8 100644 --- a/api/.env.example +++ b/api/.env.example @@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE= HOSTED_ANTHROPIC_API_KEY= HOSTED_ANTHROPIC_QUOTA_LIMIT=600000 HOSTED_ANTHROPIC_PAID_ENABLED=false + +ETL_TYPE=dify +UNSTRUCTURED_API_URL= \ No newline at end of file diff --git 
a/api/config.py b/api/config.py index e07af3ef21e1ec..f2d83e72628bb4 100644 --- a/api/config.py +++ b/api/config.py @@ -54,7 +54,8 @@ 'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10, 'OUTPUT_MODERATION_BUFFER_SIZE': 300, 'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64', - 'INVITE_EXPIRY_HOURS': 72 + 'INVITE_EXPIRY_HOURS': 72, + 'ETL_TYPE': 'dify', } @@ -276,6 +277,9 @@ def __init__(self): self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED') self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS') + self.ETL_TYPE = get_env('ETL_TYPE') + self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL') + class CloudEditionConfig(Config): diff --git a/api/controllers/console/datasets/file.py b/api/controllers/console/datasets/file.py index c6b73913755c34..e872269db3ef9b 100644 --- a/api/controllers/console/datasets/file.py +++ b/api/controllers/console/datasets/file.py @@ -69,5 +69,20 @@ def get(self, file_id): return {'content': text} +class FileeSupportTypApi(Resource): + @setup_required + @login_required + @account_initialization_required + def get(self): + etl_type = current_app.config['ETL_TYPE'] + if etl_type == 'Unstructured': + allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', + 'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml'] + else: + allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv'] + return {'allowed_extensions': allowed_extensions} + + api.add_resource(FileApi, '/files/upload') api.add_resource(FilePreviewApi, '/files//preview') +api.add_resource(FileeSupportTypApi, '/files/support-type') diff --git a/api/core/data_loader/file_extractor.py b/api/core/data_loader/file_extractor.py index 40f0c1f20155e5..aab35b3a77254e 100644 --- a/api/core/data_loader/file_extractor.py +++ b/api/core/data_loader/file_extractor.py @@ -3,7 +3,8 @@ from typing import List, Union, Optional import requests -from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, 
UnstructuredAPIFileLoader +from flask import current_app +from langchain.document_loaders import TextLoader, Docx2txtLoader from langchain.schema import Document from core.data_loader.loader.csv_loader import CSVLoader @@ -11,6 +12,13 @@ from core.data_loader.loader.html import HTMLLoader from core.data_loader.loader.markdown import MarkdownLoader from core.data_loader.loader.pdf import PdfLoader +from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader +from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader +from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader +from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader +from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader +from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader +from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader from extensions.ext_storage import storage from models.model import UploadFile @@ -49,14 +57,34 @@ def load_from_file(cls, file_path: str, return_text: bool = False, input_file = Path(file_path) delimiter = '\n' file_extension = input_file.suffix.lower() - if is_automatic: - loader = UnstructuredFileLoader( - file_path, strategy="hi_res", mode="elements" - ) - # loader = UnstructuredAPIFileLoader( - # file_path=filenames[0], - # api_key="FAKE_API_KEY", - # ) + etl_type = current_app.config['ETL_TYPE'] + unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL'] + if etl_type == 'Unstructured': + if file_extension == '.xlsx': + loader = ExcelLoader(file_path) + elif file_extension == '.pdf': + loader = PdfLoader(file_path, upload_file=upload_file) + elif file_extension in ['.md', '.markdown']: + loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) + elif file_extension in ['.htm', '.html']: + loader = HTMLLoader(file_path) + 
elif file_extension == '.docx': + loader = Docx2txtLoader(file_path) + elif file_extension == '.csv': + loader = CSVLoader(file_path, autodetect_encoding=True) + elif file_extension == '.msg': + loader = UnstructuredMsgLoader(file_path, unstructured_api_url) + elif file_extension == '.eml': + loader = UnstructuredEmailLoader(file_path, unstructured_api_url) + elif file_extension == '.ppt': + loader = UnstructuredPPTLoader(file_path, unstructured_api_url) + elif file_extension == '.pptx': + loader = UnstructuredPPTXLoader(file_path, unstructured_api_url) + elif file_extension == '.xml': + loader = UnstructuredXmlLoader(file_path, unstructured_api_url) + else: + # txt + loader = UnstructuredTextLoader(file_path, unstructured_api_url) else: if file_extension == '.xlsx': loader = ExcelLoader(file_path) diff --git a/api/core/data_loader/loader/unstructured/unstructured_eml.py b/api/core/data_loader/loader/unstructured/unstructured_eml.py new file mode 100644 index 00000000000000..f7a67be42130cd --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py @@ -0,0 +1,41 @@ +import logging +import re +from typing import Optional, List, Tuple, cast + +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredEmailLoader(BaseLoader): + """Load eml files. + + + Args: + file_path: Path to the file to load. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str, + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + + def load(self) -> List[Document]: + from unstructured.partition.email import partition_email + + elements = partition_email(filename=self._file_path, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_markdown.py b/api/core/data_loader/loader/unstructured/unstructured_markdown.py new file mode 100644 index 00000000000000..cf6e7c9c8a7cfa --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_markdown.py @@ -0,0 +1,48 @@ +import logging +from typing import List + +from langchain.document_loaders.base import BaseLoader +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredMarkdownLoader(BaseLoader): + """Load md files. + + + Args: + file_path: Path to the file to load. + + remove_hyperlinks: Whether to remove hyperlinks from the text. + + remove_images: Whether to remove images from the text. + + encoding: File encoding to use. If `None`, the file will be loaded + with the default system encoding. + + autodetect_encoding: Whether to try to autodetect the file encoding + if the specified encoding fails. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str, + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + def load(self) -> List[Document]: + from unstructured.partition.md import partition_md + + elements = partition_md(filename=self._file_path, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_msg.py b/api/core/data_loader/loader/unstructured/unstructured_msg.py new file mode 100644 index 00000000000000..1e18dbcdf5f4e1 --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_msg.py @@ -0,0 +1,40 @@ +import logging +import re +from typing import Optional, List, Tuple, cast + +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredMsgLoader(BaseLoader): + """Load msg files. + + + Args: + file_path: Path to the file to load. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + def load(self) -> List[Document]: + from unstructured.partition.msg import partition_msg + + elements = partition_msg(filename=self._file_path, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_ppt.py b/api/core/data_loader/loader/unstructured/unstructured_ppt.py new file mode 100644 index 00000000000000..91750ea71164b7 --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_ppt.py @@ -0,0 +1,40 @@ +import logging +import re +from typing import Optional, List, Tuple, cast + +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredPPTLoader(BaseLoader): + """Load ppt files. + + + Args: + file_path: Path to the file to load. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + def load(self) -> List[Document]: + from unstructured.partition.ppt import partition_ppt + + elements = partition_ppt(filename=self._file_path, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_pptx.py b/api/core/data_loader/loader/unstructured/unstructured_pptx.py new file mode 100644 index 00000000000000..5e300070208743 --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_pptx.py @@ -0,0 +1,40 @@ +import logging +import re +from typing import Optional, List, Tuple, cast + +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredPPTXLoader(BaseLoader): + """Load pptx files. + + + Args: + file_path: Path to the file to load. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + def load(self) -> List[Document]: + from unstructured.partition.pptx import partition_pptx + + elements = partition_pptx(filename=self._file_path, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_text.py b/api/core/data_loader/loader/unstructured/unstructured_text.py new file mode 100644 index 00000000000000..f552f8bc86c204 --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_text.py @@ -0,0 +1,40 @@ +import logging +import re +from typing import Optional, List, Tuple, cast + +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredTextLoader(BaseLoader): + """Load text files. + + + Args: + file_path: Path to the file to load. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + def load(self) -> List[Document]: + from unstructured.partition.text import partition_text + + elements = partition_text(filename=self._file_path, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_xml.py b/api/core/data_loader/loader/unstructured/unstructured_xml.py new file mode 100644 index 00000000000000..8c09512fb91cbe --- /dev/null +++ b/api/core/data_loader/loader/unstructured/unstructured_xml.py @@ -0,0 +1,40 @@ +import logging +import re +from typing import Optional, List, Tuple, cast + +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings +from langchain.schema import Document + +logger = logging.getLogger(__name__) + + +class UnstructuredXmlLoader(BaseLoader): + """Load xml files. + + + Args: + file_path: Path to the file to load. 
+ """ + + def __init__( + self, + file_path: str, + api_url: str + ): + """Initialize with file path.""" + self._file_path = file_path + self._api_url = api_url + + def load(self) -> List[Document]: + from unstructured.partition.xml import partition_xml + + elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url) + from unstructured.chunking.title import chunk_by_title + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + documents = [] + for chunk in chunks: + text = chunk.text.strip() + documents.append(Document(page_content=text)) + + return documents diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 2baecedfd7f431..0da5f297bb301e 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -397,7 +397,7 @@ def _load_data(self, dataset_document: DatasetDocument, automatic: bool = False) one_or_none() if file_detail: - text_docs = FileExtractor.load(file_detail, is_automatic=False) + text_docs = FileExtractor.load(file_detail, is_automatic=True) elif dataset_document.data_source_type == 'notion_import': loader = NotionLoader.from_document(dataset_document) text_docs = loader.load() diff --git a/api/models/dataset.py b/api/models/dataset.py index a40af353453ab6..908bcda158b3e7 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model): ], 'segmentation': { 'delimiter': '\n', - 'max_tokens': 512 + 'max_tokens': 1000 } } diff --git a/api/requirements.txt b/api/requirements.txt index 4224ca6c5cd43b..b56c4c6c73caa0 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -53,4 +53,6 @@ zhipuai==1.0.7 werkzeug==2.3.7 pymilvus==2.3.0 qdrant-client==1.6.4 -cohere~=4.32 \ No newline at end of file +cohere~=4.32 +unstructured~=0.10.27 +unstructured[docx,pptx]~=0.10.27 \ No newline at end of file diff --git a/api/services/file_service.py b/api/services/file_service.py index 
78a35a928649d9..af665cb6529419 100644 --- a/api/services/file_service.py +++ b/api/services/file_service.py @@ -27,7 +27,13 @@ class FileService: @staticmethod def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile: extension = file.filename.split('.')[-1] - if extension.lower() not in ALLOWED_EXTENSIONS: + etl_type = current_app.config['ETL_TYPE'] + if etl_type == 'Unstructured': + allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', + 'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml'] + else: + allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv'] + if extension.lower() not in allowed_extensions: raise UnsupportedFileTypeError() elif only_image and extension.lower() not in IMAGE_EXTENSIONS: raise UnsupportedFileTypeError()