Feat/add unstructured support (#1780)

Co-authored-by: jyong <[email protected]>
langgenius · Dec 18, 2023 · 5e34f93 · 5e34f93
1 parent 2fd56cb
commit 5e34f93
Show file tree

Hide file tree

Showing 15 changed files with 361 additions and 14 deletions.
diff --git a/api/.env.example b/api/.env.example
@@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
 HOSTED_ANTHROPIC_API_KEY=
 HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
 HOSTED_ANTHROPIC_PAID_ENABLED=false
+
+ETL_TYPE=dify
+UNSTRUCTURED_API_URL=
diff --git a/api/config.py b/api/config.py
@@ -54,7 +54,8 @@
     'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
     'OUTPUT_MODERATION_BUFFER_SIZE': 300,
     'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
-    'INVITE_EXPIRY_HOURS': 72
+    'INVITE_EXPIRY_HOURS': 72,
+    'ETL_TYPE': 'dify',
 }
 
 
@@ -276,6 +277,9 @@ def __init__(self):
         self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
         self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')
 
+        self.ETL_TYPE = get_env('ETL_TYPE')
+        self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
+
 
 class CloudEditionConfig(Config):
 

diff --git a/api/controllers/console/datasets/file.py b/api/controllers/console/datasets/file.py
@@ -69,5 +69,20 @@ def get(self, file_id):
         return {'content': text}
 
 
+class FileeSupportTypApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self):
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        return {'allowed_extensions': allowed_extensions}
+
+
 api.add_resource(FileApi, '/files/upload')
 api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
+api.add_resource(FileeSupportTypApi, '/files/support-type')
diff --git a/api/core/data_loader/file_extractor.py b/api/core/data_loader/file_extractor.py
@@ -3,14 +3,22 @@
 from typing import List, Union, Optional
 
 import requests
-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
+from flask import current_app
+from langchain.document_loaders import TextLoader, Docx2txtLoader
 from langchain.schema import Document
 
 from core.data_loader.loader.csv_loader import CSVLoader
 from core.data_loader.loader.excel import ExcelLoader
 from core.data_loader.loader.html import HTMLLoader
 from core.data_loader.loader.markdown import MarkdownLoader
 from core.data_loader.loader.pdf import PdfLoader
+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
 from extensions.ext_storage import storage
 from models.model import UploadFile
 
@@ -49,14 +57,34 @@ def load_from_file(cls, file_path: str, return_text: bool = False,
         input_file = Path(file_path)
         delimiter = '\n'
         file_extension = input_file.suffix.lower()
-        if is_automatic:
-            loader = UnstructuredFileLoader(
-                file_path, strategy="hi_res", mode="elements"
-            )
-            # loader = UnstructuredAPIFileLoader(
-            #     file_path=filenames[0],
-            #     api_key="FAKE_API_KEY",
-            # )
+        etl_type = current_app.config['ETL_TYPE']
+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
+        if etl_type == 'Unstructured':
+            if file_extension == '.xlsx':
+                loader = ExcelLoader(file_path)
+            elif file_extension == '.pdf':
+                loader = PdfLoader(file_path, upload_file=upload_file)
+            elif file_extension in ['.md', '.markdown']:
+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+            elif file_extension in ['.htm', '.html']:
+                loader = HTMLLoader(file_path)
+            elif file_extension == '.docx':
+                loader = Docx2txtLoader(file_path)
+            elif file_extension == '.csv':
+                loader = CSVLoader(file_path, autodetect_encoding=True)
+            elif file_extension == '.msg':
+                loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
+            elif file_extension == '.eml':
+                loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
+            elif file_extension == '.ppt':
+                loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
+            elif file_extension == '.pptx':
+                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
+            elif file_extension == '.xml':
+                loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
+            else:
+                # txt
+                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
         else:
             if file_extension == '.xlsx':
                 loader = ExcelLoader(file_path)

diff --git a/api/core/data_loader/loader/unstructured/unstructured_eml.py b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@@ -0,0 +1,41 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredEmailLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.email import partition_email
+
+        elements = partition_email(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
diff --git a/api/core/data_loader/loader/unstructured/unstructured_markdown.py b/api/core/data_loader/loader/unstructured/unstructured_markdown.py
@@ -0,0 +1,48 @@
+import logging
+from typing import List
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredMarkdownLoader(BaseLoader):
+    """Load md files.
+
+
+    Args:
+        file_path: Path to the file to load.
+
+        remove_hyperlinks: Whether to remove hyperlinks from the text.
+
+        remove_images: Whether to remove images from the text.
+
+        encoding: File encoding to use. If `None`, the file will be loaded
+        with the default system encoding.
+
+        autodetect_encoding: Whether to try to autodetect the file encoding
+            if the specified encoding fails.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.md import partition_md
+
+        elements = partition_md(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
diff --git a/api/core/data_loader/loader/unstructured/unstructured_msg.py b/api/core/data_loader/loader/unstructured/unstructured_msg.py
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredMsgLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.msg import partition_msg
+
+        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
diff --git a/api/core/data_loader/loader/unstructured/unstructured_ppt.py b/api/core/data_loader/loader/unstructured/unstructured_ppt.py
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPPTLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.ppt import partition_ppt
+
+        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
diff --git a/api/core/data_loader/loader/unstructured/unstructured_pptx.py b/api/core/data_loader/loader/unstructured/unstructured_pptx.py
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPPTXLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.pptx import partition_pptx
+
+        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
diff --git a/api/core/data_loader/loader/unstructured/unstructured_text.py b/api/core/data_loader/loader/unstructured/unstructured_text.py
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredTextLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.text import partition_text
+
+        elements = partition_text(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents