Skip to content

Commit

Permalink
Feat/add unstructured support (#1780)
Browse files Browse the repository at this point in the history
Co-authored-by: jyong <[email protected]>
  • Loading branch information
JohnJyong and JohnJyong authored Dec 18, 2023
1 parent 2fd56cb commit 5e34f93
Show file tree
Hide file tree
Showing 15 changed files with 361 additions and 14 deletions.
3 changes: 3 additions & 0 deletions api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
HOSTED_ANTHROPIC_API_KEY=
HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
HOSTED_ANTHROPIC_PAID_ENABLED=false

ETL_TYPE=dify
UNSTRUCTURED_API_URL=
6 changes: 5 additions & 1 deletion api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@
'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
'OUTPUT_MODERATION_BUFFER_SIZE': 300,
'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
'INVITE_EXPIRY_HOURS': 72
'INVITE_EXPIRY_HOURS': 72,
'ETL_TYPE': 'dify',
}


Expand Down Expand Up @@ -276,6 +277,9 @@ def __init__(self):
self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')

self.ETL_TYPE = get_env('ETL_TYPE')
self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')


class CloudEditionConfig(Config):

Expand Down
15 changes: 15 additions & 0 deletions api/controllers/console/datasets/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,20 @@ def get(self, file_id):
return {'content': text}


class FileeSupportTypApi(Resource):
    """Resource that reports which upload file extensions are accepted."""

    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        """Return the allowed file extensions for the configured ETL type.

        Returns:
            A dict with a single ``allowed_extensions`` key holding the list
            of extensions (without a leading dot). When the ``ETL_TYPE``
            config value is ``'Unstructured'`` the list additionally contains
            the email, PowerPoint and XML extensions.
        """
        # Extensions accepted regardless of the configured ETL type.
        extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
        if current_app.config['ETL_TYPE'] == 'Unstructured':
            # Extra formats only offered when ETL_TYPE is 'Unstructured'.
            extensions += ['eml', 'msg', 'pptx', 'ppt', 'xml']
        return {'allowed_extensions': extensions}


# Route registration on `api`: file upload, preview of a file addressed by
# its UUID, and the listing of supported upload file types.
api.add_resource(FileApi, '/files/upload')
api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
api.add_resource(FileeSupportTypApi, '/files/support-type')
46 changes: 37 additions & 9 deletions api/core/data_loader/file_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,22 @@
from typing import List, Union, Optional

import requests
from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
from flask import current_app
from langchain.document_loaders import TextLoader, Docx2txtLoader
from langchain.schema import Document

from core.data_loader.loader.csv_loader import CSVLoader
from core.data_loader.loader.excel import ExcelLoader
from core.data_loader.loader.html import HTMLLoader
from core.data_loader.loader.markdown import MarkdownLoader
from core.data_loader.loader.pdf import PdfLoader
from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
from extensions.ext_storage import storage
from models.model import UploadFile

Expand Down Expand Up @@ -49,14 +57,34 @@ def load_from_file(cls, file_path: str, return_text: bool = False,
input_file = Path(file_path)
delimiter = '\n'
file_extension = input_file.suffix.lower()
if is_automatic:
loader = UnstructuredFileLoader(
file_path, strategy="hi_res", mode="elements"
)
# loader = UnstructuredAPIFileLoader(
# file_path=filenames[0],
# api_key="FAKE_API_KEY",
# )
etl_type = current_app.config['ETL_TYPE']
unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
if etl_type == 'Unstructured':
if file_extension == '.xlsx':
loader = ExcelLoader(file_path)
elif file_extension == '.pdf':
loader = PdfLoader(file_path, upload_file=upload_file)
elif file_extension in ['.md', '.markdown']:
loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
elif file_extension in ['.htm', '.html']:
loader = HTMLLoader(file_path)
elif file_extension == '.docx':
loader = Docx2txtLoader(file_path)
elif file_extension == '.csv':
loader = CSVLoader(file_path, autodetect_encoding=True)
elif file_extension == '.msg':
loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
elif file_extension == '.eml':
loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
elif file_extension == '.ppt':
loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
elif file_extension == '.pptx':
loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
elif file_extension == '.xml':
loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
else:
# txt
loader = UnstructuredTextLoader(file_path, unstructured_api_url)
else:
if file_extension == '.xlsx':
loader = ExcelLoader(file_path)
Expand Down
41 changes: 41 additions & 0 deletions api/core/data_loader/loader/unstructured/unstructured_eml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import logging
import re
from typing import Optional, List, Tuple, cast

from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredEmailLoader(BaseLoader):
    """Load eml (email) files.

    Args:
        file_path: Path to the file to load.
        api_url: URL forwarded as ``api_url`` to the partition call.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Initialize with file path and API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the email, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Imported at call time, so `unstructured` is only required when
        # load() actually runs.
        from unstructured.chunking.title import chunk_by_title
        from unstructured.partition.email import partition_email

        # NOTE(review): api_url is passed straight through as a keyword
        # argument; confirm partition_email actually uses it rather than
        # ignoring it - TODO verify against the installed unstructured version.
        elements = partition_email(filename=self._file_path, api_url=self._api_url)
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
48 changes: 48 additions & 0 deletions api/core/data_loader/loader/unstructured/unstructured_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredMarkdownLoader(BaseLoader):
    """Load md (Markdown) files.

    Args:
        file_path: Path to the file to load.
        api_url: URL forwarded as ``api_url`` to the partition call.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Initialize with file path and API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the Markdown file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Imported at call time, so `unstructured` is only required when
        # load() actually runs.
        from unstructured.chunking.title import chunk_by_title
        from unstructured.partition.md import partition_md

        # NOTE(review): api_url is passed straight through as a keyword
        # argument; confirm partition_md actually uses it rather than
        # ignoring it - TODO verify against the installed unstructured version.
        elements = partition_md(filename=self._file_path, api_url=self._api_url)
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
40 changes: 40 additions & 0 deletions api/core/data_loader/loader/unstructured/unstructured_msg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast

from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredMsgLoader(BaseLoader):
    """Load msg files.

    Args:
        file_path: Path to the file to load.
        api_url: URL forwarded as ``api_url`` to the partition call.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the msg file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Imported at call time, so `unstructured` is only required when
        # load() actually runs.
        from unstructured.partition.msg import partition_msg

        parsed = partition_msg(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        pieces = chunk_by_title(
            parsed,
            max_characters=2000,
            combine_text_under_n_chars=0,
        )
        return [Document(page_content=piece.text.strip()) for piece in pieces]
40 changes: 40 additions & 0 deletions api/core/data_loader/loader/unstructured/unstructured_ppt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast

from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredPPTLoader(BaseLoader):
    """Load ppt (PowerPoint) files.

    Args:
        file_path: Path to the file to load.
        api_url: URL forwarded as ``api_url`` to the partition call.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path and API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the ppt file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Imported at call time, so `unstructured` is only required when
        # load() actually runs.
        from unstructured.chunking.title import chunk_by_title
        from unstructured.partition.ppt import partition_ppt

        # NOTE(review): api_url is passed straight through as a keyword
        # argument; confirm partition_ppt actually uses it rather than
        # ignoring it - TODO verify against the installed unstructured version.
        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
40 changes: 40 additions & 0 deletions api/core/data_loader/loader/unstructured/unstructured_pptx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast

from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredPPTXLoader(BaseLoader):
    """Load pptx (PowerPoint) files.

    Args:
        file_path: Path to the file to load.
        api_url: URL forwarded as ``api_url`` to the partition call.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path and API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the pptx file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Imported at call time, so `unstructured` is only required when
        # load() actually runs.
        from unstructured.chunking.title import chunk_by_title
        from unstructured.partition.pptx import partition_pptx

        # NOTE(review): api_url is passed straight through as a keyword
        # argument; confirm partition_pptx actually uses it rather than
        # ignoring it - TODO verify against the installed unstructured version.
        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
40 changes: 40 additions & 0 deletions api/core/data_loader/loader/unstructured/unstructured_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging
import re
from typing import Optional, List, Tuple, cast

from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredTextLoader(BaseLoader):
    """Load plain-text files.

    Args:
        file_path: Path to the file to load.
        api_url: URL forwarded as ``api_url`` to the partition call.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path and API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the text file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Imported at call time, so `unstructured` is only required when
        # load() actually runs.
        from unstructured.chunking.title import chunk_by_title
        from unstructured.partition.text import partition_text

        # NOTE(review): api_url is passed straight through as a keyword
        # argument; confirm partition_text actually uses it rather than
        # ignoring it - TODO verify against the installed unstructured version.
        elements = partition_text(filename=self._file_path, api_url=self._api_url)
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
Loading

0 comments on commit 5e34f93

Please sign in to comment.