-
Notifications
You must be signed in to change notification settings - Fork 8.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feat/add unstructured support (#1780)
Co-authored-by: jyong <[email protected]>
- Loading branch information
Showing
15 changed files
with
361 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
api/core/data_loader/loader/unstructured/unstructured_eml.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import logging | ||
import re | ||
from typing import Optional, List, Tuple, cast | ||
|
||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.document_loaders.helpers import detect_file_encodings | ||
from langchain.schema import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UnstructuredEmailLoader(BaseLoader):
    """Load eml (email) files with the ``unstructured`` library.

    Args:
        file_path: Path to the file to load.
        api_url: Value forwarded to ``partition_email`` as its ``api_url``
            keyword argument.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file, chunk the result by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk; ``page_content`` is the chunk text
            with leading and trailing whitespace stripped.
        """
        # The unstructured imports stay inside the method, so the package is
        # only required when load() is actually called.
        from unstructured.partition.email import partition_email

        # NOTE(review): confirm that partition_email honors ``api_url``; it is
        # passed through here exactly as given to the constructor.
        parts = partition_email(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        sections = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=section.text.strip()) for section in sections]
48 changes: 48 additions & 0 deletions
48
api/core/data_loader/loader/unstructured/unstructured_markdown.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import logging | ||
from typing import List | ||
|
||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.schema import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UnstructuredMarkdownLoader(BaseLoader):
    """Load md (Markdown) files with the ``unstructured`` library.

    Args:
        file_path: Path to the file to load.
        api_url: Value forwarded to ``partition_md`` as its ``api_url``
            keyword argument.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file, chunk the result by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk; ``page_content`` is the chunk text
            with leading and trailing whitespace stripped.
        """
        # The unstructured imports stay inside the method, so the package is
        # only required when load() is actually called.
        from unstructured.partition.md import partition_md

        # NOTE(review): confirm that partition_md honors ``api_url``; it is
        # passed through here exactly as given to the constructor.
        parts = partition_md(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        sections = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=section.text.strip()) for section in sections]
40 changes: 40 additions & 0 deletions
40
api/core/data_loader/loader/unstructured/unstructured_msg.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import logging | ||
import re | ||
from typing import Optional, List, Tuple, cast | ||
|
||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.document_loaders.helpers import detect_file_encodings | ||
from langchain.schema import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UnstructuredMsgLoader(BaseLoader):
    """Load msg files with the ``unstructured`` library.

    Args:
        file_path: Path to the file to load.
        api_url: Value forwarded to ``partition_msg`` as its ``api_url``
            keyword argument.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file, chunk the result by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk; ``page_content`` is the chunk text
            with leading and trailing whitespace stripped.
        """
        # The unstructured imports stay inside the method, so the package is
        # only required when load() is actually called.
        from unstructured.partition.msg import partition_msg

        # NOTE(review): confirm that partition_msg honors ``api_url``; it is
        # passed through here exactly as given to the constructor.
        parts = partition_msg(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        sections = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=section.text.strip()) for section in sections]
40 changes: 40 additions & 0 deletions
40
api/core/data_loader/loader/unstructured/unstructured_ppt.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import logging | ||
import re | ||
from typing import Optional, List, Tuple, cast | ||
|
||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.document_loaders.helpers import detect_file_encodings | ||
from langchain.schema import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UnstructuredPPTLoader(BaseLoader):
    """Load ppt files with the ``unstructured`` library.

    Args:
        file_path: Path to the file to load.
        api_url: Value forwarded to ``partition_ppt`` as its ``api_url``
            keyword argument.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file, chunk the result by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk; ``page_content`` is the chunk text
            with leading and trailing whitespace stripped.
        """
        # The unstructured imports stay inside the method, so the package is
        # only required when load() is actually called.
        from unstructured.partition.ppt import partition_ppt

        # NOTE(review): confirm that partition_ppt honors ``api_url``; it is
        # passed through here exactly as given to the constructor.
        parts = partition_ppt(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        sections = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=section.text.strip()) for section in sections]
40 changes: 40 additions & 0 deletions
40
api/core/data_loader/loader/unstructured/unstructured_pptx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import logging | ||
import re | ||
from typing import Optional, List, Tuple, cast | ||
|
||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.document_loaders.helpers import detect_file_encodings | ||
from langchain.schema import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UnstructuredPPTXLoader(BaseLoader):
    """Load pptx files with the ``unstructured`` library.

    Args:
        file_path: Path to the file to load.
        api_url: Value forwarded to ``partition_pptx`` as its ``api_url``
            keyword argument.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file, chunk the result by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk; ``page_content`` is the chunk text
            with leading and trailing whitespace stripped.
        """
        # The unstructured imports stay inside the method, so the package is
        # only required when load() is actually called.
        from unstructured.partition.pptx import partition_pptx

        # NOTE(review): confirm that partition_pptx honors ``api_url``; it is
        # passed through here exactly as given to the constructor.
        parts = partition_pptx(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        sections = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=section.text.strip()) for section in sections]
40 changes: 40 additions & 0 deletions
40
api/core/data_loader/loader/unstructured/unstructured_text.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import logging | ||
import re | ||
from typing import Optional, List, Tuple, cast | ||
|
||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.document_loaders.helpers import detect_file_encodings | ||
from langchain.schema import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UnstructuredTextLoader(BaseLoader):
    """Load plain-text files with the ``unstructured`` library.

    Args:
        file_path: Path to the file to load.
        api_url: Value forwarded to ``partition_text`` as its ``api_url``
            keyword argument.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Remember the file path and API URL for a later ``load`` call."""
        self._api_url = api_url
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file, chunk the result by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk; ``page_content`` is the chunk text
            with leading and trailing whitespace stripped.
        """
        # The unstructured imports stay inside the method, so the package is
        # only required when load() is actually called.
        from unstructured.partition.text import partition_text

        # NOTE(review): confirm that partition_text honors ``api_url``; it is
        # passed through here exactly as given to the constructor.
        parts = partition_text(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title

        sections = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=section.text.strip()) for section in sections]
Oops, something went wrong.