diff --git a/api/core/data_loader/loader/unstructured/unstructured_ppt.py b/api/core/data_loader/loader/unstructured/unstructured_ppt.py index 91750ea71164b7..4560c262e95337 100644 --- a/api/core/data_loader/loader/unstructured/unstructured_ppt.py +++ b/api/core/data_loader/loader/unstructured/unstructured_ppt.py @@ -30,11 +30,18 @@ def load(self) -> List[Document]: from unstructured.partition.ppt import partition_ppt elements = partition_ppt(filename=self._file_path, api_url=self._api_url) - from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + text_by_page = {} + for element in elements: + page = element.metadata.page_number + text = element.text + if page in text_by_page: + text_by_page[page] += "\n" + text + else: + text_by_page[page] = text + + combined_texts = list(text_by_page.values()) documents = [] - for chunk in chunks: - text = chunk.text.strip() + for combined_text in combined_texts: + text = combined_text.strip() documents.append(Document(page_content=text)) - return documents diff --git a/api/core/data_loader/loader/unstructured/unstructured_pptx.py b/api/core/data_loader/loader/unstructured/unstructured_pptx.py index 5e300070208743..7bb3c3af7132de 100644 --- a/api/core/data_loader/loader/unstructured/unstructured_pptx.py +++ b/api/core/data_loader/loader/unstructured/unstructured_pptx.py @@ -30,11 +30,19 @@ def load(self) -> List[Document]: from unstructured.partition.pptx import partition_pptx elements = partition_pptx(filename=self._file_path, api_url=self._api_url) - from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + text_by_page = {} + for element in elements: + page = element.metadata.page_number + text = element.text + if page in text_by_page: + text_by_page[page] += "\n" + text + else: + text_by_page[page] = text + + combined_texts = list(text_by_page.values()) 
documents = [] - for chunk in chunks: - text = chunk.text.strip() + for combined_text in combined_texts: + text = combined_text.strip() documents.append(Document(page_content=text)) return documents diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 0da5f297bb301e..89fba99cab76b9 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -529,6 +529,13 @@ def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter, hash = helper.generate_text_hash(document_node.page_content) document_node.metadata['doc_id'] = doc_id document_node.metadata['doc_hash'] = hash + # delete Splitter character + page_content = document_node.page_content + if page_content.startswith(".") or page_content.startswith("。"): + page_content = page_content[1:] + else: + page_content = page_content + document_node.page_content = page_content split_documents.append(document_node) all_documents.extend(split_documents) # processing qa document