Skip to content

Commit

Permalink
ppt & pptx improve (#1790)
Browse files Browse the repository at this point in the history
Co-authored-by: jyong <[email protected]>
  • Loading branch information
JohnJyong and JohnJyong authored Dec 19, 2023
1 parent 185c2f8 commit df15099
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
17 changes: 12 additions & 5 deletions api/core/data_loader/loader/unstructured/unstructured_ppt.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,18 @@ def load(self) -> List[Document]:
from unstructured.partition.ppt import partition_ppt

elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
text_by_page = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text

combined_texts = list(text_by_page.values())
documents = []
for chunk in chunks:
text = chunk.text.strip()
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))

return documents
16 changes: 12 additions & 4 deletions api/core/data_loader/loader/unstructured/unstructured_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,19 @@ def load(self) -> List[Document]:
from unstructured.partition.pptx import partition_pptx

elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
text_by_page = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text

combined_texts = list(text_by_page.values())
documents = []
for chunk in chunks:
text = chunk.text.strip()
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))

return documents
7 changes: 7 additions & 0 deletions api/core/indexing_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,13 @@ def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
hash = helper.generate_text_hash(document_node.page_content)
document_node.metadata['doc_id'] = doc_id
document_node.metadata['doc_hash'] = hash
# delete splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith("。"):
page_content = page_content[1:]
else:
page_content = page_content
document_node.page_content = page_content
split_documents.append(document_node)
all_documents.extend(split_documents)
# processing qa document
Expand Down

0 comments on commit df15099

Please sign in to comment.