Skip to content

Commit

Permalink
ppt & pptx improve (#1790)
Browse files Browse the repository at this point in the history
Co-authored-by: jyong <[email protected]>
  • Loading branch information
JohnJyong and JohnJyong authored Dec 19, 2023
1 parent 185c2f8 commit df15099
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
17 changes: 12 additions & 5 deletions api/core/data_loader/loader/unstructured/unstructured_ppt.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,18 @@ def load(self) -> List[Document]:
from unstructured.partition.ppt import partition_ppt

elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
text_by_page = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text

combined_texts = list(text_by_page.values())
documents = []
for chunk in chunks:
text = chunk.text.strip()
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))

return documents
16 changes: 12 additions & 4 deletions api/core/data_loader/loader/unstructured/unstructured_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,19 @@ def load(self) -> List[Document]:
from unstructured.partition.pptx import partition_pptx

elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
text_by_page = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text

combined_texts = list(text_by_page.values())
documents = []
for chunk in chunks:
text = chunk.text.strip()
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))

return documents
7 changes: 7 additions & 0 deletions api/core/indexing_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,13 @@ def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
hash = helper.generate_text_hash(document_node.page_content)
document_node.metadata['doc_id'] = doc_id
document_node.metadata['doc_hash'] = hash
# delete splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith("。"):
page_content = page_content[1:]
else:
page_content = page_content
document_node.page_content = page_content
split_documents.append(document_node)
all_documents.extend(split_documents)
# processing qa document
Expand Down

0 comments on commit df15099

Please sign in to comment.