Skip to content

Commit

Permalink
fix(document_extractor): pptx file type and missing metadata_filename UnstructuredIO (#11364)
Browse files Browse the repository at this point in the history

Co-authored-by: Julian Huynh <[email protected]>
  • Loading branch information
hgbdev and Julian Huynh authored Dec 6, 2024
1 parent 1490a19 commit 9277156
Showing 1 changed file with 16 additions and 8 deletions.
24 changes: 16 additions & 8 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import csv
import io
import json
import os
import tempfile

import docx
import pandas as pd
Expand Down Expand Up @@ -264,14 +266,20 @@ def _extract_text_from_ppt(file_content: bytes) -> str:

def _extract_text_from_pptx(file_content: bytes) -> str:
try:
with io.BytesIO(file_content) as file:
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
elements = partition_via_api(
file=file,
api_url=dify_config.UNSTRUCTURED_API_URL,
api_key=dify_config.UNSTRUCTURED_API_KEY,
)
else:
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
temp_file.write(file_content)
temp_file.flush()
with open(temp_file.name, "rb") as file:
elements = partition_via_api(
file=file,
metadata_filename=temp_file.name,
api_url=dify_config.UNSTRUCTURED_API_URL,
api_key=dify_config.UNSTRUCTURED_API_KEY,
)
os.unlink(temp_file.name)
else:
with io.BytesIO(file_content) as file:
elements = partition_pptx(file=file)
return "\n".join([getattr(element, "text", "") for element in elements])
except Exception as e:
Expand Down

0 comments on commit 9277156

Please sign in to comment.