Skip to content

Commit

Permalink
fix: excel in node only read one sheet, close #9661 (#11215)
Browse files Browse the repository at this point in the history
Signed-off-by: yihong0618 <[email protected]>
  • Loading branch information
yihong0618 authored Nov 30, 2024
1 parent d96a284 commit 5a9b785
Showing 1 changed file with 12 additions and 10 deletions.
22 changes: 12 additions & 10 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import docx
import pandas as pd
import pypdfium2
import yaml
import pypdfium2 # type: ignore
import yaml # type: ignore
from unstructured.partition.api import partition_via_api
from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub
Expand Down Expand Up @@ -237,15 +237,17 @@ def _extract_text_from_csv(file_content: bytes) -> str:

def _extract_text_from_excel(file_content: bytes) -> str:
    """Extract text from every sheet of an Excel file as Markdown tables.

    Each sheet is parsed with pandas, rows whose cells are all NaN are
    dropped, and the sheet is rendered with ``DataFrame.to_markdown``.
    Every rendered table is followed by a blank line so that consecutive
    tables stay visually separated in the combined output.

    Args:
        file_content: Raw bytes of the Excel workbook.

    Returns:
        The Markdown tables of all sheets concatenated in workbook order.
        An empty string is returned when no sheet could be rendered.

    Raises:
        TextExtractionError: If the workbook cannot be opened or read.
    """
    import logging

    try:
        # The context manager guarantees the workbook handle is released,
        # even when a sheet fails part-way through.
        with pd.ExcelFile(io.BytesIO(file_content)) as excel_file:
            tables = []
            for sheet_name in excel_file.sheet_names:
                try:
                    df = excel_file.parse(sheet_name=sheet_name)
                    # Drop rows where all elements are NaN
                    df = df.dropna(how="all")
                    # Trailing blank line separates this table from the next
                    tables.append(df.to_markdown(index=False) + "\n\n")
                except Exception:
                    # Best effort: one unreadable sheet must not discard the
                    # others, but leave a trace instead of failing silently.
                    logging.getLogger(__name__).warning(
                        "Skipping Excel sheet %r: failed to convert to Markdown",
                        sheet_name,
                        exc_info=True,
                    )
                    continue
            return "".join(tables)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e
Expand Down

0 comments on commit 5a9b785

Please sign in to comment.