Skip to content

Commit

Permalink
add xlsx support hyperlink extract (#6722)
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnJyong authored Jul 26, 2024
1 parent 5d77dc4 commit cf258b7
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 15 deletions.
51 changes: 37 additions & 14 deletions api/core/rag/extractor/excel_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Optional

import pandas as pd
from openpyxl import load_workbook

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
Expand All @@ -28,26 +29,48 @@ def __init__(
self._autodetect_encoding = autodetect_encoding

def extract(self) -> list[Document]:
    """Load an Excel file in xls or xlsx format using Pandas and openpyxl.

    Every worksheet row that has at least one non-empty cell becomes one
    Document whose page_content is a ';'-joined list of
    '"<column header>":"<cell value>"' pairs. For .xlsx files, a cell that
    carries a hyperlink with an external target is rendered as a
    Markdown-style link, '[value](target)'.

    Returns:
        One Document per non-empty row, across all worksheets, each with
        metadata {'source': <file path>}.

    Raises:
        ValueError: if the file extension is neither .xlsx nor .xls.
    """
    documents = []
    # Determine the file extension
    file_extension = os.path.splitext(self._file_path)[-1].lower()

    if file_extension == '.xlsx':
        # openpyxl is used directly (instead of pandas) so that per-cell
        # hyperlink information is available.
        wb = load_workbook(self._file_path, data_only=True)
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            data = sheet.values
            # The first row is the header. A worksheet with no rows at all
            # yields nothing, so skip it instead of letting next() raise
            # StopIteration.
            cols = next(data, None)
            if cols is None:
                continue
            df = pd.DataFrame(data, columns=cols)

            # filter out rows with all NaN values; the surviving rows keep
            # their original index labels, which the cell lookup below
            # relies on.
            df.dropna(how='all', inplace=True)

            for index, row in df.iterrows():
                page_content = []
                for col_index, (k, v) in enumerate(row.items()):
                    if pd.isna(v):
                        continue
                    # +2 to account for the header row and the 1-based
                    # worksheet row numbering; columns are 1-based too.
                    cell = sheet.cell(row=index + 2, column=col_index + 1)
                    hyperlink = cell.hyperlink
                    # Only render a link when there is an actual target;
                    # otherwise fall back to the plain cell value.
                    if hyperlink and hyperlink.target:
                        v = f"[{v}]({hyperlink.target})"
                    page_content.append(f'"{k}":"{v}"')
                documents.append(Document(page_content=';'.join(page_content),
                                          metadata={'source': self._file_path}))

    elif file_extension == '.xls':
        # Use the ExcelFile as a context manager so the underlying file
        # handle is released once all sheets have been parsed.
        with pd.ExcelFile(self._file_path, engine='xlrd') as excel_file:
            for sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name=sheet_name)

                # filter out rows with all NaN values
                df.dropna(how='all', inplace=True)

                # transform each row into a Document
                for _, row in df.iterrows():
                    page_content = [f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)]
                    documents.append(Document(page_content=';'.join(page_content),
                                              metadata={'source': self._file_path}))
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    return documents
2 changes: 1 addition & 1 deletion api/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ xinference-client = "0.9.4"
yarl = "~1.9.4"
zhipuai = "1.0.7"
rank-bm25 = "~0.2.2"
openpyxl = "^3.1.5"
############################################################
# Tool dependencies required by tool implementations
############################################################
Expand Down

0 comments on commit cf258b7

Please sign in to comment.