Add hyperlink extraction support for xlsx files (#6722)

langgenius · Jul 26, 2024 · cf258b7 · cf258b7
1 parent 5d77dc4
commit cf258b7
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 15 deletions.
diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 import pandas as pd
+from openpyxl import load_workbook
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -28,26 +29,48 @@ def __init__(
         self._autodetect_encoding = autodetect_encoding
 
     def extract(self) -> list[Document]:
-        """ Load from Excel file in xls or xlsx format using Pandas."""
+        """Load an Excel file (.xls or .xlsx) as one Document per non-empty row.
+
+        Each row becomes '"header":"value"' pairs joined by ';'. For .xlsx,
+        cells whose hyperlink has a target are rendered as '[value](target)'.
+
+        Raises ValueError for any other file extension.
+        """
         documents = []
-        # Determine the file extension
         file_extension = os.path.splitext(self._file_path)[-1].lower()
-        # Read each worksheet of an Excel file using Pandas
+
         if file_extension == '.xlsx':
-            excel_file = pd.ExcelFile(self._file_path, engine='openpyxl')
+            wb = load_workbook(self._file_path, data_only=True)
+            for sheet_name in wb.sheetnames:
+                sheet = wb[sheet_name]
+                data = sheet.values
+                cols = next(data, None)  # None when the sheet has no rows (avoids StopIteration)
+                if cols is None:
+                    continue
+                df = pd.DataFrame(data, columns=cols)
+                df.dropna(how='all', inplace=True)
+                for index, row in df.iterrows():
+                    page_content = []
+                    for col_index, (k, v) in enumerate(row.items()):
+                        if pd.notna(v):
+                            # df index is 0-based and skips the header; openpyxl is 1-based
+                            cell = sheet.cell(row=index + 2, column=col_index + 1)
+                            # a hyperlink without a target would render as "(None)"; keep the plain value
+                            if cell.hyperlink and cell.hyperlink.target:
+                                v = f"[{v}]({cell.hyperlink.target})"
+                            page_content.append(f'"{k}":"{v}"')
+                    documents.append(Document(page_content=';'.join(page_content),
+                                              metadata={'source': self._file_path}))
         elif file_extension == '.xls':
             excel_file = pd.ExcelFile(self._file_path, engine='xlrd')
+            for sheet_name in excel_file.sheet_names:
+                df = excel_file.parse(sheet_name=sheet_name)
+                df.dropna(how='all', inplace=True)
+                for _, row in df.iterrows():
+                    page_content = [f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)]
+                    documents.append(Document(page_content=';'.join(page_content),
+                                              metadata={'source': self._file_path}))
         else:
             raise ValueError(f"Unsupported file extension: {file_extension}")
-        for sheet_name in excel_file.sheet_names:
-            df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)
-
-            # filter out rows with all NaN values
-            df.dropna(how='all', inplace=True)
-
-            # transform each row into a Document
-            documents += [Document(page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)),
-                                   metadata={'source': self._file_path},
-                                   ) for _, row in df.iterrows()]
 
         return documents
diff --git a/api/poetry.lock b/api/poetry.lock
diff --git a/api/pyproject.toml b/api/pyproject.toml
@@ -177,6 +177,7 @@ xinference-client = "0.9.4"
 yarl = "~1.9.4"
 zhipuai = "1.0.7"
 rank-bm25 = "~0.2.2"
+openpyxl = "^3.1.5"
 ############################################################
 # Tool dependencies required by tool implementations
 ############################################################