From 71950eea872858e772488466251ede5eb7b56f2c Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Tue, 24 Dec 2024 14:33:13 +0800
Subject: [PATCH] Fix Notion and website import (model attribute access, optional page_icon, default only_main_content)

---
 api/core/rag/extractor/extract_processor.py      |  6 +-----
 api/services/dataset_service.py                  | 16 ++++++++--------
 .../knowledge_entities/knowledge_entities.py     |  5 +++--
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
index a0b1aa4cefbd1f..2967dfe5ff4cc5 100644
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -131,11 +131,7 @@ def extract(
                         extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     else:
                         # txt
-                        extractor = (
-                            UnstructuredTextExtractor(file_path, unstructured_api_url)
-                            if is_automatic
-                            else TextExtractor(file_path, autodetect_encoding=True)
-                        )
+                        extractor = TextExtractor(file_path, autodetect_encoding=True)
                 else:
                     if file_extension in {".xlsx", ".xls"}:
                         extractor = ExcelExtractor(file_path)
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index b182008595b36e..7f2a82425667e7 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -882,7 +882,7 @@ def save_document_with_dataset_id(
                             exist_page_ids.append(data_source_info["notion_page_id"])
                             exist_document[data_source_info["notion_page_id"]] = document.id
                     for notion_info in notion_info_list:
-                        workspace_id = notion_info["workspace_id"]
+                        workspace_id = notion_info.workspace_id
                         data_source_binding = DataSourceOauthBinding.query.filter(
                             db.and_(
                                 DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
@@ -893,13 +893,13 @@ def save_document_with_dataset_id(
                         ).first()
                         if not data_source_binding:
                             raise ValueError("Data source binding not found.")
-                        for page in notion_info["pages"]:
-                            if page["page_id"] not in exist_page_ids:
+                        for page in notion_info.pages:
+                            if page.page_id not in exist_page_ids:
                                 data_source_info = {
                                     "notion_workspace_id": workspace_id,
-                                    "notion_page_id": page["page_id"],
-                                    "notion_page_icon": page["page_icon"],
-                                    "type": page["type"],
+                                    "notion_page_id": page.page_id,
+                                    "notion_page_icon": page.page_icon,
+                                    "type": page.type,
                                 }
                                 document = DocumentService.build_document(
                                     dataset,
@@ -911,7 +911,7 @@ def save_document_with_dataset_id(
                                     created_from,
                                     position,
                                     account,
-                                    page["page_name"],
+                                    page.page_name,
                                     batch,
                                 )
                                 db.session.add(document)
@@ -920,7 +920,7 @@ def save_document_with_dataset_id(
                                 documents.append(document)
                                 position += 1
                             else:
-                                exist_document.pop(page["page_id"])
+                                exist_document.pop(page.page_id)
                     # delete not selected documents
                     if len(exist_document) > 0:
                         clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py
index f5ca3924b2b6f3..c3bf7c0a7ee796 100644
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@@ -19,7 +19,8 @@ class ParentMode(str, Enum):
 class NotionPage(BaseModel):
     page_id: str
     page_name: str
-    page_icon: str
+    page_icon: Optional[str]
+    type: str
 
 
 class NotionInfo(BaseModel):
@@ -31,7 +32,7 @@ class WebsiteInfo(BaseModel):
     provider: str
     job_id: str
     urls: list[str]
-    only_main_content: bool
+    only_main_content: bool = True
 
 
 class FileInfo(BaseModel):