From 71950eea872858e772488466251ede5eb7b56f2c Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 24 Dec 2024 14:33:13 +0800 Subject: [PATCH] notion and website import fix --- api/core/rag/extractor/extract_processor.py | 6 +----- api/services/dataset_service.py | 16 ++++++++-------- .../knowledge_entities/knowledge_entities.py | 5 +++-- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index a0b1aa4cefbd1f..2967dfe5ff4cc5 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -131,11 +131,7 @@ def extract( extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key) else: # txt - extractor = ( - UnstructuredTextExtractor(file_path, unstructured_api_url) - if is_automatic - else TextExtractor(file_path, autodetect_encoding=True) - ) + extractor = TextExtractor(file_path, autodetect_encoding=True) else: if file_extension in {".xlsx", ".xls"}: extractor = ExcelExtractor(file_path) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index b182008595b36e..7f2a82425667e7 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -882,7 +882,7 @@ def save_document_with_dataset_id( exist_page_ids.append(data_source_info["notion_page_id"]) exist_document[data_source_info["notion_page_id"]] = document.id for notion_info in notion_info_list: - workspace_id = notion_info["workspace_id"] + workspace_id = notion_info.workspace_id data_source_binding = DataSourceOauthBinding.query.filter( db.and_( DataSourceOauthBinding.tenant_id == current_user.current_tenant_id, @@ -893,13 +893,13 @@ def save_document_with_dataset_id( ).first() if not data_source_binding: raise ValueError("Data source binding not found.") - for page in notion_info["pages"]: - if page["page_id"] not in exist_page_ids: + for page in notion_info.pages: + if page.page_id not in exist_page_ids: data_source_info = { "notion_workspace_id": workspace_id, - "notion_page_id": page["page_id"], - "notion_page_icon": page["page_icon"], - "type": page["type"], + "notion_page_id": page.page_id, + "notion_page_icon": page.page_icon, + "type": page.type, } document = DocumentService.build_document( dataset, @@ -911,7 +911,7 @@ def save_document_with_dataset_id( created_from, position, account, - page["page_name"], + page.page_name, batch, ) db.session.add(document) @@ -920,7 +920,7 @@ def save_document_with_dataset_id( documents.append(document) position += 1 else: - exist_document.pop(page["page_id"]) + exist_document.pop(page.page_id) # delete not selected documents if len(exist_document) > 0: clean_notion_document_task.delay(list(exist_document.values()), dataset.id) diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index f5ca3924b2b6f3..c3bf7c0a7ee796 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -19,7 +19,8 @@ class ParentMode(str, Enum): class NotionPage(BaseModel): page_id: str page_name: str - page_icon: str + page_icon: Optional[str] + type: str class NotionInfo(BaseModel): @@ -31,7 +32,7 @@ class WebsiteInfo(BaseModel): provider: str job_id: str urls: list[str] - only_main_content: bool + only_main_content: bool = True class FileInfo(BaseModel):