From f63b85db4a823efc2f4b354c6bb2fe62e71c142c Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Mon, 23 Dec 2024 19:23:26 +0800
Subject: [PATCH 1/3] qa extractor

---
 api/core/indexing_runner.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index d3407421fae74e..786427d0814484 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -274,6 +274,7 @@ def indexing_estimate(
             model_type=ModelType.TEXT_EMBEDDING,
         )
         preview_texts = []
+        total_segments = 0
         index_type = doc_form
         index_processor = IndexProcessorFactory(index_type).init_index_processor()
 
@@ -289,14 +290,21 @@ def indexing_estimate(
                 process_rule=processing_rule.to_dict(),
                 tenant_id=current_user.current_tenant_id,
                 doc_language=doc_language,
+                preview=True,
             )
             total_segments += len(documents)
             for document in documents:
                 if len(preview_texts) < 10:
-                    preview_detail = PreviewDetail(content=document.page_content)
-                    if document.children:
-                        preview_detail.child_chunks = [child.page_content for child in document.children]
-                    preview_texts.append(preview_detail)
+                    if doc_form and doc_form == "qa_model":
+                        preview_detail = QAPreviewDetail(
+                            question=document.page_content, answer=document.metadata.get("answer")
+                        )
+                        preview_texts.append(preview_detail)
+                    else:
+                        preview_detail = PreviewDetail(content=document.page_content)
+                        if document.children:
+                            preview_detail.child_chunks = [child.page_content for child in document.children]
+                        preview_texts.append(preview_detail)
 
                 # delete image files and related db records
                 image_upload_file_ids = get_image_upload_file_ids(document.page_content)
@@ -312,16 +320,9 @@ def indexing_estimate(
                     db.session.delete(image_file)
 
         if doc_form and doc_form == "qa_model":
-            if len(preview_texts) > 0:
-                # qa model document
-                response = LLMGenerator.generate_qa_document(
-                    current_user.current_tenant_id, preview_texts[0].content, doc_language
-                )
-                document_qa_list = self.format_split_text(response)
-
-                return IndexingEstimate(
-                    total_segments=total_segments * 20, qa_preview=document_qa_list, preview=preview_texts
-                )
+            return IndexingEstimate(
+                total_segments=total_segments * 20, qa_preview=preview_texts, preview=[]
+            )
         return IndexingEstimate(total_segments=total_segments, preview=preview_texts)
 
     def _extract(
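
Note on the hunk above: with preview=True threaded through to the index processor, indexing_estimate() now receives ready-made QA pairs from transform() and returns them as QAPreviewDetail entries, dropping the old single-segment LLMGenerator round-trip; total_segments * 20 stays as the rough estimate of QA pairs per segment. Neither preview model is defined in this diff. A minimal sketch of the shapes the new branch assumes, inferred only from the usage above (the real definitions live elsewhere in the codebase and may carry more fields):

    from typing import Optional

    from pydantic import BaseModel


    # Sketch only: fields inferred from PreviewDetail(content=...), the
    # .child_chunks assignment, and QAPreviewDetail(question=..., answer=...)
    # in the hunk above.
    class PreviewDetail(BaseModel):
        content: str
        child_chunks: Optional[list[str]] = None


    class QAPreviewDetail(BaseModel):
        question: str
        answer: str = ""
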
From 515e582fefc38873c78a2b5b62fe8e74a385c926 Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Mon, 23 Dec 2024 19:23:33 +0800
Subject: [PATCH 2/3] qa extractor

---
 .../processor/qa_index_processor.py | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py
index d42252d43f006c..0467ad2051dab8 100644
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -35,6 +35,7 @@ def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
         return text_docs
 
     def transform(self, documents: list[Document], **kwargs) -> list[Document]:
+        preview = kwargs.get("preview")
         process_rule = kwargs.get("process_rule")
         rules = Rule(**process_rule.get("rules"))
         splitter = self._get_splitter(
@@ -67,24 +68,31 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]:
                     document_node.page_content = remove_leading_symbols(page_content)
                     split_documents.append(document_node)
             all_documents.extend(split_documents)
-        for i in range(0, len(all_documents), 10):
-            threads = []
-            sub_documents = all_documents[i : i + 10]
-            for doc in sub_documents:
-                document_format_thread = threading.Thread(
-                    target=self._format_qa_document,
-                    kwargs={
-                        "flask_app": current_app._get_current_object(),
-                        "tenant_id": kwargs.get("tenant_id"),
-                        "document_node": doc,
-                        "all_qa_documents": all_qa_documents,
-                        "document_language": kwargs.get("doc_language", "English"),
-                    },
-                )
-                threads.append(document_format_thread)
-                document_format_thread.start()
-            for thread in threads:
-                thread.join()
+        if preview:
+            self._format_qa_document(current_app._get_current_object(),
+                                     kwargs.get("tenant_id"),
+                                     all_documents[0],
+                                     all_qa_documents,
+                                     kwargs.get("doc_language", "English"))
+        else:
+            for i in range(0, len(all_documents), 10):
+                threads = []
+                sub_documents = all_documents[i : i + 10]
+                for doc in sub_documents:
+                    document_format_thread = threading.Thread(
+                        target=self._format_qa_document,
+                        kwargs={
+                            "flask_app": current_app._get_current_object(),
+                            "tenant_id": kwargs.get("tenant_id"),
+                            "document_node": doc,
+                            "all_qa_documents": all_qa_documents,
+                            "document_language": kwargs.get("doc_language", "English"),
+                        },
+                    )
+                    threads.append(document_format_thread)
+                    document_format_thread.start()
+                for thread in threads:
+                    thread.join()
         return all_qa_documents
 
     def format_by_template(self, file: FileStorage, **kwargs) -> list[Document]:
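
Note on the hunk above: when preview is set, transform() formats just all_documents[0] synchronously, so an estimate costs one LLM call (and assumes the splitter produced at least one chunk); the else branch keeps the existing fan-out, which starts QA-generation threads in batches of ten and joins each batch before launching the next, capping concurrency at ten calls. A standalone sketch of that batching pattern with illustrative names (process_in_batches is not the project's API):

    import threading


    def process_in_batches(items, worker, batch_size=10):
        # Launch at most batch_size threads, then join them all before
        # moving on, so no more than batch_size workers run at once.
        for start in range(0, len(items), batch_size):
            batch = items[start : start + batch_size]
            threads = [threading.Thread(target=worker, args=(item,)) for item in batch]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

In the real code each worker is also handed current_app._get_current_object(), the concrete Flask app behind the request proxy, so _format_qa_document can push an app context inside its thread.
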
"notion_page_icon": page["page_icon"], - "type": page["type"], + "notion_page_id": page.page_id, + "notion_page_icon": page.page_icon, + "type": page.type, } document = DocumentService.build_document( dataset, @@ -911,7 +911,7 @@ def save_document_with_dataset_id( created_from, position, account, - page["page_name"], + page.page_name, batch, ) db.session.add(document) @@ -920,7 +920,7 @@ def save_document_with_dataset_id( documents.append(document) position += 1 else: - exist_document.pop(page["page_id"]) + exist_document.pop(page.page_id) # delete not selected documents if len(exist_document) > 0: clean_notion_document_task.delay(list(exist_document.values()), dataset.id) diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index f5ca3924b2b6f3..c3bf7c0a7ee796 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -19,7 +19,8 @@ class ParentMode(str, Enum): class NotionPage(BaseModel): page_id: str page_name: str - page_icon: str + page_icon: Optional[str] + type: str class NotionInfo(BaseModel): @@ -31,7 +32,7 @@ class WebsiteInfo(BaseModel): provider: str job_id: str urls: list[str] - only_main_content: bool + only_main_content: bool = True class FileInfo(BaseModel):