Skip to content

Commit

Permalink
notion and website import fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
JohnJyong committed Dec 24, 2024
1 parent 515e582 commit 71950ee
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 15 deletions.
6 changes: 1 addition & 5 deletions api/core/rag/extractor/extract_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,7 @@ def extract(
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
else:
# txt
extractor = (
UnstructuredTextExtractor(file_path, unstructured_api_url)
if is_automatic
else TextExtractor(file_path, autodetect_encoding=True)
)
extractor = TextExtractor(file_path, autodetect_encoding=True)
else:
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
Expand Down
16 changes: 8 additions & 8 deletions api/services/dataset_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ def save_document_with_dataset_id(
exist_page_ids.append(data_source_info["notion_page_id"])
exist_document[data_source_info["notion_page_id"]] = document.id
for notion_info in notion_info_list:
workspace_id = notion_info["workspace_id"]
workspace_id = notion_info.workspace_id
data_source_binding = DataSourceOauthBinding.query.filter(
db.and_(
DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
Expand All @@ -893,13 +893,13 @@ def save_document_with_dataset_id(
).first()
if not data_source_binding:
raise ValueError("Data source binding not found.")
for page in notion_info["pages"]:
if page["page_id"] not in exist_page_ids:
for page in notion_info.pages:
if page.page_id not in exist_page_ids:
data_source_info = {
"notion_workspace_id": workspace_id,
"notion_page_id": page["page_id"],
"notion_page_icon": page["page_icon"],
"type": page["type"],
"notion_page_id": page.page_id,
"notion_page_icon": page.page_icon,
"type": page.type,
}
document = DocumentService.build_document(
dataset,
Expand All @@ -911,7 +911,7 @@ def save_document_with_dataset_id(
created_from,
position,
account,
page["page_name"],
page.page_name,
batch,
)
db.session.add(document)
Expand All @@ -920,7 +920,7 @@ def save_document_with_dataset_id(
documents.append(document)
position += 1
else:
exist_document.pop(page["page_id"])
exist_document.pop(page.page_id)
# delete not selected documents
if len(exist_document) > 0:
clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
Expand Down
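5 changes: 3 additions & 2 deletions (third changed file; its path was not captured in this extract)
Original file line number Diff line number Diff line change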
Expand Up @@ -19,7 +19,8 @@ class ParentMode(str, Enum):
class NotionPage(BaseModel):
page_id: str
page_name: str
page_icon: str
page_icon: Optional[str]
type: str


class NotionInfo(BaseModel):
Expand All @@ -31,7 +32,7 @@ class WebsiteInfo(BaseModel):
provider: str
job_id: str
urls: list[str]
only_main_content: bool
only_main_content: bool = True


class FileInfo(BaseModel):
Expand Down

0 comments on commit 71950ee

Please sign in to comment.