Skip to content

Commit

Permalink
Merge branch 'feat/firecrawl-data-source' into deploy/dev
Browse the repository at this point in the history
  • Loading branch information
JohnJyong committed Jun 7, 2024
2 parents 44ca9f6 + be91a90 commit 6dd4aec
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 5 deletions.
15 changes: 15 additions & 0 deletions api/controllers/console/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,21 @@ def post(self):
document_model=args['doc_form']
)
extract_settings.append(extract_setting)
elif args['info_list']['data_source_type'] == 'website_crawl':
website_info_list = args['info_list']['website_info_list']
for url in website_info_list['urls']:
extract_setting = ExtractSetting(
datasource_type="website",
website_info={
"provider": website_info_list['provider'],
"job_id": website_info_list['job_id'],
"url": url,
"mode": 'crawl',
"only_main_content": website_info_list['only_main_content']
},
document_model=args['doc_form']
)
extract_settings.append(extract_setting)
else:
raise ValueError('Data source type not support')
indexing_runner = IndexingRunner()
Expand Down
13 changes: 13 additions & 0 deletions api/controllers/console/datasets/datasets_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,19 @@ def get(self, dataset_id, batch):
document_model=document.doc_form
)
extract_settings.append(extract_setting)
elif document.data_source_type == 'website_crawl':
extract_setting = ExtractSetting(
datasource_type="website",
website_info={
"provider": data_source_info['provider'],
"job_id": data_source_info['job_id'],
"url": data_source_info['url'],
"mode": data_source_info['mode'],
"only_main_content": data_source_info['only_main_content']
},
document_model=document.doc_form
)
extract_settings.append(extract_setting)

else:
raise ValueError('Data source type not support')
Expand Down
4 changes: 2 additions & 2 deletions api/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ class Document(db.Model):
255), nullable=False, server_default=db.text("'text_model'::character varying"))
doc_language = db.Column(db.String(255), nullable=True)

DATA_SOURCES = ['upload_file', 'notion_import', 'website']
DATA_SOURCES = ['upload_file', 'notion_import', 'website_crawl']

@property
def display_status(self):
Expand Down Expand Up @@ -322,7 +322,7 @@ def data_source_detail_dict(self):
'created_at': file_detail.created_at.timestamp()
}
}
elif self.data_source_type == 'notion_import':
elif self.data_source_type == 'notion_import' or self.data_source_type == 'website_crawl':
return json.loads(self.data_source_info)
return {}

Expand Down
10 changes: 7 additions & 3 deletions api/services/dataset_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,9 @@ def save_document_with_dataset_id(dataset: Dataset, document_data: dict,
data_source_info = {
'url': url,
'provider': website_info['provider'],
'job_id': website_info['job_id']
'job_id': website_info['job_id'],
'only_main_content': website_info.get('only_main_content', False),
'mode': 'crawl',
}
document = DocumentService.build_document(dataset, dataset_process_rule.id,
document_data["data_source"]["type"],
Expand Down Expand Up @@ -887,7 +889,9 @@ def update_document_with_dataset_id(dataset: Dataset, document_data: dict,
data_source_info = {
'url': url,
'provider': website_info['provider'],
'job_id': website_info['job_id']
'job_id': website_info['job_id'],
'only_main_content': website_info.get('only_main_content', False),
'mode': 'crawl',
}
document.data_source_type = document_data["data_source"]["type"]
document.data_source_info = json.dumps(data_source_info)
Expand Down Expand Up @@ -1030,7 +1034,7 @@ def data_source_args_validate(cls, args: dict):
if 'notion_info_list' not in args['data_source']['info_list'] or not args['data_source']['info_list'][
'notion_info_list']:
raise ValueError("Notion source info is required")
if args['data_source']['type'] == 'website':
if args['data_source']['type'] == 'website_crawl':
if 'website_info_list' not in args['data_source']['info_list'] or not args['data_source']['info_list'][
'website_info_list']:
raise ValueError("Website source info is required")
Expand Down

0 comments on commit 6dd4aec

Please sign in to comment.