diff --git a/api/controllers/console/datasets/website.py b/api/controllers/console/datasets/website.py index cb54f1aacbc2ab..e80ce17c6866aa 100644 --- a/api/controllers/console/datasets/website.py +++ b/api/controllers/console/datasets/website.py @@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource): @account_initialization_required def post(self): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json") + parser.add_argument( + "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json" + ) parser.add_argument("url", type=str, required=True, nullable=True, location="json") parser.add_argument("options", type=dict, required=True, nullable=True, location="json") args = parser.parse_args() @@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource): @account_initialization_required def get(self, job_id: str): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args") + parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") args = parser.parse_args() # get crawl status try: diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index 0ffc89b214c2d9..9048138511b122 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -12,6 +12,7 @@ from core.rag.extractor.excel_extractor import ExcelExtractor from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor from core.rag.extractor.html_extractor import HtmlExtractor +from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor from core.rag.extractor.markdown_extractor import MarkdownExtractor from core.rag.extractor.notion_extractor import NotionExtractor from core.rag.extractor.pdf_extractor import PdfExtractor @@ -171,6 
+172,15 @@ def extract( only_main_content=extract_setting.website_info.only_main_content, ) return extractor.extract() + elif extract_setting.website_info.provider == "jinareader": + extractor = JinaReaderWebExtractor( + url=extract_setting.website_info.url, + job_id=extract_setting.website_info.job_id, + tenant_id=extract_setting.website_info.tenant_id, + mode=extract_setting.website_info.mode, + only_main_content=extract_setting.website_info.only_main_content, + ) + return extractor.extract() else: raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}") else: diff --git a/api/core/rag/extractor/jina_reader_extractor.py b/api/core/rag/extractor/jina_reader_extractor.py new file mode 100644 index 00000000000000..5b780af126b309 --- /dev/null +++ b/api/core/rag/extractor/jina_reader_extractor.py @@ -0,0 +1,35 @@ +from core.rag.extractor.extractor_base import BaseExtractor +from core.rag.models.document import Document +from services.website_service import WebsiteService + + +class JinaReaderWebExtractor(BaseExtractor): + """ + Crawl and scrape websites and return content in clean llm-ready markdown. 
+ """ + + def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False): + """Initialize with url, api_key, base_url and mode.""" + self._url = url + self.job_id = job_id + self.tenant_id = tenant_id + self.mode = mode + self.only_main_content = only_main_content + + def extract(self) -> list[Document]: + """Extract content from the URL.""" + documents = [] + if self.mode == "crawl": + crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id) + if crawl_data is None: + return [] + document = Document( + page_content=crawl_data.get("content", ""), + metadata={ + "source_url": crawl_data.get("url"), + "description": crawl_data.get("description"), + "title": crawl_data.get("title"), + }, + ) + documents.append(document) + return documents diff --git a/api/services/auth/api_key_auth_factory.py b/api/services/auth/api_key_auth_factory.py index ae5b953b47f589..36387e9c2efdb2 100644 --- a/api/services/auth/api_key_auth_factory.py +++ b/api/services/auth/api_key_auth_factory.py @@ -1,10 +1,13 @@ from services.auth.firecrawl import FirecrawlAuth +from services.auth.jina import JinaAuth class ApiKeyAuthFactory: def __init__(self, provider: str, credentials: dict): if provider == "firecrawl": self.auth = FirecrawlAuth(credentials) + elif provider == "jinareader": + self.auth = JinaAuth(credentials) else: raise ValueError("Invalid provider") diff --git a/api/services/auth/jina.py b/api/services/auth/jina.py new file mode 100644 index 00000000000000..de898a1f94b763 --- /dev/null +++ b/api/services/auth/jina.py @@ -0,0 +1,44 @@ +import json + +import requests + +from services.auth.api_key_auth_base import ApiKeyAuthBase + + +class JinaAuth(ApiKeyAuthBase): + def __init__(self, credentials: dict): + super().__init__(credentials) + auth_type = credentials.get("auth_type") + if auth_type != "bearer": + raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer") + 
self.api_key = credentials.get("config").get("api_key", None) + + if not self.api_key: + raise ValueError("No API key provided") + + def validate_credentials(self): + headers = self._prepare_headers() + options = { + "url": "https://example.com", + } + response = self._post_request("https://r.jina.ai", options, headers) + if response.status_code == 200: + return True + else: + self._handle_error(response) + + def _prepare_headers(self): + return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} + + def _post_request(self, url, data, headers): + return requests.post(url, headers=headers, json=data) + + def _handle_error(self, response): + if response.status_code in {402, 409, 500}: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + else: + if response.text: + error_message = json.loads(response.text).get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + raise Exception(f"Unexpected error occurred while trying to authorize. 
Status code: {response.status_code}") diff --git a/api/services/website_service.py b/api/services/website_service.py index fea605cf30b410..be01815720ab57 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -1,6 +1,7 @@ import datetime import json +import requests from flask_login import current_user from core.helper import encrypter @@ -65,6 +66,35 @@ def crawl_url(cls, args: dict) -> dict: time = str(datetime.datetime.now().timestamp()) redis_client.setex(website_crawl_time_cache_key, 3600, time) return {"status": "active", "job_id": job_id} + elif provider == "jinareader": + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + crawl_sub_pages = options.get("crawl_sub_pages", False) + if not crawl_sub_pages: + response = requests.get( + f"https://r.jina.ai/{url}", + headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"}, + ) + if response.json().get("code") != 200: + raise ValueError("Failed to crawl") + return {"status": "active", "data": response.json().get("data")} + else: + response = requests.post( + "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app", + json={ + "url": url, + "maxPages": options.get("limit", 1), + "useSitemap": options.get("use_sitemap", True), + }, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + }, + ) + if response.json().get("code") != 200: + raise ValueError("Failed to crawl") + return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")} else: raise ValueError("Invalid provider") @@ -93,6 +123,42 @@ def get_crawl_status(cls, job_id: str, provider: str) -> dict: time_consuming = abs(end_time - float(start_time)) crawl_status_data["time_consuming"] = f"{time_consuming:.2f}" redis_client.delete(website_crawl_time_cache_key) + elif provider == "jinareader": + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, 
token=credentials.get("config").get("api_key") + ) + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id}, + ) + data = response.json().get("data", {}) + crawl_status_data = { + "status": data.get("status", "active"), + "job_id": job_id, + "total": len(data.get("urls", [])), + "current": len(data.get("processed", [])) + len(data.get("failed", [])), + "data": [], + "time_consuming": data.get("duration", 0) / 1000, + } + + if crawl_status_data["status"] == "completed": + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())}, + ) + data = response.json().get("data", {}) + formatted_data = [ + { + "title": item.get("data", {}).get("title"), + "source_url": item.get("data", {}).get("url"), + "description": item.get("data", {}).get("description"), + "markdown": item.get("data", {}).get("content"), + } + for item in data.get("processed", {}).values() + ] + crawl_status_data["data"] = formatted_data else: raise ValueError("Invalid provider") return crawl_status_data @@ -119,6 +185,40 @@ def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str if item.get("source_url") == url: return item return None + elif provider == "jinareader": + file_key = "website_files/" + job_id + ".txt" + if storage.exists(file_key): + data = storage.load_once(file_key) + if data: + data = json.loads(data.decode("utf-8")) + elif not job_id: + response = requests.get( + f"https://r.jina.ai/{url}", + headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"}, + ) + if response.json().get("code") != 200: + raise ValueError("Failed to crawl") + return response.json().get("data") + else: + api_key = 
encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id}, + ) + data = response.json().get("data", {}) + if data.get("status") != "completed": + raise ValueError("Crawl job is not completed") + + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())}, + ) + data = response.json().get("data", {}) + for item in data.get("processed", {}).values(): + if item.get("data", {}).get("url") == url: + return item.get("data", {}) else: raise ValueError("Invalid provider") diff --git a/web/app/components/datasets/create/assets/jina.png b/web/app/components/datasets/create/assets/jina.png new file mode 100644 index 00000000000000..b4beeafdfb1271 Binary files /dev/null and b/web/app/components/datasets/create/assets/jina.png differ diff --git a/web/app/components/datasets/create/index.tsx b/web/app/components/datasets/create/index.tsx index 12c6284d882c5b..98098445c7695c 100644 --- a/web/app/components/datasets/create/index.tsx +++ b/web/app/components/datasets/create/index.tsx @@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets' import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets' import { fetchDataSource } from '@/service/common' import { fetchDatasetDetail } from '@/service/datasets' -import type { NotionPage } from '@/models/common' +import { DataSourceProvider, type NotionPage } from '@/models/common' import { useModalContext } from '@/context/modal-context' import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks' @@ -26,6 +26,7 @@ const 
DEFAULT_CRAWL_OPTIONS: CrawlOptions = { excludes: '', limit: 10, max_depth: '', + use_sitemap: true, } const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { @@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { const updateFileList = (preparedFiles: FileItem[]) => { setFiles(preparedFiles) } - const [fireCrawlJobId, setFireCrawlJobId] = useState('') + const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState(DataSourceProvider.fireCrawl) + const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('') const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => { const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID) @@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { onStepChange={nextStep} websitePages={websitePages} updateWebsitePages={setWebsitePages} - onFireCrawlJobIdChange={setFireCrawlJobId} + onWebsiteCrawlProviderChange={setWebsiteCrawlProvider} + onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId} crawlOptions={crawlOptions} onCrawlOptionsChange={setCrawlOptions} /> @@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { files={fileList.map(file => file.file)} notionPages={notionPages} websitePages={websitePages} - fireCrawlJobId={fireCrawlJobId} + websiteCrawlProvider={websiteCrawlProvider} + websiteCrawlJobId={websiteCrawlJobId} onStepChange={changeStep} updateIndexingTypeCache={updateIndexingTypeCache} updateResultCache={updateResultCache} diff --git a/web/app/components/datasets/create/step-one/index.tsx b/web/app/components/datasets/create/step-one/index.tsx index c2d77f4cecdcc8..643932e9ae21d5 100644 --- a/web/app/components/datasets/create/step-one/index.tsx +++ b/web/app/components/datasets/create/step-one/index.tsx @@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview' import s from './index.module.css' import cn from '@/utils/classnames' import type { CrawlOptions, 
CrawlResultItem, FileItem } from '@/models/datasets' -import type { NotionPage } from '@/models/common' +import type { DataSourceProvider, NotionPage } from '@/models/common' import { DataSourceType } from '@/models/datasets' import Button from '@/app/components/base/button' import { NotionPageSelector } from '@/app/components/base/notion-page-selector' @@ -33,7 +33,8 @@ type IStepOneProps = { changeType: (type: DataSourceType) => void websitePages?: CrawlResultItem[] updateWebsitePages: (value: CrawlResultItem[]) => void - onFireCrawlJobIdChange: (jobId: string) => void + onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void + onWebsiteCrawlJobIdChange: (jobId: string) => void crawlOptions: CrawlOptions onCrawlOptionsChange: (payload: CrawlOptions) => void } @@ -69,7 +70,8 @@ const StepOne = ({ updateNotionPages, websitePages = [], updateWebsitePages, - onFireCrawlJobIdChange, + onWebsiteCrawlProviderChange, + onWebsiteCrawlJobIdChange, crawlOptions, onCrawlOptionsChange, }: IStepOneProps) => { @@ -229,7 +231,8 @@ const StepOne = ({ onPreview={setCurrentWebsite} checkedCrawlResult={websitePages} onCheckedCrawlResultChange={updateWebsitePages} - onJobIdChange={onFireCrawlJobIdChange} + onCrawlProviderChange={onWebsiteCrawlProviderChange} + onJobIdChange={onWebsiteCrawlJobIdChange} crawlOptions={crawlOptions} onCrawlOptionsChange={onCrawlOptionsChange} /> diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index f4fc58ee2a2d87..ee01652de8dd2d 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen import Toast from '@/app/components/base/toast' import { formatNumber } from '@/utils/format' import type { NotionPage } from '@/models/common' +import { DataSourceProvider } from '@/models/common' import { 
DataSourceType, DocForm } from '@/models/datasets' import NotionIcon from '@/app/components/base/notion-icon' import Switch from '@/app/components/base/switch' @@ -63,7 +64,8 @@ type StepTwoProps = { notionPages?: NotionPage[] websitePages?: CrawlResultItem[] crawlOptions?: CrawlOptions - fireCrawlJobId?: string + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string onStepChange?: (delta: number) => void updateIndexingTypeCache?: (type: string) => void updateResultCache?: (res: createDocumentResponse) => void @@ -94,7 +96,8 @@ const StepTwo = ({ notionPages = [], websitePages = [], crawlOptions, - fireCrawlJobId = '', + websiteCrawlProvider = DataSourceProvider.fireCrawl, + websiteCrawlJobId = '', onStepChange, updateIndexingTypeCache, updateResultCache, @@ -260,8 +263,8 @@ const StepTwo = ({ const getWebsiteInfo = () => { return { - provider: 'firecrawl', - job_id: fireCrawlJobId, + provider: websiteCrawlProvider, + job_id: websiteCrawlJobId, urls: websitePages.map(page => page.source_url), only_main_content: crawlOptions?.only_main_content, } diff --git a/web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx b/web/app/components/datasets/create/website/base/checkbox-with-label.tsx similarity index 72% rename from web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx rename to web/app/components/datasets/create/website/base/checkbox-with-label.tsx index 5c574ebe3e6195..25d40fe0763dab 100644 --- a/web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx +++ b/web/app/components/datasets/create/website/base/checkbox-with-label.tsx @@ -3,6 +3,7 @@ import type { FC } from 'react' import React from 'react' import cn from '@/utils/classnames' import Checkbox from '@/app/components/base/checkbox' +import Tooltip from '@/app/components/base/tooltip' type Props = { className?: string @@ -10,6 +11,7 @@ type Props = { onChange: (isChecked: boolean) => void label: string 
labelClassName?: string + tooltip?: string } const CheckboxWithLabel: FC = ({ @@ -18,11 +20,20 @@ const CheckboxWithLabel: FC = ({ onChange, label, labelClassName, + tooltip, }) => { return ( ) } diff --git a/web/app/components/datasets/create/website/firecrawl/crawled-result-item.tsx b/web/app/components/datasets/create/website/base/crawled-result-item.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/crawled-result-item.tsx rename to web/app/components/datasets/create/website/base/crawled-result-item.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx b/web/app/components/datasets/create/website/base/crawled-result.tsx similarity index 97% rename from web/app/components/datasets/create/website/firecrawl/crawled-result.tsx rename to web/app/components/datasets/create/website/base/crawled-result.tsx index 2bd51e4d731a95..d5c8d1b80a5a19 100644 --- a/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx +++ b/web/app/components/datasets/create/website/base/crawled-result.tsx @@ -2,7 +2,7 @@ import type { FC } from 'react' import React, { useCallback } from 'react' import { useTranslation } from 'react-i18next' -import CheckboxWithLabel from './base/checkbox-with-label' +import CheckboxWithLabel from './checkbox-with-label' import CrawledResultItem from './crawled-result-item' import cn from '@/utils/classnames' import type { CrawlResultItem } from '@/models/datasets' diff --git a/web/app/components/datasets/create/website/firecrawl/crawling.tsx b/web/app/components/datasets/create/website/base/crawling.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/crawling.tsx rename to web/app/components/datasets/create/website/base/crawling.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/error-message.tsx b/web/app/components/datasets/create/website/base/error-message.tsx similarity index 100% rename from 
web/app/components/datasets/create/website/firecrawl/base/error-message.tsx rename to web/app/components/datasets/create/website/base/error-message.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/field.tsx b/web/app/components/datasets/create/website/base/field.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/field.tsx rename to web/app/components/datasets/create/website/base/field.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/input.tsx b/web/app/components/datasets/create/website/base/input.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/input.tsx rename to web/app/components/datasets/create/website/base/input.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/mock-crawl-result.ts b/web/app/components/datasets/create/website/base/mock-crawl-result.ts similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/mock-crawl-result.ts rename to web/app/components/datasets/create/website/base/mock-crawl-result.ts diff --git a/web/app/components/datasets/create/website/firecrawl/base/options-wrap.tsx b/web/app/components/datasets/create/website/base/options-wrap.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/options-wrap.tsx rename to web/app/components/datasets/create/website/base/options-wrap.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/url-input.tsx b/web/app/components/datasets/create/website/base/url-input.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/url-input.tsx rename to web/app/components/datasets/create/website/base/url-input.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/index.tsx b/web/app/components/datasets/create/website/firecrawl/index.tsx index de4f8bb1293447..aa4dffc174315f 100644 --- 
a/web/app/components/datasets/create/website/firecrawl/index.tsx +++ b/web/app/components/datasets/create/website/firecrawl/index.tsx @@ -2,13 +2,13 @@ import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' import { useTranslation } from 'react-i18next' +import UrlInput from '../base/url-input' +import OptionsWrap from '../base/options-wrap' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' import Header from './header' -import UrlInput from './base/url-input' -import OptionsWrap from './base/options-wrap' import Options from './options' -import CrawledResult from './crawled-result' -import Crawling from './crawling' -import ErrorMessage from './base/error-message' import cn from '@/utils/classnames' import { useModalContext } from '@/context/modal-context' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' diff --git a/web/app/components/datasets/create/website/firecrawl/options.tsx b/web/app/components/datasets/create/website/firecrawl/options.tsx index 20cc4f073fe43b..8cc2c6757c9615 100644 --- a/web/app/components/datasets/create/website/firecrawl/options.tsx +++ b/web/app/components/datasets/create/website/firecrawl/options.tsx @@ -2,8 +2,8 @@ import type { FC } from 'react' import React, { useCallback } from 'react' import { useTranslation } from 'react-i18next' -import CheckboxWithLabel from './base/checkbox-with-label' -import Field from './base/field' +import CheckboxWithLabel from '../base/checkbox-with-label' +import Field from '../base/field' import cn from '@/utils/classnames' import type { CrawlOptions } from '@/models/datasets' diff --git a/web/app/components/datasets/create/website/index.module.css b/web/app/components/datasets/create/website/index.module.css new file mode 100644 index 00000000000000..abaab4bea4b7a1 --- /dev/null +++ b/web/app/components/datasets/create/website/index.module.css @@ 
-0,0 +1,6 @@ +.jinaLogo { + @apply w-4 h-4 bg-center bg-no-repeat inline-block; + background-color: #F5FAFF; + background-image: url(../assets/jina.png); + background-size: 16px; +} diff --git a/web/app/components/datasets/create/website/index.tsx b/web/app/components/datasets/create/website/index.tsx index e06fbb4a1210b6..58b7f5f2fd77bd 100644 --- a/web/app/components/datasets/create/website/index.tsx +++ b/web/app/components/datasets/create/website/index.tsx @@ -1,8 +1,12 @@ 'use client' import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import s from './index.module.css' import NoData from './no-data' import Firecrawl from './firecrawl' +import JinaReader from './jina-reader' +import cn from '@/utils/classnames' import { useModalContext } from '@/context/modal-context' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' import { fetchDataSources } from '@/service/datasets' @@ -12,6 +16,7 @@ type Props = { onPreview: (payload: CrawlResultItem) => void checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onCrawlProviderChange: (provider: DataSourceProvider) => void onJobIdChange: (jobId: string) => void crawlOptions: CrawlOptions onCrawlOptionsChange: (payload: CrawlOptions) => void @@ -21,17 +26,32 @@ const Website: FC = ({ onPreview, checkedCrawlResult, onCheckedCrawlResultChange, + onCrawlProviderChange, onJobIdChange, crawlOptions, onCrawlOptionsChange, }) => { + const { t } = useTranslation() const { setShowAccountSettingModal } = useModalContext() const [isLoaded, setIsLoaded] = useState(false) - const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false) + const [selectedProvider, setSelectedProvider] = useState(DataSourceProvider.jinaReader) + const [sources, setSources] = useState([]) + + useEffect(() => { + onCrawlProviderChange(selectedProvider) + }, [selectedProvider, 
onCrawlProviderChange]) + const checkSetApiKey = useCallback(async () => { const res = await fetchDataSources() as any - const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl) - setIsSetFirecrawlApiKey(isFirecrawlSet) + setSources(res.sources) + + // If users have configured one of the providers, select it. + const availableProviders = res.sources.filter((item: DataSourceItem) => + [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider), + ) + + if (availableProviders.length > 0) + setSelectedProvider(availableProviders[0].provider) }, []) useEffect(() => { @@ -52,20 +72,66 @@ const Website: FC = ({ return (
- {isSetFirecrawlApiKey - ? ( - - ) - : ( - - )} +
+
+ {t('datasetCreation.stepOne.website.chooseProvider')} +
+
+ + +
+
+ + { + selectedProvider === DataSourceProvider.fireCrawl + ? sources.find(source => source.provider === DataSourceProvider.fireCrawl) + ? ( + + ) + : ( + + ) + : sources.find(source => source.provider === DataSourceProvider.jinaReader) + ? ( + + ) + : ( + + ) + }
) } diff --git a/web/app/components/datasets/create/website/jina-reader/header.tsx b/web/app/components/datasets/create/website/jina-reader/header.tsx new file mode 100644 index 00000000000000..85014a30ee2b12 --- /dev/null +++ b/web/app/components/datasets/create/website/jina-reader/header.tsx @@ -0,0 +1,42 @@ +'use client' +import type { FC } from 'react' +import React from 'react' +import { useTranslation } from 'react-i18next' +import { Settings01 } from '@/app/components/base/icons/src/vender/line/general' +import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onSetting: () => void +} + +const Header: FC = ({ + onSetting, +}) => { + const { t } = useTranslation() + + return ( +
+
+
{t(`${I18N_PREFIX}.jinaReaderTitle`)}
+
+
+ +
+
+ + + {t(`${I18N_PREFIX}.jinaReaderDoc`)} + +
+ ) +} +export default React.memo(Header) diff --git a/web/app/components/datasets/create/website/jina-reader/index.tsx b/web/app/components/datasets/create/website/jina-reader/index.tsx new file mode 100644 index 00000000000000..51d77d712140b7 --- /dev/null +++ b/web/app/components/datasets/create/website/jina-reader/index.tsx @@ -0,0 +1,232 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import UrlInput from '../base/url-input' +import OptionsWrap from '../base/options-wrap' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' +import Header from './header' +import Options from './options' +import cn from '@/utils/classnames' +import { useModalContext } from '@/context/modal-context' +import Toast from '@/app/components/base/toast' +import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets' +import { sleep } from '@/utils' +import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' + +const ERROR_I18N_PREFIX = 'common.errorMsg' +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onPreview: (payload: CrawlResultItem) => void + checkedCrawlResult: CrawlResultItem[] + onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const JinaReader: FC = ({ + onPreview, + checkedCrawlResult, + onCheckedCrawlResultChange, + onJobIdChange, + crawlOptions, + onCrawlOptionsChange, +}) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + useEffect(() => { + if (step !== Step.init) + 
setControlFoldOptions(Date.now())
+  }, [step])
+  const { setShowAccountSettingModal } = useModalContext()
+  // Opens the account-setting modal on its "data-source" payload.
+  const handleSetting = useCallback(() => {
+    setShowAccountSettingModal({
+      payload: 'data-source',
+    })
+  }, [setShowAccountSettingModal])
+
+  // Validates the user input before a crawl is started.
+  // Returns `{ isValid, errorMsg }`; only the first failing rule is reported.
+  const checkValid = useCallback((url: string) => {
+    let errorMsg = ''
+    if (!url) {
+      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
+        field: 'url',
+      })
+    }
+
+    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
+      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
+
+    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
+      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
+        field: t(`${I18N_PREFIX}.limit`),
+      })
+    }
+
+    return {
+      isValid: !errorMsg,
+      errorMsg,
+    }
+  }, [crawlOptions, t])
+
+  const isInit = step === Step.init
+  const isCrawlFinished = step === Step.finished
+  const isRunning = step === Step.running
+  const [crawlResult, setCrawlResult] = useState<{
+    current: number
+    total: number
+    data: CrawlResultItem[]
+    time_consuming: number | string
+  } | undefined>(undefined)
+  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
+  const showError = isCrawlFinished && crawlErrorMessage
+
+  // Polls the crawl-status endpoint every 2.5s until the job reports
+  // `completed` or `failed` (a missing status is treated as failed).
+  // Always resolves to `{ isError, data, errorMessage? }`; it never rejects,
+  // so the caller can branch on `isError` alone.
+  const waitForCrawlFinished = useCallback(async (jobId: string) => {
+    try {
+      const res = await checkJinaReaderTaskStatus(jobId) as any
+      if (res.status === 'completed') {
+        return {
+          isError: false,
+          data: {
+            ...res,
+            // never report more pages than the user-configured limit
+            total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
+          },
+        }
+      }
+      if (res.status === 'failed' || !res.status) {
+        return {
+          isError: true,
+          errorMessage: res.message,
+          data: {
+            data: [],
+          },
+        }
+      }
+      // update the progress
+      setCrawlResult({
+        ...res,
+        total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
+      })
+      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
+      await sleep(2500)
+      return await waitForCrawlFinished(jobId)
+    }
+    catch (e: any) {
+      // The rejection is expected to be Response-like (it exposes `json()`),
+      // but it may also be a plain Error (e.g. a network failure) or a body
+      // that is not valid JSON. Reading the message must not throw again.
+      let errorMessage: string | undefined
+      try {
+        errorMessage = typeof e?.json === 'function'
+          ? (await e.json())?.message
+          : e?.message
+      }
+      catch {
+        errorMessage = undefined
+      }
+      return {
+        isError: true,
+        errorMessage, // the caller falls back to the "unknown error" text when empty
+        data: {
+          data: [],
+        },
+      }
+    }
+  }, [crawlOptions.limit, onCheckedCrawlResultChange])
+
+  // Validates the URL, starts a Jina Reader task and stores its result.
+  // The API either answers synchronously with `data` (single page) or with a
+  // `job_id` that has to be polled through `waitForCrawlFinished`.
+  const handleRun = useCallback(async (url: string) => {
+    const { isValid, errorMsg } = checkValid(url)
+    if (!isValid) {
+      Toast.notify({
+        message: errorMsg!,
+        type: 'error',
+      })
+      return
+    }
+    setStep(Step.running)
+    try {
+      const startTime = Date.now()
+      const res = await createJinaReaderTask({
+        url,
+        options: crawlOptions,
+      }) as any
+
+      if (res.data) {
+        // synchronous single-page result: wrap it in the crawl-result shape
+        const data = {
+          current: 1,
+          total: 1,
+          data: [{
+            title: res.data.title,
+            markdown: res.data.content,
+            description: res.data.description,
+            source_url: res.data.url,
+          }],
+          time_consuming: (Date.now() - startTime) / 1000,
+        }
+        setCrawlResult(data)
+        onCheckedCrawlResultChange(data.data || [])
+        setCrawlErrorMessage('')
+      }
+      else if (res.job_id) {
+        const jobId = res.job_id
+        onJobIdChange(jobId)
+        const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
+        if (isError) {
+          setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
+        }
+        else {
+          setCrawlResult(data)
+          onCheckedCrawlResultChange(data.data || []) // default select the crawl result
+          setCrawlErrorMessage('')
+        }
+      }
+    }
+    catch (e) {
+      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
+      console.error(e)
+    }
+    finally {
+      setStep(Step.finished)
+    }
+  }, [checkValid, crawlOptions, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])
+
+  return (
+
+
+
+ + + + + + {!isInit && ( +
+ {isRunning + && } + {showError && ( + + )} + {isCrawlFinished && !showError + && + } +
+ )} +
+
+ ) +} +export default React.memo(JinaReader) diff --git a/web/app/components/datasets/create/website/jina-reader/options.tsx b/web/app/components/datasets/create/website/jina-reader/options.tsx new file mode 100644 index 00000000000000..52cfaa8b3b40f3 --- /dev/null +++ b/web/app/components/datasets/create/website/jina-reader/options.tsx @@ -0,0 +1,59 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import CheckboxWithLabel from '../base/checkbox-with-label' +import Field from '../base/field' +import cn from '@/utils/classnames' +import type { CrawlOptions } from '@/models/datasets' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + className?: string + payload: CrawlOptions + onChange: (payload: CrawlOptions) => void +} + +const Options: FC = ({ + className = '', + payload, + onChange, +}) => { + const { t } = useTranslation() + + const handleChange = useCallback((key: keyof CrawlOptions) => { + return (value: any) => { + onChange({ + ...payload, + [key]: value, + }) + } + }, [payload, onChange]) + return ( +
+ + +
+ +
+
+ ) +} +export default React.memo(Options) diff --git a/web/app/components/datasets/create/website/no-data.tsx b/web/app/components/datasets/create/website/no-data.tsx index 13e5ee7dfbd508..8a508a48c6bb8e 100644 --- a/web/app/components/datasets/create/website/no-data.tsx +++ b/web/app/components/datasets/create/website/no-data.tsx @@ -2,35 +2,56 @@ import type { FC } from 'react' import React from 'react' import { useTranslation } from 'react-i18next' +import s from './index.module.css' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import Button from '@/app/components/base/button' +import { DataSourceProvider } from '@/models/common' const I18N_PREFIX = 'datasetCreation.stepOne.website' type Props = { onConfig: () => void + provider: DataSourceProvider } const NoData: FC = ({ onConfig, + provider, }) => { const { t } = useTranslation() + const providerConfig = { + [DataSourceProvider.jinaReader]: { + emoji: , + title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`), + description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`), + }, + [DataSourceProvider.fireCrawl]: { + emoji: '🔥', + title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), + description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), + }, + } + + const currentProvider = providerConfig[provider] + return ( -
-
- 🔥 -
-
- {t(`${I18N_PREFIX}.fireCrawlNotConfigured`)} -
- {t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)} + <> +
+
+ {currentProvider.emoji} +
+
+ {currentProvider.title} +
+ {currentProvider.description} +
+
- -
+ ) } export default React.memo(NoData) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx index d68fc79b0d5a78..a4a8b9b63722bd 100644 --- a/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx @@ -9,7 +9,7 @@ import { import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' import Button from '@/app/components/base/button' import type { FirecrawlConfig } from '@/models/common' -import Field from '@/app/components/datasets/create/website/firecrawl/base/field' +import Field from '@/app/components/datasets/create/website/base/field' import Toast from '@/app/components/base/toast' import { createDataSourceApiKeyBinding } from '@/service/datasets' import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx new file mode 100644 index 00000000000000..c6d6ad02565cb3 --- /dev/null +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx @@ -0,0 +1,140 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useState } from 'react' +import { useTranslation } from 'react-i18next' +import { + PortalToFollowElem, + PortalToFollowElemContent, +} from '@/app/components/base/portal-to-follow-elem' +import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' +import Button from '@/app/components/base/button' +import { DataSourceProvider } from '@/models/common' +import Field 
from '@/app/components/datasets/create/website/base/field'
+import Toast from '@/app/components/base/toast'
+import { createDataSourceApiKeyBinding } from '@/service/datasets'
+import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
+type Props = {
+  onCancel: () => void
+  onSaved: () => void
+}
+
+const I18N_PREFIX = 'datasetCreation.jinaReader'
+
+// Modal that collects a Jina Reader API key and stores it as a
+// `website` data-source binding. `onSaved` fires only after a successful save.
+const ConfigJinaReaderModal: FC<Props> = ({
+  onCancel,
+  onSaved,
+}) => {
+  const { t } = useTranslation()
+  const [isSaving, setIsSaving] = useState(false)
+  const [apiKey, setApiKey] = useState('')
+
+  // Validates the key, posts the binding and notifies the parent.
+  // A failed request propagates out of the handler, so `onSaved` is skipped.
+  const handleSave = useCallback(async () => {
+    if (isSaving)
+      return
+
+    // Treat a whitespace-only value as missing and never send padding
+    // around the key (typical when it was pasted).
+    const trimmedApiKey = apiKey.trim()
+    if (!trimmedApiKey) {
+      Toast.notify({
+        type: 'error',
+        message: t('common.errorMsg.fieldRequired', {
+          field: 'API Key',
+        }),
+      })
+      return
+    }
+    const postData = {
+      category: 'website',
+      provider: DataSourceProvider.jinaReader,
+      credentials: {
+        auth_type: 'bearer',
+        config: {
+          api_key: trimmedApiKey,
+        },
+      },
+    }
+    try {
+      setIsSaving(true)
+      await createDataSourceApiKeyBinding(postData)
+      Toast.notify({
+        type: 'success',
+        message: t('common.api.success'),
+      })
+    }
+    finally {
+      // always release the saving flag, even when the request failed
+      setIsSaving(false)
+    }
+
+    onSaved()
+  }, [apiKey, onSaved, t, isSaving])
+
+  return (
+
+
+
+
+
+
{t(`${I18N_PREFIX}.configJinaReader`)}
+
+ +
+ setApiKey(value as string)} + placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!} + /> +
+
+ + {t(`${I18N_PREFIX}.getApiKeyLinkText`)} + + +
+ + +
+ +
+
+
+
+ + {t('common.modelProvider.encrypted.front')} + + PKCS1_OAEP + + {t('common.modelProvider.encrypted.back')} +
+
+
+
+
+
+ ) +} +export default React.memo(ConfigJinaReaderModal) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx index 21f7660ef1dd16..628510c5dd3871 100644 --- a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx @@ -2,11 +2,12 @@ import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' import { useTranslation } from 'react-i18next' -import { useBoolean } from 'ahooks' import Panel from '../panel' import { DataSourceType } from '../panel/types' import ConfigFirecrawlModal from './config-firecrawl-modal' +import ConfigJinaReaderModal from './config-jina-reader-modal' import cn from '@/utils/classnames' +import s from '@/app/components/datasets/create/website/index.module.css' import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets' import type { @@ -19,9 +20,11 @@ import { } from '@/models/common' import Toast from '@/app/components/base/toast' -type Props = {} +type Props = { + provider: DataSourceProvider +} -const DataSourceWebsite: FC = () => { +const DataSourceWebsite: FC = ({ provider }) => { const { t } = useTranslation() const { isCurrentWorkspaceManager } = useAppContext() const [sources, setSources] = useState([]) @@ -36,22 +39,26 @@ const DataSourceWebsite: FC = () => { // eslint-disable-next-line react-hooks/exhaustive-deps }, []) - const [isShowConfig, { - setTrue: showConfig, - setFalse: hideConfig, - }] = useBoolean(false) + const [configTarget, setConfigTarget] = useState(null) + const showConfig = useCallback((provider: DataSourceProvider) => { + setConfigTarget(provider) + }, [setConfigTarget]) + + const hideConfig = useCallback(() => { + setConfigTarget(null) + }, [setConfigTarget]) const handleAdded = 
useCallback(() => { checkSetApiKey() hideConfig() }, [checkSetApiKey, hideConfig]) - const getIdByProvider = (provider: string): string | undefined => { + const getIdByProvider = (provider: DataSourceProvider): string | undefined => { const source = sources.find(item => item.provider === provider) return source?.id } - const handleRemove = useCallback((provider: string) => { + const handleRemove = useCallback((provider: DataSourceProvider) => { return async () => { const dataSourceId = getIdByProvider(provider) if (dataSourceId) { @@ -69,22 +76,34 @@ const DataSourceWebsite: FC = () => { <> 0} - onConfigure={showConfig} + provider={provider} + isConfigured={sources.find(item => item.provider === provider) !== undefined} + onConfigure={() => showConfig(provider)} readOnly={!isCurrentWorkspaceManager} - configuredList={sources.map(item => ({ + configuredList={sources.filter(item => item.provider === provider).map(item => ({ id: item.id, logo: ({ className }: { className: string }) => ( -
🔥
+ item.provider === DataSourceProvider.fireCrawl + ? ( +
🔥
+ ) + : ( +
+ +
+ ) ), - name: 'Firecrawl', + name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader', isActive: true, }))} - onRemove={handleRemove(DataSourceProvider.fireCrawl)} + onRemove={handleRemove(provider)} /> - {isShowConfig && ( + {configTarget === DataSourceProvider.fireCrawl && ( )} + {configTarget === DataSourceProvider.jinaReader && ( + + )} ) diff --git a/web/app/components/header/account-setting/data-source-page/index.tsx b/web/app/components/header/account-setting/data-source-page/index.tsx index ede83152b223e1..c3da977ca4e203 100644 --- a/web/app/components/header/account-setting/data-source-page/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/index.tsx @@ -3,6 +3,7 @@ import { useTranslation } from 'react-i18next' import DataSourceNotion from './data-source-notion' import DataSourceWebsite from './data-source-website' import { fetchDataSource } from '@/service/common' +import { DataSourceProvider } from '@/models/common' export default function DataSourcePage() { const { t } = useTranslation() @@ -13,7 +14,8 @@ export default function DataSourcePage() {
{t('common.dataSource.add')}
- + +
) } diff --git a/web/app/components/header/account-setting/data-source-page/panel/index.tsx b/web/app/components/header/account-setting/data-source-page/panel/index.tsx index 988aedcaf74767..4a810020b440ed 100644 --- a/web/app/components/header/account-setting/data-source-page/panel/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/panel/index.tsx @@ -8,10 +8,12 @@ import ConfigItem from './config-item' import s from './style.module.css' import { DataSourceType } from './types' +import { DataSourceProvider } from '@/models/common' import cn from '@/utils/classnames' type Props = { type: DataSourceType + provider: DataSourceProvider isConfigured: boolean onConfigure: () => void readOnly: boolean @@ -25,6 +27,7 @@ type Props = { const Panel: FC = ({ type, + provider, isConfigured, onConfigure, readOnly, @@ -46,7 +49,7 @@ const Panel: FC = ({
{t(`common.dataSource.${type}.title`)}
{isWebsite && (
- {t('common.dataSource.website.with')} 🔥 Firecrawl + {t('common.dataSource.website.with')} { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
)}
diff --git a/web/i18n/en-US/dataset-creation.ts b/web/i18n/en-US/dataset-creation.ts index 32f9d596ca6ac9..1849b12757f9da 100644 --- a/web/i18n/en-US/dataset-creation.ts +++ b/web/i18n/en-US/dataset-creation.ts @@ -16,6 +16,11 @@ const translation = { apiKeyPlaceholder: 'API key from firecrawl.dev', getApiKeyLinkText: 'Get your API key from firecrawl.dev', }, + jinaReader: { + configJinaReader: 'Configure Jina Reader', + apiKeyPlaceholder: 'API key from jina.ai', + getApiKeyLinkText: 'Get your free API key at jina.ai', + }, stepOne: { filePreview: 'File Preview', pagePreview: 'Page Preview', @@ -56,13 +61,21 @@ const translation = { failed: 'Creation failed', }, website: { + chooseProvider: 'Select a provider', fireCrawlNotConfigured: 'Firecrawl is not configured', fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.', + jinaReaderNotConfigured: 'Jina Reader is not configured', + jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.', configure: 'Configure', run: 'Run', firecrawlTitle: 'Extract web content with 🔥Firecrawl', firecrawlDoc: 'Firecrawl docs', firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', + jinaReaderTitle: 'Convert the entire site to Markdown', + jinaReaderDoc: 'Learn more about Jina Reader', + jinaReaderDocLink: 'https://jina.ai/reader', + useSitemap: 'Use sitemap', + useSitemapTooltip: 'Follow the sitemap to crawl the site. 
If not, Jina Reader will crawl iteratively based on page relevance, yielding fewer but higher-quality pages.', options: 'Options', crawlSubPage: 'Crawl sub-pages', limit: 'Limit', @@ -70,7 +83,7 @@ const translation = { excludePaths: 'Exclude paths', includeOnlyPaths: 'Include only paths', extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)', - exceptionErrorTitle: 'An exception occurred while running Firecrawl job:', + exceptionErrorTitle: 'An exception occurred while running crawling job:', unknownError: 'Unknown error', totalPageScraped: 'Total pages scraped:', selectAll: 'Select All', diff --git a/web/i18n/zh-Hans/dataset-creation.ts b/web/i18n/zh-Hans/dataset-creation.ts index 78f51707918773..4f6786a1919b03 100644 --- a/web/i18n/zh-Hans/dataset-creation.ts +++ b/web/i18n/zh-Hans/dataset-creation.ts @@ -16,6 +16,11 @@ const translation = { apiKeyPlaceholder: '从 firecrawl.dev 获取 API Key', getApiKeyLinkText: '从 firecrawl.dev 获取您的 API Key', }, + jinaReader: { + configJinaReader: '配置 Jina Reader', + apiKeyPlaceholder: '从 jina.ai 获取 API Key', + getApiKeyLinkText: '从 jina.ai 获取您的免费 API Key', + }, stepOne: { filePreview: '文件预览', pagePreview: '页面预览', @@ -56,13 +61,21 @@ const translation = { failed: '创建失败', }, website: { + chooseProvider: '选择工具', fireCrawlNotConfigured: 'Firecrawl 未配置', fireCrawlNotConfiguredDescription: '请配置 Firecrawl 的 API 密钥以使用它。', + jinaReaderNotConfigured: 'Jina Reader 未配置', + jinaReaderNotConfiguredDescription: '请配置 Jina Reader 的免费 API 密钥以访问它。', configure: '配置', run: '运行', firecrawlTitle: '使用 🔥Firecrawl 提取网页内容', firecrawlDoc: 'Firecrawl 文档', firecrawlDocLink: 'https://docs.dify.ai/v/zh-hans/guides/knowledge-base/sync-from-website', + jinaReaderTitle: '将整个站点内容转换为 Markdown 格式', + jinaReaderDoc: '了解更多关于 Jina Reader', + jinaReaderDocLink: 'https://jina.ai/reader', + useSitemap: '使用 sitemap', + useSitemapTooltip: '根据 sitemap 爬取站点。否则,Jina Reader 将基于页面相关性迭代爬取,抓取较少的页面,但质量更高。', options: '选项', crawlSubPage: '爬取子页面', limit: 
'限制数量', @@ -70,7 +83,7 @@ const translation = { excludePaths: '排除路径', includeOnlyPaths: '仅包含路径', extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)', - exceptionErrorTitle: '运行 Firecrawl 时发生异常:', + exceptionErrorTitle: '运行时发生异常:', unknownError: '未知错误', totalPageScraped: '抓取页面总数:', selectAll: '全选', diff --git a/web/models/common.ts b/web/models/common.ts index 78f09bee09649f..204e89ed9b6b7a 100644 --- a/web/models/common.ts +++ b/web/models/common.ts @@ -177,6 +177,7 @@ export enum DataSourceCategory { } export enum DataSourceProvider { fireCrawl = 'firecrawl', + jinaReader = 'jinareader', } export type FirecrawlConfig = { diff --git a/web/models/datasets.ts b/web/models/datasets.ts index 23d1fe6136b85e..9358f6fcb9fb44 100644 --- a/web/models/datasets.ts +++ b/web/models/datasets.ts @@ -49,6 +49,7 @@ export type CrawlOptions = { excludes: string limit: number | string max_depth: number | string + use_sitemap: boolean } export type CrawlResultItem = { diff --git a/web/service/datasets.ts b/web/service/datasets.ts index 4ca269a7d6e212..689ac7c4035f1c 100644 --- a/web/service/datasets.ts +++ b/web/service/datasets.ts @@ -23,7 +23,7 @@ import type { SegmentsResponse, createDocumentResponse, } from '@/models/datasets' -import type { CommonResponse, DataSourceNotionWorkspace } from '@/models/common' +import { type CommonResponse, type DataSourceNotionWorkspace, DataSourceProvider } from '@/models/common' import type { ApiKeysListResponse, CreateApiKeyResponse, @@ -253,7 +253,7 @@ export const createFirecrawlTask: Fetcher> = return post('website/crawl', { body: { ...body, - provider: 'firecrawl', + provider: DataSourceProvider.fireCrawl, }, }) } @@ -261,7 +261,26 @@ export const createFirecrawlTask: Fetcher> = export const checkFirecrawlTaskStatus: Fetcher = (jobId: string) => { return get(`website/crawl/status/${jobId}`, { params: { - provider: 'firecrawl', + provider: DataSourceProvider.fireCrawl, + }, + }, { + silent: true, + }) +} + +export const createJinaReaderTask: 
Fetcher> = (body) => { + return post('website/crawl', { + body: { + ...body, + provider: DataSourceProvider.jinaReader, + }, + }) +} + +export const checkJinaReaderTaskStatus: Fetcher = (jobId: string) => { + return get(`website/crawl/status/${jobId}`, { + params: { + provider: 'jinareader', }, }, { silent: true,