From 369e1e6f58814929c7a236a482189f2ff4605570 Mon Sep 17 00:00:00 2001 From: Zhaofeng Miao <522856232@qq.com> Date: Mon, 30 Sep 2024 09:57:19 +0800 Subject: [PATCH] feat(website-crawl): add jina reader as additional alternative for website crawling (#8761) --- api/controllers/console/datasets/website.py | 6 +- api/core/rag/extractor/extract_processor.py | 10 + .../rag/extractor/jina_reader_extractor.py | 35 +++ api/services/auth/api_key_auth_factory.py | 3 + api/services/auth/jina.py | 44 ++++ api/services/website_service.py | 100 ++++++++ .../datasets/create/assets/jina.png | Bin 0 -> 2773 bytes web/app/components/datasets/create/index.tsx | 12 +- .../datasets/create/step-one/index.tsx | 11 +- .../datasets/create/step-two/index.tsx | 11 +- .../base/checkbox-with-label.tsx | 11 + .../crawled-result-item.tsx | 0 .../{firecrawl => base}/crawled-result.tsx | 2 +- .../website/{firecrawl => base}/crawling.tsx | 0 .../{firecrawl => }/base/error-message.tsx | 0 .../website/{firecrawl => }/base/field.tsx | 0 .../website/{firecrawl => }/base/input.tsx | 0 .../{firecrawl => base}/mock-crawl-result.ts | 0 .../{firecrawl => }/base/options-wrap.tsx | 0 .../{firecrawl => }/base/url-input.tsx | 0 .../create/website/firecrawl/index.tsx | 10 +- .../create/website/firecrawl/options.tsx | 4 +- .../datasets/create/website/index.module.css | 6 + .../datasets/create/website/index.tsx | 100 ++++++-- .../create/website/jina-reader/header.tsx | 42 ++++ .../create/website/jina-reader/index.tsx | 232 ++++++++++++++++++ .../create/website/jina-reader/options.tsx | 59 +++++ .../datasets/create/website/no-data.tsx | 45 +++- .../config-firecrawl-modal.tsx | 2 +- .../config-jina-reader-modal.tsx | 140 +++++++++++ .../data-source-website/index.tsx | 51 ++-- .../data-source-page/index.tsx | 4 +- .../data-source-page/panel/index.tsx | 5 +- web/i18n/en-US/dataset-creation.ts | 15 +- web/i18n/zh-Hans/dataset-creation.ts | 15 +- web/models/common.ts | 1 + web/models/datasets.ts | 1 + web/service/datasets.ts | 25 +- 38 files changed, 927 insertions(+), 75 deletions(-) create mode 100644 api/core/rag/extractor/jina_reader_extractor.py create mode 100644 api/services/auth/jina.py create mode 100644 web/app/components/datasets/create/assets/jina.png rename web/app/components/datasets/create/website/{firecrawl => }/base/checkbox-with-label.tsx (72%) rename web/app/components/datasets/create/website/{firecrawl => base}/crawled-result-item.tsx (100%) rename web/app/components/datasets/create/website/{firecrawl => base}/crawled-result.tsx (97%) rename web/app/components/datasets/create/website/{firecrawl => base}/crawling.tsx (100%) rename web/app/components/datasets/create/website/{firecrawl => }/base/error-message.tsx (100%) rename web/app/components/datasets/create/website/{firecrawl => }/base/field.tsx (100%) rename web/app/components/datasets/create/website/{firecrawl => }/base/input.tsx (100%) rename web/app/components/datasets/create/website/{firecrawl => base}/mock-crawl-result.ts (100%) rename web/app/components/datasets/create/website/{firecrawl => }/base/options-wrap.tsx (100%) rename web/app/components/datasets/create/website/{firecrawl => }/base/url-input.tsx (100%) create mode 100644 web/app/components/datasets/create/website/index.module.css create mode 100644 web/app/components/datasets/create/website/jina-reader/header.tsx create mode 100644 web/app/components/datasets/create/website/jina-reader/index.tsx create mode 100644 web/app/components/datasets/create/website/jina-reader/options.tsx create mode 100644 web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx diff --git a/api/controllers/console/datasets/website.py b/api/controllers/console/datasets/website.py index cb54f1aacbc2ab..e80ce17c6866aa 100644 --- a/api/controllers/console/datasets/website.py +++ b/api/controllers/console/datasets/website.py @@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource): @account_initialization_required def post(self): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json") + parser.add_argument( + "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json" + ) parser.add_argument("url", type=str, required=True, nullable=True, location="json") parser.add_argument("options", type=dict, required=True, nullable=True, location="json") args = parser.parse_args() @@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource): @account_initialization_required def get(self, job_id: str): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args") + parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") args = parser.parse_args() # get crawl status try: diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index 0ffc89b214c2d9..9048138511b122 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -12,6 +12,7 @@ from core.rag.extractor.excel_extractor import ExcelExtractor from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor from core.rag.extractor.html_extractor import HtmlExtractor +from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor from core.rag.extractor.markdown_extractor import MarkdownExtractor from core.rag.extractor.notion_extractor import NotionExtractor from core.rag.extractor.pdf_extractor import PdfExtractor @@ -171,6 +172,15 @@ def extract( only_main_content=extract_setting.website_info.only_main_content, ) return extractor.extract() + elif extract_setting.website_info.provider == "jinareader": + extractor = JinaReaderWebExtractor( + url=extract_setting.website_info.url, + job_id=extract_setting.website_info.job_id, + tenant_id=extract_setting.website_info.tenant_id, + mode=extract_setting.website_info.mode, + only_main_content=extract_setting.website_info.only_main_content, + ) + return extractor.extract() else: raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}") else: diff --git a/api/core/rag/extractor/jina_reader_extractor.py b/api/core/rag/extractor/jina_reader_extractor.py new file mode 100644 index 00000000000000..5b780af126b309 --- /dev/null +++ b/api/core/rag/extractor/jina_reader_extractor.py @@ -0,0 +1,35 @@ +from core.rag.extractor.extractor_base import BaseExtractor +from core.rag.models.document import Document +from services.website_service import WebsiteService + + +class JinaReaderWebExtractor(BaseExtractor): + """ + Crawl and scrape websites and return content in clean llm-ready markdown. + """ + + def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False): + """Initialize with url, api_key, base_url and mode.""" + self._url = url + self.job_id = job_id + self.tenant_id = tenant_id + self.mode = mode + self.only_main_content = only_main_content + + def extract(self) -> list[Document]: + """Extract content from the URL.""" + documents = [] + if self.mode == "crawl": + crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id) + if crawl_data is None: + return [] + document = Document( + page_content=crawl_data.get("content", ""), + metadata={ + "source_url": crawl_data.get("url"), + "description": crawl_data.get("description"), + "title": crawl_data.get("title"), + }, + ) + documents.append(document) + return documents diff --git a/api/services/auth/api_key_auth_factory.py b/api/services/auth/api_key_auth_factory.py index ae5b953b47f589..36387e9c2efdb2 100644 --- a/api/services/auth/api_key_auth_factory.py +++ b/api/services/auth/api_key_auth_factory.py @@ -1,10 +1,13 @@ from services.auth.firecrawl import FirecrawlAuth +from services.auth.jina import JinaAuth class ApiKeyAuthFactory: def __init__(self, provider: str, credentials: dict): if provider == "firecrawl": self.auth = FirecrawlAuth(credentials) + elif provider == "jinareader": + self.auth = JinaAuth(credentials) else: raise ValueError("Invalid provider") diff --git a/api/services/auth/jina.py b/api/services/auth/jina.py new file mode 100644 index 00000000000000..de898a1f94b763 --- /dev/null +++ b/api/services/auth/jina.py @@ -0,0 +1,44 @@ +import json + +import requests + +from services.auth.api_key_auth_base import ApiKeyAuthBase + + +class JinaAuth(ApiKeyAuthBase): + def __init__(self, credentials: dict): + super().__init__(credentials) + auth_type = credentials.get("auth_type") + if auth_type != "bearer": + raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer") + self.api_key = credentials.get("config").get("api_key", None) + + if not self.api_key: + raise ValueError("No API key provided") + + def validate_credentials(self): + headers = self._prepare_headers() + options = { + "url": "https://example.com", + } + response = self._post_request("https://r.jina.ai", options, headers) + if response.status_code == 200: + return True + else: + self._handle_error(response) + + def _prepare_headers(self): + return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} + + def _post_request(self, url, data, headers): + return requests.post(url, headers=headers, json=data) + + def _handle_error(self, response): + if response.status_code in {402, 409, 500}: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + else: + if response.text: + error_message = json.loads(response.text).get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}") diff --git a/api/services/website_service.py b/api/services/website_service.py index fea605cf30b410..be01815720ab57 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -1,6 +1,7 @@ import datetime import json +import requests from flask_login import current_user from core.helper import encrypter @@ -65,6 +66,35 @@ def crawl_url(cls, args: dict) -> dict: time = str(datetime.datetime.now().timestamp()) redis_client.setex(website_crawl_time_cache_key, 3600, time) return {"status": "active", "job_id": job_id} + elif provider == "jinareader": + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + crawl_sub_pages = options.get("crawl_sub_pages", False) + if not crawl_sub_pages: + response = requests.get( + f"https://r.jina.ai/{url}", + headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"}, + ) + if response.json().get("code") != 200: + raise ValueError("Failed to crawl") + return {"status": "active", "data": response.json().get("data")} + else: + response = requests.post( + "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app", + json={ + "url": url, + "maxPages": options.get("limit", 1), + "useSitemap": options.get("use_sitemap", True), + }, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + }, + ) + if response.json().get("code") != 200: + raise ValueError("Failed to crawl") + return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")} else: raise ValueError("Invalid provider") @@ -93,6 +123,42 @@ def get_crawl_status(cls, job_id: str, provider: str) -> dict: time_consuming = abs(end_time - float(start_time)) crawl_status_data["time_consuming"] = f"{time_consuming:.2f}" redis_client.delete(website_crawl_time_cache_key) + elif provider == "jinareader": + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id}, + ) + data = response.json().get("data", {}) + crawl_status_data = { + "status": data.get("status", "active"), + "job_id": job_id, + "total": len(data.get("urls", [])), + "current": len(data.get("processed", [])) + len(data.get("failed", [])), + "data": [], + "time_consuming": data.get("duration", 0) / 1000, + } + + if crawl_status_data["status"] == "completed": + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())}, + ) + data = response.json().get("data", {}) + formatted_data = [ + { + "title": item.get("data", {}).get("title"), + "source_url": item.get("data", {}).get("url"), + "description": item.get("data", {}).get("description"), + "markdown": item.get("data", {}).get("content"), + } + for item in data.get("processed", {}).values() + ] + crawl_status_data["data"] = formatted_data else: raise ValueError("Invalid provider") return crawl_status_data @@ -119,6 +185,40 @@ def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str if item.get("source_url") == url: return item return None + elif provider == "jinareader": + file_key = "website_files/" + job_id + ".txt" + if storage.exists(file_key): + data = storage.load_once(file_key) + if data: + data = json.loads(data.decode("utf-8")) + elif not job_id: + response = requests.get( + f"https://r.jina.ai/{url}", + headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"}, + ) + if response.json().get("code") != 200: + raise ValueError("Failed to crawl") + return response.json().get("data") + else: + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id}, + ) + data = response.json().get("data", {}) + if data.get("status") != "completed": + raise ValueError("Crawl job is not completed") + + response = requests.post( + "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", + headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, + json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())}, + ) + data = response.json().get("data", {}) + for item in data.get("processed", {}).values(): + if item.get("data", {}).get("url") == url: + return item.get("data", {}) else: raise ValueError("Invalid provider") diff --git a/web/app/components/datasets/create/assets/jina.png b/web/app/components/datasets/create/assets/jina.png new file mode 100644 index 0000000000000000000000000000000000000000..b4beeafdfb127115d0e29ad3728654310dfc389a GIT binary patch literal 2773 zcmXX|c{~&T|9|gbi^k^I9K(h*K91$em4*q0mJXD(ijqvONNHws7CMxpD5>O#s8vLc zkK7fNPq~eVTyth>_^!wA_s9GBe!d>B$LsNWzmCWI@l2w*I-n$!Bme*?Cr7HA=w|&r zNO2K1SRZc(KqA|TYIER1_DqhGpPY*v%ToIr$tLKk(#mCbw&WEFw$k>cN2*>@s{MN} z((1=F^LULM%RF`!0)wI2i-`p{>Q#)en|37aN^x!a9obl&e`@`wbHrX3_w&DZkkmJJ zzOQETi`}1{jr8?QZ!GlO_bMh_tiTfvZ0P&o1f;GK6bZc=0@!tVn%4_zWvnM`D;e7p&*&<^ z80hL67nub1o0XKpS+?UVGkqESzSq|>Zr`3?SO{4kZa`=#O)O0J7%@j%FxDZXj`HXU zesDfl7)w0;F}t<6xY)dHE0-sQ=lsjYC&)1`-38Yg8t- z*UYA8kiBedVibyqSik81DA)wQg(R&Nf zAQXFh`w7w5D1dYY+`fJL-vT%Vp1?X(au@^lA##g~uBfvD6VPBbHUu z0~j%0^EEa2|GVN@cq+@EgNt#C1GLSQyQU05$Nimg!JAmfmsU_pt*kT1W%RugetULI z4iaV=3xc-Zg3;hS0$1UYACjb_Ga_mVcGqJ~GTWSjfuI?n_vKj!451Bh&kfxJiH1PM zB8+EZVsZ;M5srlLTqjIu!Elnkem)XCW=CvF| z#V_iW>UZW_K>M_YSXR?C_9<;mXhFZnLt zzc*jQ`Sv{sJf=hP=FLk`oSc#v{bdla{jS0$ihoC(SVrSj+~PxrYYTr?_ znhp(YNn?d`MgvWd`}G}d(#+pWXRI&G(Uo5X%zijdMDS(@s+c*?RDrP;o@*665?i8c zVW|9F9WCIM#a$8=7%n~?;YcTn0n5hQ|F}=5=E?))a`)8E%fB^m?U{M21AmBQ@@M*9 zH`=BiLwdA8%$ULprYaOhN5YY(WeFp_7fazbU{x@$p|(BHOD1tyh2=5Q^P16@j|0Nt zmuJ68#Ep_w9uad}G<+7kQ|fJ{fVaxHeAE_@JAde@$MJt2H?WX+xl5wbxLc7PNJyQg zB7Etq3}g=R#5L#6yu5zPw)y3$ZzI|F``#tS$${jHLHFusUg*HUF>{{da^LH7DV6@H zhLN5Zr;iSCKiDEc@9*J;gdS(;DgvD0SO#z$1wtwmuWS^z3u_BH2NBY$@3n+~U=*pf z{G}+*&KUksEp8E$brB&+1jyAq`>SjOKwVs(Jg5P=j^aix{zw)-L4<*2PH1u&TianY z8%Dp#-)kb{RVnpP>Unzz))1frJxGgvF_&SiNA=TbPm`0_Z}g77;eI65*@}bIjj3~tqNCMnH<;5d{`GR^CYO%77naC*kY0gVx!LidR64&DUv#C^Utp(M^wh!JsrXI z*D*LOR$Lh}mvETtM>IU^NlX(X#-TKit3gj}%5;C`T~f4Cg`ikl%Qqb{p}V`~KPc&< zgE@10ugc0Uo!%`o456B7KxWCUr1%iS{`L%mnYsJ^W>_d7-z+)&aEwd~1 zd~xaloS@HG4;ZYjY#UV50}>Jvwy3)4o)-sKn0)iPx)_+9r^EPv#qO!?`S0m~JkLSB zdaedVP-+f6&ddAP%+42@TN)X9iUbNQrz75nd`YkXgd6M?nx{>gSlW}5&8g-}*A*qe zq;>SV&?}aC4zx5`hW>EA$<~sL??fq^c7p^6%fpsGP!%y=G=2#IQbd2u8_g{)*svdu zoYL@@Fh-jJcS^K|_#rPYV8tsLol3^UFFZFwx++;J-c*FMtcLf0gQjCa=Y=3S$w4RI zI2Sp5o|KyPboU=U4xk)4bnxK(zK6U1D|~?kK5?dj{h2;KvSKKSstjiY;a;bFeoyfI z+76Ctr5Hv3sh6i`;Rm|6_eN)Zeju{YB~0DSLfYOC8dehld^n7unt^T~bc)1X_sPk% z^i!ZSoJN9Odnq;%6%AT4bn7+kJ9mXn2}M2cdpr1!F6%>$xA649Hdw-j*J#4OBZRNf_RbVD%Tf=2-Y{hBHlF$^Znck$3i# zpLF=zsFw_YDanc{G;4)ez;T7ViaSs5ZvtIfI*(XU>%t!Ip7vMODTlxuq^#{nRRo`F z!}?eo_VecV=g*|T$b3bRHFLZ*ZEIu9VLYFs6Q5-Sd|*;47G2ib_yakBTs9rm{er|P zlw@5vk=@K^y9Ls;QT1I43;)3g%9wP$;BP_tldx9kP|A?>*!Li-WhJ*Pbuij%FHjj9 z8;criyB&UDw>$ZTHy|j>V9CFxd(wR54(@L%6w9M8z>Csyzpup}NEZVk+Lo36hyaRc z`BL7BKX_jDxE$~}>ci-T+~mRl(#~a0S)xzT4Fj)=I>crLy5%SSg*GoJ0mpU54bVyK zn~cRDANKTCWwcu~H$ezHjfKj#YD=9s18+@5TuNd*l}tYKZ@PX4%h#JN_L)yi%UvoU_9rJNE+;Y9z%5Uo&)qWJHw?!(OZ(tVNZz@hwA1)=f@n&)$a0> zjD$jo;`eqX^nzBHucxR}?{NBFq_5P`Z9Ax`9Xo(45p8(BKuYSQrh-ga23#r6m7LxsB?(gpejM#wMErB$tX5wramF>-FfvPf(QZf@E0 zE?>qmWPh4jS996u$IZa)QheGC-iBRkXn*C@Xn~iOTk^IWb#Jn+C3JjUwj%o1Z;*Z| zkm(jr+A^rT(_yh)Q9nZ@yEIzvI#22WS6YSM6gE~h9b%d!2fB2jA|XeDqTyH+1Ri5= z{#W?PIHEn4uoMv}iI}mph#6z3B9Q%S$~&VyxPQI71Lo9>)ES#@8FWvI { @@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { const updateFileList = (preparedFiles: FileItem[]) => { setFiles(preparedFiles) } - const [fireCrawlJobId, setFireCrawlJobId] = useState('') + const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState(DataSourceProvider.fireCrawl) + const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('') const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => { const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID) @@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { onStepChange={nextStep} websitePages={websitePages} updateWebsitePages={setWebsitePages} - onFireCrawlJobIdChange={setFireCrawlJobId} + onWebsiteCrawlProviderChange={setWebsiteCrawlProvider} + onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId} crawlOptions={crawlOptions} onCrawlOptionsChange={setCrawlOptions} /> @@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { files={fileList.map(file => file.file)} notionPages={notionPages} websitePages={websitePages} - fireCrawlJobId={fireCrawlJobId} + websiteCrawlProvider={websiteCrawlProvider} + websiteCrawlJobId={websiteCrawlJobId} onStepChange={changeStep} updateIndexingTypeCache={updateIndexingTypeCache} updateResultCache={updateResultCache} diff --git a/web/app/components/datasets/create/step-one/index.tsx b/web/app/components/datasets/create/step-one/index.tsx index c2d77f4cecdcc8..643932e9ae21d5 100644 --- a/web/app/components/datasets/create/step-one/index.tsx +++ b/web/app/components/datasets/create/step-one/index.tsx @@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview' import s from './index.module.css' import cn from '@/utils/classnames' import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets' -import type { NotionPage } from '@/models/common' +import type { DataSourceProvider, NotionPage } from '@/models/common' import { DataSourceType } from '@/models/datasets' import Button from '@/app/components/base/button' import { NotionPageSelector } from '@/app/components/base/notion-page-selector' @@ -33,7 +33,8 @@ type IStepOneProps = { changeType: (type: DataSourceType) => void websitePages?: CrawlResultItem[] updateWebsitePages: (value: CrawlResultItem[]) => void - onFireCrawlJobIdChange: (jobId: string) => void + onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void + onWebsiteCrawlJobIdChange: (jobId: string) => void crawlOptions: CrawlOptions onCrawlOptionsChange: (payload: CrawlOptions) => void } @@ -69,7 +70,8 @@ const StepOne = ({ updateNotionPages, websitePages = [], updateWebsitePages, - onFireCrawlJobIdChange, + onWebsiteCrawlProviderChange, + onWebsiteCrawlJobIdChange, crawlOptions, onCrawlOptionsChange, }: IStepOneProps) => { @@ -229,7 +231,8 @@ const StepOne = ({ onPreview={setCurrentWebsite} checkedCrawlResult={websitePages} onCheckedCrawlResultChange={updateWebsitePages} - onJobIdChange={onFireCrawlJobIdChange} + onCrawlProviderChange={onWebsiteCrawlProviderChange} + onJobIdChange={onWebsiteCrawlJobIdChange} crawlOptions={crawlOptions} onCrawlOptionsChange={onCrawlOptionsChange} /> diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index f4fc58ee2a2d87..ee01652de8dd2d 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen import Toast from '@/app/components/base/toast' import { formatNumber } from '@/utils/format' import type { NotionPage } from '@/models/common' +import { DataSourceProvider } from '@/models/common' import { DataSourceType, DocForm } from '@/models/datasets' import NotionIcon from '@/app/components/base/notion-icon' import Switch from '@/app/components/base/switch' @@ -63,7 +64,8 @@ type StepTwoProps = { notionPages?: NotionPage[] websitePages?: CrawlResultItem[] crawlOptions?: CrawlOptions - fireCrawlJobId?: string + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string onStepChange?: (delta: number) => void updateIndexingTypeCache?: (type: string) => void updateResultCache?: (res: createDocumentResponse) => void @@ -94,7 +96,8 @@ const StepTwo = ({ notionPages = [], websitePages = [], crawlOptions, - fireCrawlJobId = '', + websiteCrawlProvider = DataSourceProvider.fireCrawl, + websiteCrawlJobId = '', onStepChange, updateIndexingTypeCache, updateResultCache, @@ -260,8 +263,8 @@ const StepTwo = ({ const getWebsiteInfo = () => { return { - provider: 'firecrawl', - job_id: fireCrawlJobId, + provider: websiteCrawlProvider, + job_id: websiteCrawlJobId, urls: websitePages.map(page => page.source_url), only_main_content: crawlOptions?.only_main_content, } diff --git a/web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx b/web/app/components/datasets/create/website/base/checkbox-with-label.tsx similarity index 72% rename from web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx rename to web/app/components/datasets/create/website/base/checkbox-with-label.tsx index 5c574ebe3e6195..25d40fe0763dab 100644 --- a/web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx +++ b/web/app/components/datasets/create/website/base/checkbox-with-label.tsx @@ -3,6 +3,7 @@ import type { FC } from 'react' import React from 'react' import cn from '@/utils/classnames' import Checkbox from '@/app/components/base/checkbox' +import Tooltip from '@/app/components/base/tooltip' type Props = { className?: string @@ -10,6 +11,7 @@ type Props = { onChange: (isChecked: boolean) => void label: string labelClassName?: string + tooltip?: string } const CheckboxWithLabel: FC = ({ @@ -18,11 +20,20 @@ const CheckboxWithLabel: FC = ({ onChange, label, labelClassName, + tooltip, }) => { return ( ) } diff --git a/web/app/components/datasets/create/website/firecrawl/crawled-result-item.tsx b/web/app/components/datasets/create/website/base/crawled-result-item.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/crawled-result-item.tsx rename to web/app/components/datasets/create/website/base/crawled-result-item.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx b/web/app/components/datasets/create/website/base/crawled-result.tsx similarity index 97% rename from web/app/components/datasets/create/website/firecrawl/crawled-result.tsx rename to web/app/components/datasets/create/website/base/crawled-result.tsx index 2bd51e4d731a95..d5c8d1b80a5a19 100644 --- a/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx +++ b/web/app/components/datasets/create/website/base/crawled-result.tsx @@ -2,7 +2,7 @@ import type { FC } from 'react' import React, { useCallback } from 'react' import { useTranslation } from 'react-i18next' -import CheckboxWithLabel from './base/checkbox-with-label' +import CheckboxWithLabel from './checkbox-with-label' import CrawledResultItem from './crawled-result-item' import cn from '@/utils/classnames' import type { CrawlResultItem } from '@/models/datasets' diff --git a/web/app/components/datasets/create/website/firecrawl/crawling.tsx b/web/app/components/datasets/create/website/base/crawling.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/crawling.tsx rename to web/app/components/datasets/create/website/base/crawling.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/error-message.tsx b/web/app/components/datasets/create/website/base/error-message.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/error-message.tsx rename to web/app/components/datasets/create/website/base/error-message.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/field.tsx b/web/app/components/datasets/create/website/base/field.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/field.tsx rename to web/app/components/datasets/create/website/base/field.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/input.tsx b/web/app/components/datasets/create/website/base/input.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/input.tsx rename to web/app/components/datasets/create/website/base/input.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/mock-crawl-result.ts b/web/app/components/datasets/create/website/base/mock-crawl-result.ts similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/mock-crawl-result.ts rename to web/app/components/datasets/create/website/base/mock-crawl-result.ts diff --git a/web/app/components/datasets/create/website/firecrawl/base/options-wrap.tsx b/web/app/components/datasets/create/website/base/options-wrap.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/options-wrap.tsx rename to web/app/components/datasets/create/website/base/options-wrap.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/base/url-input.tsx b/web/app/components/datasets/create/website/base/url-input.tsx similarity index 100% rename from web/app/components/datasets/create/website/firecrawl/base/url-input.tsx rename to web/app/components/datasets/create/website/base/url-input.tsx diff --git a/web/app/components/datasets/create/website/firecrawl/index.tsx b/web/app/components/datasets/create/website/firecrawl/index.tsx index de4f8bb1293447..aa4dffc174315f 100644 --- a/web/app/components/datasets/create/website/firecrawl/index.tsx +++ b/web/app/components/datasets/create/website/firecrawl/index.tsx @@ -2,13 +2,13 @@ import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' import { useTranslation } from 'react-i18next' +import UrlInput from '../base/url-input' +import OptionsWrap from '../base/options-wrap' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' import Header from './header' -import UrlInput from './base/url-input' -import OptionsWrap from './base/options-wrap' import Options from './options' -import CrawledResult from './crawled-result' -import Crawling from './crawling' -import ErrorMessage from './base/error-message' import cn from '@/utils/classnames' import { useModalContext } from '@/context/modal-context' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' diff --git a/web/app/components/datasets/create/website/firecrawl/options.tsx b/web/app/components/datasets/create/website/firecrawl/options.tsx index 20cc4f073fe43b..8cc2c6757c9615 100644 --- a/web/app/components/datasets/create/website/firecrawl/options.tsx +++ b/web/app/components/datasets/create/website/firecrawl/options.tsx @@ -2,8 +2,8 @@ import type { FC } from 'react' import React, { useCallback } from 'react' import { useTranslation } from 'react-i18next' -import CheckboxWithLabel from './base/checkbox-with-label' -import Field from './base/field' +import CheckboxWithLabel from '../base/checkbox-with-label' +import Field from '../base/field' import cn from '@/utils/classnames' import type { CrawlOptions } from '@/models/datasets' diff --git a/web/app/components/datasets/create/website/index.module.css b/web/app/components/datasets/create/website/index.module.css new file mode 100644 index 00000000000000..abaab4bea4b7a1 --- /dev/null +++ b/web/app/components/datasets/create/website/index.module.css @@ -0,0 +1,6 @@ +.jinaLogo { + @apply w-4 h-4 bg-center bg-no-repeat inline-block; + background-color: #F5FAFF; + background-image: url(../assets/jina.png); + background-size: 16px; +} diff --git a/web/app/components/datasets/create/website/index.tsx b/web/app/components/datasets/create/website/index.tsx index e06fbb4a1210b6..58b7f5f2fd77bd 100644 --- a/web/app/components/datasets/create/website/index.tsx +++ b/web/app/components/datasets/create/website/index.tsx @@ -1,8 +1,12 @@ 'use client' import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import s from './index.module.css' import NoData from './no-data' import Firecrawl from './firecrawl' +import JinaReader from './jina-reader' +import cn from '@/utils/classnames' import { useModalContext } from '@/context/modal-context' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' import { fetchDataSources } from '@/service/datasets' @@ -12,6 +16,7 @@ type Props = { onPreview: (payload: CrawlResultItem) => void checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onCrawlProviderChange: (provider: DataSourceProvider) => void onJobIdChange: (jobId: string) => void crawlOptions: CrawlOptions onCrawlOptionsChange: (payload: CrawlOptions) => void @@ -21,17 +26,32 @@ const Website: FC = ({ onPreview, checkedCrawlResult, onCheckedCrawlResultChange, + onCrawlProviderChange, onJobIdChange, crawlOptions, onCrawlOptionsChange, }) => { + const { t } = useTranslation() const { setShowAccountSettingModal } = useModalContext() const [isLoaded, setIsLoaded] = useState(false) - const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false) + const [selectedProvider, setSelectedProvider] = useState(DataSourceProvider.jinaReader) + const [sources, setSources] = useState([]) + + useEffect(() => { + onCrawlProviderChange(selectedProvider) + }, [selectedProvider, onCrawlProviderChange]) + const checkSetApiKey = useCallback(async () => { const res = await fetchDataSources() as any - const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl) - setIsSetFirecrawlApiKey(isFirecrawlSet) + setSources(res.sources) + + // If users have configured one of the providers, select it. + const availableProviders = res.sources.filter((item: DataSourceItem) => + [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider), + ) + + if (availableProviders.length > 0) + setSelectedProvider(availableProviders[0].provider) }, []) useEffect(() => { @@ -52,20 +72,66 @@ const Website: FC = ({ return (
- {isSetFirecrawlApiKey - ? ( - - ) - : ( - - )} +
+
+ {t('datasetCreation.stepOne.website.chooseProvider')} +
+
+ + +
+
+ + { + selectedProvider === DataSourceProvider.fireCrawl + ? sources.find(source => source.provider === DataSourceProvider.fireCrawl) + ? ( + + ) + : ( + + ) + : sources.find(source => source.provider === DataSourceProvider.jinaReader) + ? ( + + ) + : ( + + ) + }
) } diff --git a/web/app/components/datasets/create/website/jina-reader/header.tsx b/web/app/components/datasets/create/website/jina-reader/header.tsx new file mode 100644 index 00000000000000..85014a30ee2b12 --- /dev/null +++ b/web/app/components/datasets/create/website/jina-reader/header.tsx @@ -0,0 +1,42 @@ +'use client' +import type { FC } from 'react' +import React from 'react' +import { useTranslation } from 'react-i18next' +import { Settings01 } from '@/app/components/base/icons/src/vender/line/general' +import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onSetting: () => void +} + +const Header: FC = ({ + onSetting, +}) => { + const { t } = useTranslation() + + return ( +
+
+
{t(`${I18N_PREFIX}.jinaReaderTitle`)}
+
+
+ +
+
+ + + {t(`${I18N_PREFIX}.jinaReaderDoc`)} + +
+ ) +} +export default React.memo(Header) diff --git a/web/app/components/datasets/create/website/jina-reader/index.tsx b/web/app/components/datasets/create/website/jina-reader/index.tsx new file mode 100644 index 00000000000000..51d77d712140b7 --- /dev/null +++ b/web/app/components/datasets/create/website/jina-reader/index.tsx @@ -0,0 +1,232 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import UrlInput from '../base/url-input' +import OptionsWrap from '../base/options-wrap' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' +import Header from './header' +import Options from './options' +import cn from '@/utils/classnames' +import { useModalContext } from '@/context/modal-context' +import Toast from '@/app/components/base/toast' +import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets' +import { sleep } from '@/utils' +import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' + +const ERROR_I18N_PREFIX = 'common.errorMsg' +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onPreview: (payload: CrawlResultItem) => void + checkedCrawlResult: CrawlResultItem[] + onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const JinaReader: FC = ({ + onPreview, + checkedCrawlResult, + onCheckedCrawlResultChange, + onJobIdChange, + crawlOptions, + onCrawlOptionsChange, +}) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + useEffect(() => { + if (step !== Step.init) + setControlFoldOptions(Date.now()) + }, [step]) + const { setShowAccountSettingModal } = useModalContext() + const handleSetting = useCallback(() => { + setShowAccountSettingModal({ + payload: 'data-source', + }) + }, [setShowAccountSettingModal]) + + const checkValid = useCallback((url: string) => { + let errorMsg = '' + if (!url) { + errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: 'url', + }) + } + + if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://')))) + errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`) + + if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) { + errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: t(`${I18N_PREFIX}.limit`), + }) + } + + return { + isValid: !errorMsg, + errorMsg, + } + }, [crawlOptions, t]) + + const isInit = step === Step.init + const isCrawlFinished = step === Step.finished + const isRunning = step === Step.running + const [crawlResult, setCrawlResult] = useState<{ + current: number + total: number + data: CrawlResultItem[] + time_consuming: number | string + } | undefined>(undefined) + const [crawlErrorMessage, setCrawlErrorMessage] = useState('') + const showError = isCrawlFinished && crawlErrorMessage + + const waitForCrawlFinished = useCallback(async (jobId: string) => { + try { + const res = await checkJinaReaderTaskStatus(jobId) as any + console.log('res', res) + if (res.status === 'completed') { + return { + isError: false, + data: { + ...res, + total: Math.min(res.total, parseFloat(crawlOptions.limit as string)), + }, + } + } + if (res.status === 'failed' || !res.status) { + return { + isError: true, + errorMessage: res.message, + data: { + data: [], + }, + } + } + // update the progress + setCrawlResult({ + ...res, + total: Math.min(res.total, parseFloat(crawlOptions.limit as string)), + }) + onCheckedCrawlResultChange(res.data || []) // default select the crawl result + await sleep(2500) + return await waitForCrawlFinished(jobId) + } + catch (e: any) { + const errorBody = await e.json() + return { + isError: true, + errorMessage: errorBody.message, + data: { + data: [], + }, + } + } + }, [crawlOptions.limit]) + + const handleRun = useCallback(async (url: string) => { + const { isValid, errorMsg } = checkValid(url) + if (!isValid) { + Toast.notify({ + message: errorMsg!, + type: 'error', + }) + return + } + setStep(Step.running) + try { + const startTime = Date.now() + const res = await createJinaReaderTask({ + url, + options: crawlOptions, + }) as any + + if (res.data) { + const data = { + current: 1, + total: 1, + data: [{ + title: res.data.title, + markdown: res.data.content, + description: res.data.description, + source_url: res.data.url, + }], + time_consuming: (Date.now() - startTime) / 1000, + } + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) + setCrawlErrorMessage('') + } + else if (res.job_id) { + const jobId = res.job_id + onJobIdChange(jobId) + const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) + if (isError) { + setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) + } + else { + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) // default select the crawl result + setCrawlErrorMessage('') + } + } + } + catch (e) { + setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) + console.log(e) + } + finally { + setStep(Step.finished) + } + }, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished]) + + return ( +
+
+
+ + + + + + {!isInit && ( +
+ {isRunning + && } + {showError && ( + + )} + {isCrawlFinished && !showError + && + } +
+ )} +
+
+ ) +} +export default React.memo(JinaReader) diff --git a/web/app/components/datasets/create/website/jina-reader/options.tsx b/web/app/components/datasets/create/website/jina-reader/options.tsx new file mode 100644 index 00000000000000..52cfaa8b3b40f3 --- /dev/null +++ b/web/app/components/datasets/create/website/jina-reader/options.tsx @@ -0,0 +1,59 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import CheckboxWithLabel from '../base/checkbox-with-label' +import Field from '../base/field' +import cn from '@/utils/classnames' +import type { CrawlOptions } from '@/models/datasets' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + className?: string + payload: CrawlOptions + onChange: (payload: CrawlOptions) => void +} + +const Options: FC = ({ + className = '', + payload, + onChange, +}) => { + const { t } = useTranslation() + + const handleChange = useCallback((key: keyof CrawlOptions) => { + return (value: any) => { + onChange({ + ...payload, + [key]: value, + }) + } + }, [payload, onChange]) + return ( +
+ + +
+ +
+
+ ) +} +export default React.memo(Options) diff --git a/web/app/components/datasets/create/website/no-data.tsx b/web/app/components/datasets/create/website/no-data.tsx index 13e5ee7dfbd508..8a508a48c6bb8e 100644 --- a/web/app/components/datasets/create/website/no-data.tsx +++ b/web/app/components/datasets/create/website/no-data.tsx @@ -2,35 +2,56 @@ import type { FC } from 'react' import React from 'react' import { useTranslation } from 'react-i18next' +import s from './index.module.css' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import Button from '@/app/components/base/button' +import { DataSourceProvider } from '@/models/common' const I18N_PREFIX = 'datasetCreation.stepOne.website' type Props = { onConfig: () => void + provider: DataSourceProvider } const NoData: FC = ({ onConfig, + provider, }) => { const { t } = useTranslation() + const providerConfig = { + [DataSourceProvider.jinaReader]: { + emoji: , + title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`), + description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`), + }, + [DataSourceProvider.fireCrawl]: { + emoji: '🔥', + title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), + description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), + }, + } + + const currentProvider = providerConfig[provider] + return ( -
-
- 🔥 -
-
- {t(`${I18N_PREFIX}.fireCrawlNotConfigured`)} -
- {t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)} + <> +
+
+ {currentProvider.emoji} +
+
+ {currentProvider.title} +
+ {currentProvider.description} +
+
- -
+ ) } export default React.memo(NoData) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx index d68fc79b0d5a78..a4a8b9b63722bd 100644 --- a/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx @@ -9,7 +9,7 @@ import { import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' import Button from '@/app/components/base/button' import type { FirecrawlConfig } from '@/models/common' -import Field from '@/app/components/datasets/create/website/firecrawl/base/field' +import Field from '@/app/components/datasets/create/website/base/field' import Toast from '@/app/components/base/toast' import { createDataSourceApiKeyBinding } from '@/service/datasets' import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx new file mode 100644 index 00000000000000..c6d6ad02565cb3 --- /dev/null +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx @@ -0,0 +1,140 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useState } from 'react' +import { useTranslation } from 'react-i18next' +import { + PortalToFollowElem, + PortalToFollowElemContent, +} from '@/app/components/base/portal-to-follow-elem' +import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' +import Button from '@/app/components/base/button' +import { DataSourceProvider } from '@/models/common' +import Field from '@/app/components/datasets/create/website/base/field' +import Toast from '@/app/components/base/toast' +import { createDataSourceApiKeyBinding } from '@/service/datasets' +import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' +type Props = { + onCancel: () => void + onSaved: () => void +} + +const I18N_PREFIX = 'datasetCreation.jinaReader' + +const ConfigJinaReaderModal: FC = ({ + onCancel, + onSaved, +}) => { + const { t } = useTranslation() + const [isSaving, setIsSaving] = useState(false) + const [apiKey, setApiKey] = useState('') + + const handleSave = useCallback(async () => { + if (isSaving) + return + let errorMsg = '' + if (!errorMsg) { + if (!apiKey) { + errorMsg = t('common.errorMsg.fieldRequired', { + field: 'API Key', + }) + } + } + + if (errorMsg) { + Toast.notify({ + type: 'error', + message: errorMsg, + }) + return + } + const postData = { + category: 'website', + provider: DataSourceProvider.jinaReader, + credentials: { + auth_type: 'bearer', + config: { + api_key: apiKey, + }, + }, + } + try { + setIsSaving(true) + await createDataSourceApiKeyBinding(postData) + Toast.notify({ + type: 'success', + message: t('common.api.success'), + }) + } + finally { + setIsSaving(false) + } + + onSaved() + }, [apiKey, onSaved, t, isSaving]) + + return ( + + +
+
+
+
+
{t(`${I18N_PREFIX}.configJinaReader`)}
+
+ +
+ setApiKey(value as string)} + placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!} + /> +
+
+ + {t(`${I18N_PREFIX}.getApiKeyLinkText`)} + + +
+ + +
+ +
+
+
+
+ + {t('common.modelProvider.encrypted.front')} + + PKCS1_OAEP + + {t('common.modelProvider.encrypted.back')} +
+
+
+
+
+
+ ) +} +export default React.memo(ConfigJinaReaderModal) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx index 21f7660ef1dd16..628510c5dd3871 100644 --- a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx @@ -2,11 +2,12 @@ import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' import { useTranslation } from 'react-i18next' -import { useBoolean } from 'ahooks' import Panel from '../panel' import { DataSourceType } from '../panel/types' import ConfigFirecrawlModal from './config-firecrawl-modal' +import ConfigJinaReaderModal from './config-jina-reader-modal' import cn from '@/utils/classnames' +import s from '@/app/components/datasets/create/website/index.module.css' import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets' import type { @@ -19,9 +20,11 @@ import { } from '@/models/common' import Toast from '@/app/components/base/toast' -type Props = {} +type Props = { + provider: DataSourceProvider +} -const DataSourceWebsite: FC = () => { +const DataSourceWebsite: FC = ({ provider }) => { const { t } = useTranslation() const { isCurrentWorkspaceManager } = useAppContext() const [sources, setSources] = useState([]) @@ -36,22 +39,26 @@ const DataSourceWebsite: FC = () => { // eslint-disable-next-line react-hooks/exhaustive-deps }, []) - const [isShowConfig, { - setTrue: showConfig, - setFalse: hideConfig, - }] = useBoolean(false) + const [configTarget, setConfigTarget] = useState(null) + const showConfig = useCallback((provider: DataSourceProvider) => { + setConfigTarget(provider) + }, [setConfigTarget]) + + const hideConfig = useCallback(() => { + setConfigTarget(null) + }, [setConfigTarget]) const handleAdded = useCallback(() => { checkSetApiKey() hideConfig() }, [checkSetApiKey, hideConfig]) - const getIdByProvider = (provider: string): string | undefined => { + const getIdByProvider = (provider: DataSourceProvider): string | undefined => { const source = sources.find(item => item.provider === provider) return source?.id } - const handleRemove = useCallback((provider: string) => { + const handleRemove = useCallback((provider: DataSourceProvider) => { return async () => { const dataSourceId = getIdByProvider(provider) if (dataSourceId) { @@ -69,22 +76,34 @@ const DataSourceWebsite: FC = () => { <> 0} - onConfigure={showConfig} + provider={provider} + isConfigured={sources.find(item => item.provider === provider) !== undefined} + onConfigure={() => showConfig(provider)} readOnly={!isCurrentWorkspaceManager} - configuredList={sources.map(item => ({ + configuredList={sources.filter(item => item.provider === provider).map(item => ({ id: item.id, logo: ({ className }: { className: string }) => ( -
🔥
+ item.provider === DataSourceProvider.fireCrawl + ? ( +
🔥
+ ) + : ( +
+ +
+ ) ), - name: 'Firecrawl', + name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader', isActive: true, }))} - onRemove={handleRemove(DataSourceProvider.fireCrawl)} + onRemove={handleRemove(provider)} /> - {isShowConfig && ( + {configTarget === DataSourceProvider.fireCrawl && ( )} + {configTarget === DataSourceProvider.jinaReader && ( + + )} ) diff --git a/web/app/components/header/account-setting/data-source-page/index.tsx b/web/app/components/header/account-setting/data-source-page/index.tsx index ede83152b223e1..c3da977ca4e203 100644 --- a/web/app/components/header/account-setting/data-source-page/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/index.tsx @@ -3,6 +3,7 @@ import { useTranslation } from 'react-i18next' import DataSourceNotion from './data-source-notion' import DataSourceWebsite from './data-source-website' import { fetchDataSource } from '@/service/common' +import { DataSourceProvider } from '@/models/common' export default function DataSourcePage() { const { t } = useTranslation() @@ -13,7 +14,8 @@ export default function DataSourcePage() {
{t('common.dataSource.add')}
- + +
) } diff --git a/web/app/components/header/account-setting/data-source-page/panel/index.tsx b/web/app/components/header/account-setting/data-source-page/panel/index.tsx index 988aedcaf74767..4a810020b440ed 100644 --- a/web/app/components/header/account-setting/data-source-page/panel/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/panel/index.tsx @@ -8,10 +8,12 @@ import ConfigItem from './config-item' import s from './style.module.css' import { DataSourceType } from './types' +import { DataSourceProvider } from '@/models/common' import cn from '@/utils/classnames' type Props = { type: DataSourceType + provider: DataSourceProvider isConfigured: boolean onConfigure: () => void readOnly: boolean @@ -25,6 +27,7 @@ type Props = { const Panel: FC = ({ type, + provider, isConfigured, onConfigure, readOnly, @@ -46,7 +49,7 @@ const Panel: FC = ({
{t(`common.dataSource.${type}.title`)}
{isWebsite && (
- {t('common.dataSource.website.with')} 🔥 Firecrawl + {t('common.dataSource.website.with')} { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
)}
diff --git a/web/i18n/en-US/dataset-creation.ts b/web/i18n/en-US/dataset-creation.ts index 32f9d596ca6ac9..1849b12757f9da 100644 --- a/web/i18n/en-US/dataset-creation.ts +++ b/web/i18n/en-US/dataset-creation.ts @@ -16,6 +16,11 @@ const translation = { apiKeyPlaceholder: 'API key from firecrawl.dev', getApiKeyLinkText: 'Get your API key from firecrawl.dev', }, + jinaReader: { + configJinaReader: 'Configure Jina Reader', + apiKeyPlaceholder: 'API key from jina.ai', + getApiKeyLinkText: 'Get your free API key at jina.ai', + }, stepOne: { filePreview: 'File Preview', pagePreview: 'Page Preview', @@ -56,13 +61,21 @@ const translation = { failed: 'Creation failed', }, website: { + chooseProvider: 'Select a provider', fireCrawlNotConfigured: 'Firecrawl is not configured', fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.', + jinaReaderNotConfigured: 'Jina Reader is not configured', + jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.', configure: 'Configure', run: 'Run', firecrawlTitle: 'Extract web content with 🔥Firecrawl', firecrawlDoc: 'Firecrawl docs', firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', + jinaReaderTitle: 'Convert the entire site to Markdown', + jinaReaderDoc: 'Learn more about Jina Reader', + jinaReaderDocLink: 'https://jina.ai/reader', + useSitemap: 'Use sitemap', + useSitemapTooltip: 'Follow the sitemap to crawl the site. If not, Jina Reader will crawl iteratively based on page relevance, yielding fewer but higher-quality pages.', options: 'Options', crawlSubPage: 'Crawl sub-pages', limit: 'Limit', @@ -70,7 +83,7 @@ const translation = { excludePaths: 'Exclude paths', includeOnlyPaths: 'Include only paths', extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)', - exceptionErrorTitle: 'An exception occurred while running Firecrawl job:', + exceptionErrorTitle: 'An exception occurred while running crawling job:', unknownError: 'Unknown error', totalPageScraped: 'Total pages scraped:', selectAll: 'Select All', diff --git a/web/i18n/zh-Hans/dataset-creation.ts b/web/i18n/zh-Hans/dataset-creation.ts index 78f51707918773..4f6786a1919b03 100644 --- a/web/i18n/zh-Hans/dataset-creation.ts +++ b/web/i18n/zh-Hans/dataset-creation.ts @@ -16,6 +16,11 @@ const translation = { apiKeyPlaceholder: '从 firecrawl.dev 获取 API Key', getApiKeyLinkText: '从 firecrawl.dev 获取您的 API Key', }, + jinaReader: { + configJinaReader: '配置 Jina Reader', + apiKeyPlaceholder: '从 jina.ai 获取 API Key', + getApiKeyLinkText: '从 jina.ai 获取您的免费 API Key', + }, stepOne: { filePreview: '文件预览', pagePreview: '页面预览', @@ -56,13 +61,21 @@ const translation = { failed: '创建失败', }, website: { + chooseProvider: '选择工具', fireCrawlNotConfigured: 'Firecrawl 未配置', fireCrawlNotConfiguredDescription: '请配置 Firecrawl 的 API 密钥以使用它。', + jinaReaderNotConfigured: 'Jina Reader 未配置', + jinaReaderNotConfiguredDescription: '请配置 Jina Reader 的免费 API 密钥以访问它。', configure: '配置', run: '运行', firecrawlTitle: '使用 🔥Firecrawl 提取网页内容', firecrawlDoc: 'Firecrawl 文档', firecrawlDocLink: 'https://docs.dify.ai/v/zh-hans/guides/knowledge-base/sync-from-website', + jinaReaderTitle: '将整个站点内容转换为 Markdown 格式', + jinaReaderDoc: '了解更多关于 Jina Reader', + jinaReaderDocLink: 'https://jina.ai/reader', + useSitemap: '使用 sitemap', + useSitemapTooltip: '根据 sitemap 爬取站点。否则,Jina Reader 将基于页面相关性迭代爬取,抓取较少的页面,但质量更高。', options: '选项', crawlSubPage: '爬取子页面', limit: '限制数量', @@ -70,7 +83,7 @@ const translation = { excludePaths: '排除路径', includeOnlyPaths: '仅包含路径', extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)', - exceptionErrorTitle: '运行 Firecrawl 时发生异常:', + exceptionErrorTitle: '运行时发生异常:', unknownError: '未知错误', totalPageScraped: '抓取页面总数:', selectAll: '全选', diff --git a/web/models/common.ts b/web/models/common.ts index 78f09bee09649f..204e89ed9b6b7a 100644 --- a/web/models/common.ts +++ b/web/models/common.ts @@ -177,6 +177,7 @@ export enum DataSourceCategory { } export enum DataSourceProvider { fireCrawl = 'firecrawl', + jinaReader = 'jinareader', } export type FirecrawlConfig = { diff --git a/web/models/datasets.ts b/web/models/datasets.ts index 23d1fe6136b85e..9358f6fcb9fb44 100644 --- a/web/models/datasets.ts +++ b/web/models/datasets.ts @@ -49,6 +49,7 @@ export type CrawlOptions = { excludes: string limit: number | string max_depth: number | string + use_sitemap: boolean } export type CrawlResultItem = { diff --git a/web/service/datasets.ts b/web/service/datasets.ts index 4ca269a7d6e212..689ac7c4035f1c 100644 --- a/web/service/datasets.ts +++ b/web/service/datasets.ts @@ -23,7 +23,7 @@ import type { SegmentsResponse, createDocumentResponse, } from '@/models/datasets' -import type { CommonResponse, DataSourceNotionWorkspace } from '@/models/common' +import { type CommonResponse, type DataSourceNotionWorkspace, DataSourceProvider } from '@/models/common' import type { ApiKeysListResponse, CreateApiKeyResponse, @@ -253,7 +253,7 @@ export const createFirecrawlTask: Fetcher> = return post('website/crawl', { body: { ...body, - provider: 'firecrawl', + provider: DataSourceProvider.fireCrawl, }, }) } @@ -261,7 +261,26 @@ export const createFirecrawlTask: Fetcher> = export const checkFirecrawlTaskStatus: Fetcher = (jobId: string) => { return get(`website/crawl/status/${jobId}`, { params: { - provider: 'firecrawl', + provider: DataSourceProvider.fireCrawl, + }, + }, { + silent: true, + }) +} + +export const createJinaReaderTask: Fetcher> = (body) => { + return post('website/crawl', { + body: { + ...body, + provider: DataSourceProvider.jinaReader, + }, + }) +} + +export const checkJinaReaderTaskStatus: Fetcher = (jobId: string) => { + return get(`website/crawl/status/${jobId}`, { + params: { + provider: 'jinareader', }, }, { silent: true,