-
Notifications
You must be signed in to change notification settings - Fork 8.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(website-crawl): add jina reader as additional alternative for website crawling (#8761)
- Loading branch information
Showing
38 changed files
with
927 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from core.rag.extractor.extractor_base import BaseExtractor | ||
from core.rag.models.document import Document | ||
from services.website_service import WebsiteService | ||
|
||
|
||
class JinaReaderWebExtractor(BaseExtractor):
    """
    Extractor that wraps a stored Jina Reader crawl result in a ``Document``.

    The crawl result is looked up through ``WebsiteService`` using the
    ``"jinareader"`` provider name; this class performs no HTTP calls itself.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
        """
        Remember the parameters needed to look up the crawl result.

        :param url: page URL whose crawl data should be fetched.
        :param job_id: identifier of the crawl job the URL belongs to.
        :param tenant_id: tenant on whose behalf the lookup is made.
        :param mode: extraction mode; ``extract`` only acts on ``"crawl"``.
        :param only_main_content: stored on the instance; not read by ``extract``.
        """
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """
        Return the crawled page as a single-element list of ``Document``.

        An empty list is returned when the mode is not ``"crawl"`` or when
        ``WebsiteService`` has no data for the URL.
        """
        # Any mode other than "crawl" produces no documents.
        if self.mode != "crawl":
            return []

        page = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
        if page is None:
            return []

        content = page.get("content", "")
        metadata = {
            "source_url": page.get("url"),
            "description": page.get("description"),
            "title": page.get("title"),
        }
        return [Document(page_content=content, metadata=metadata)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import json | ||
|
||
import requests | ||
|
||
from services.auth.api_key_auth_base import ApiKeyAuthBase | ||
|
||
|
||
class JinaAuth(ApiKeyAuthBase):
    """
    Validate a Jina Reader API key supplied as bearer credentials.

    ``credentials`` is read as
    ``{"auth_type": "bearer", "config": {"api_key": "<key>"}}``.
    """

    # Seconds to wait for the Jina Reader endpoint. Without a timeout,
    # ``requests`` would block indefinitely on an unresponsive server.
    _REQUEST_TIMEOUT = 30

    # Fallback text used when the error response carries no usable "error" field.
    _UNKNOWN_ERROR = "Unknown error occurred"

    def __init__(self, credentials: dict):
        """
        Check the credential shape and keep the API key.

        :param credentials: mapping with ``auth_type`` and a ``config`` mapping
            that holds ``api_key``.
        :raises ValueError: if ``auth_type`` is not ``"bearer"`` or no API key
            is present (including when ``config`` is missing or ``None``).
        """
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "bearer":
            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")

        # A missing/None "config" must surface as the ValueError below rather
        # than as an AttributeError on ``None.get``.
        config = credentials.get("config") or {}
        self.api_key = config.get("api_key", None)

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        """
        Probe the Jina Reader endpoint with the stored API key.

        :return: ``True`` when the endpoint answers with HTTP 200.
        :raises Exception: for any other status code (see ``_handle_error``).
        """
        headers = self._prepare_headers()
        options = {
            "url": "https://example.com",
        }
        response = self._post_request("https://r.jina.ai", options, headers)
        if response.status_code == 200:
            return True
        # ``_handle_error`` always raises, so nothing is returned past here.
        self._handle_error(response)

    def _prepare_headers(self):
        """Build the JSON content-type and bearer authorization headers."""
        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

    def _post_request(self, url, data, headers):
        """POST ``data`` as JSON to ``url`` with a bounded wait and return the response."""
        return requests.post(url, headers=headers, json=data, timeout=self._REQUEST_TIMEOUT)

    def _extract_error_message(self, response):
        """Return the ``error`` field of a JSON object body, or a generic fallback."""
        try:
            payload = json.loads(response.text)
        except (TypeError, ValueError):
            # Empty or non-JSON body (e.g. an HTML error page from a proxy).
            return self._UNKNOWN_ERROR
        if isinstance(payload, dict):
            return payload.get("error", self._UNKNOWN_ERROR)
        return self._UNKNOWN_ERROR

    def _handle_error(self, response):
        """
        Raise an ``Exception`` describing a failed authorization response.

        Statuses 402, 409 and 500, and any other status that comes with a
        body, report the server-provided ``error`` field when available.
        A response with neither is reported as an unexpected error.
        """
        status = response.status_code
        if status in {402, 409, 500} or response.text:
            error_message = self._extract_error_message(response)
            raise Exception(f"Failed to authorize. Status code: {status}. Error: {error_message}")
        raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {status}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.