chore(deps): Bump idna from 3.7 to 3.10 #28

Open · wants to merge 19 commits into develop

Changes from all commits
5 changes: 3 additions & 2 deletions .github/workflows/pipeline.yaml
@@ -66,14 +66,14 @@ jobs:
- name: Cache pytest
uses: actions/[email protected]
with:
path: .pytest_cache
key: pytest-${{ github.head_ref || github.ref_name }}-${{ hashFiles('requirements-dev.txt') }}
path: .pytest_cache

- name: Cache Ruff
uses: actions/[email protected]
with:
path: .ruff_cache
key: ruff-${{ github.head_ref || github.ref_name }}-${{ hashFiles('requirements-dev.txt') }}
path: .ruff_cache

- name: Set up dependencies
run: make install-deps
@@ -86,6 +86,7 @@ jobs:

- name: Upload artifacts
uses: actions/[email protected]
if: always()
with:
compression-level: 9
name: test-results
4 changes: 2 additions & 2 deletions Makefile
@@ -85,10 +85,10 @@ test-static-server:

test-unit-run:
@echo "➡️ Unit tests (Pytest)..."
pytest \
python3 -m pytest \
--junit-xml=test-reports/$(version_full).xml \
--maxprocesses=4 \
-n logical \
-n=logical \
tests/*.py

dev:
Expand Down
66 changes: 62 additions & 4 deletions app/helpers/resources.py
@@ -1,9 +1,14 @@
import asyncio
import hashlib
from os.path import join
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from datetime import UTC, datetime, timedelta
from os.path import dirname, join
from pathlib import Path

import click
from aiofiles.os import makedirs, path
from aiofiles import open
from aiofiles.os import makedirs, path, remove


def dir_tests(sub: str) -> str:
@@ -64,12 +69,15 @@ async def cache_dir() -> str:
"""
Get the path to the cache directory.

If the directory does not exist, it will be created.

See: https://click.palletsprojects.com/en/8.1.x/api/#click.get_app_dir
"""
# Resolve
res = await path.abspath(click.get_app_dir("scrape-it-now"))
# Create if not exists
if not await path.exists(res):
await makedirs(res)
await makedirs(res, exist_ok=True)
# Return
return res


@@ -94,3 +102,53 @@ async def local_disk_cache_path() -> str:
Get the path to the local disk persistence.
"""
return join(await cache_dir(), "local_disk")


@asynccontextmanager
async def file_lock(file_path: str, timeout: int = 60) -> AsyncGenerator[None, None]: # noqa: ASYNC109
"""
Lock a file for exclusive access.

The lock file path is the file path with `.lock` appended. Timeout is in seconds. If the parent folder does not exist, it will be created.
"""
full_path = await path.abspath(file_path)
lock_file = f"{full_path}.lock"

# Create the directory if it doesn't exist
await makedirs(dirname(full_path), exist_ok=True)

# Wait until the lock file is removed
while await path.exists(lock_file):
# Wait a bit to not overwhelm the CPU
await asyncio.sleep(0.1)

try:
# Check if the lock file has been there for too long
if (
datetime.now(UTC)
- datetime.fromtimestamp(await path.getmtime(lock_file), UTC)
) > timedelta(seconds=timeout):
# Proceed anyway: the initial worker may have crashed, and the other waiting workers would otherwise keep waiting because acquiring the lock refreshes the lock file timestamp
break
except FileNotFoundError:
# The lock file was removed, continue
break

# Create the empty lock file
async with open(
encoding="utf-8",
file=lock_file,
mode="a",
) as f:
await f.write("a")

try:
# Return to the caller
yield

finally:
try:
# Remove the lock file
await remove(lock_file)
except FileNotFoundError:
pass
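
> For reviewers, a minimal usage sketch of the new `file_lock` helper. The caller, file name, and timeout below are invented for illustration; only the helper and its import path come from this diff, and the snippet assumes the `app` package is importable:

```python
# Hypothetical caller of the new file_lock helper (not part of this PR)
import asyncio

from app.helpers.resources import file_lock


async def update_state(state_path: str) -> None:
    # Serializes access on "<state_path>.lock"; other workers poll until it is removed
    async with file_lock(state_path, timeout=10):
        # Critical section: read-modify-write of the shared file goes here
        ...


asyncio.run(update_state("state/progress.json"))
```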
36 changes: 3 additions & 33 deletions app/persistence/local_disk.py
@@ -16,7 +16,7 @@
from pydantic import BaseModel, Field

from app.helpers.logging import logger
from app.helpers.resources import local_disk_cache_path
from app.helpers.resources import file_lock, local_disk_cache_path
from app.models.message import Message
from app.persistence.iblob import (
BlobAlreadyExistsError,
@@ -69,8 +69,8 @@ async def lease_blob(

lease_file = await self._lease_path(blob)

# Ensure only this worker accesses the lease
async with self._file_lock(lease_file):
# Ensure only one worker is updating the lease
async with file_lock(lease_file):
# Skip if the lease file already exists and is not expired
if await path.exists(lease_file):
try:
@@ -231,36 +231,6 @@ async def delete_container(
await rmdir(join(root_name, dir_name))
logger.info('Deleted Local Disk Blob "%s"', self._config.name)

@asynccontextmanager
async def _file_lock(self, file_path: str) -> AsyncGenerator[None, None]:
full_path = await path.abspath(file_path)
lock_file = f"{full_path}.lock"

# Create the directory if it doesn't exist
await makedirs(dirname(full_path), exist_ok=True)

# Wait until the lock file is removed
while await path.exists(lock_file): # noqa: ASYNC110
await asyncio.sleep(0.1)

# Create the empty lock file
async with open(
file=lock_file,
mode="wb",
) as _:
pass

try:
# Return to the caller
yield

finally:
try:
# Remove the lock file
await remove(lock_file)
except FileNotFoundError:
pass

async def _lease_path(self, blob: str) -> str:
working_path = await self._config.working_path()
return await path.abspath(join(working_path, f"{blob}.lease"))
90 changes: 58 additions & 32 deletions app/scrape.py
@@ -34,6 +34,7 @@
from app.helpers.resources import (
browsers_install_path,
dir_resources,
file_lock,
hash_url,
index_queue_name,
pandoc_install_path,
@@ -811,7 +812,7 @@ def _network_used_callback(size_bytes: int) -> None:
res = await page.goto(
url_clean.geturl(),
referer=referrer,
timeout=30000, # 30 seconds
timeout=60000, # 1 min
)
except TimeoutError: # TODO: Retry maybe a few times for timeout errors?
return _generic_error(
@@ -925,11 +926,20 @@ def _network_used_callback(size_bytes: int) -> None:
# Extract text content
# TODO: Make it async with a wrapper
try:
# Remove "src" attributes to avoid downloading external resources
full_html_minus_resources = full_html
for attribute in ("src", "srcset"):
full_html_minus_resources = re.sub(
rf"{attribute}=\"[^\"].*?\"", # Match attribute
f'{attribute}=""', # Replace with empty string
full_html_minus_resources,
)

# Convert HTML to Markdown
full_markdown = convert_text(
format="html", # Input is HTML
sandbox=True, # Enable sandbox mode, we don't know what we are scraping
source=full_html,
source=full_html_minus_resources,
to="markdown-fenced_divs-native_divs-raw_html-bracketed_spans-native_spans-link_attributes-header_attributes-inline_code_attributes",
extra_args=[
"--embed-resources=false",
@@ -953,9 +963,18 @@ def _network_used_callback(size_bytes: int) -> None:
full_markdown,
)

# Remove empty images
full_markdown = full_markdown.replace("![]()", "")

# Remove empty links
full_markdown = full_markdown.replace("[]()", "")

# Clean up by removing double newlines
full_markdown = re.sub(r"\n\n+", "\n\n", full_markdown)

# Strip
full_markdown = full_markdown.strip()

except (
RuntimeError
) as e: # pypandoc raises a RuntimeError if Pandoc returns one
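
> A standalone sketch of what the two hunks above do to the scraped HTML and the resulting Markdown. The helper names and sample strings are invented; the regexes and replacements mirror the diff:

```python
# Illustration only: resource stripping before Pandoc, then Markdown tidy-up
import re


def strip_embedded_resources(html: str) -> str:
    # Blank out src/srcset so the HTML-to-Markdown conversion never points at external files
    for attribute in ("src", "srcset"):
        html = re.sub(
            rf'{attribute}="[^"].*?"',  # same pattern as in the diff
            f'{attribute}=""',
            html,
        )
    return html


def tidy_markdown(markdown: str) -> str:
    # Drop empty images/links left behind, collapse blank lines, strip the edges
    markdown = markdown.replace("![]()", "").replace("[]()", "")
    markdown = re.sub(r"\n\n+", "\n\n", markdown)
    return markdown.strip()


print(strip_embedded_resources('<img src="https://example.com/logo.png" alt="logo">'))
# -> <img src="" alt="logo">
print(tidy_markdown("Intro\n\n\n![]()\n\nOutro\n"))
# -> Intro\n\nOutro
```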
@@ -1012,14 +1031,18 @@ async def _extract_meta(
"""
Extract a meta tag from an element.

Name and content are returned. Other attributes are ignored.
Name and content are returned. Other attributes are ignored. If the browser fails to extract the attributes, None is returned.

See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#attributes
"""
name, content = await asyncio.gather(
element.get_attribute("name"),
element.get_attribute("content"),
)
try:
name, content = await asyncio.gather(
element.get_attribute("name"),
element.get_attribute("content"),
)
except TimeoutError:
logger.debug("Timeout for selecting meta tag attributes", exc_info=True)
return
if not name:
return
return (name, content or None)
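
> For context, a hypothetical call-site for `_extract_meta` showing how the `(name, content)` tuples and `None` results might be folded into a dict. The surrounding function and the element parameter name are assumptions, not part of this PR:

```python
# Hypothetical aggregation of _extract_meta results (illustration only)
import asyncio

from playwright.async_api import Page


async def collect_meta(page: Page) -> dict[str, str | None]:
    elements = await page.query_selector_all("meta")
    results = await asyncio.gather(*(_extract_meta(element) for element in elements))
    # _extract_meta returns None for nameless or unreadable tags; keep the rest
    return dict(result for result in results if result is not None)
```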
@@ -1041,7 +1064,7 @@ async def _extract_meta(
full_page=True, # Store the full page
quality=70, # Quality is not a concern, let's keep it cheap to store
scale="css", # Keep the same zoom level for all screenshots across random viewports
timeout=30000, # 30 seconds
timeout=60000, # 1 min
type="jpeg", # JPEG is good enough for screenshots
)
# Callback to save the screenshot
@@ -1108,10 +1131,13 @@ async def run( # noqa: PLR0913
browser_name = "chromium"
async with async_playwright() as p:
browser_type = getattr(p, browser_name)
await _install_browser(browser_type)

# Install Pandoc
await _install_pandoc()
await asyncio.gather(
# Install Playwright
_install_browser(browser_type),
# Install Pandoc
_install_pandoc(),
)

# Parse cache_refresh
cache_refresh_parsed = timedelta(hours=cache_refresh)
@@ -1222,17 +1248,19 @@ async def _install_browser(
# Get location of Playwright driver
driver_executable, driver_cli = compute_driver_executable()

# Build the command arguments
args = [driver_executable, driver_cli, "install", browser_type.name]
if with_deps:
args.append("--with-deps")

# Run
proc = await asyncio.create_subprocess_shell(
cmd=" ".join(args),
env=get_driver_env(),
)
await proc.wait()
# Ensure only one worker is installing the browser
async with file_lock(driver_executable):
# Build the command arguments
args = [driver_executable, driver_cli, "install", browser_type.name]
if with_deps:
args.append("--with-deps")

# Run
proc = await asyncio.create_subprocess_shell(
cmd=" ".join(args),
env=get_driver_env(),
)
await proc.wait()

# Display error logs if any
err = proc.stderr
Expand All @@ -1254,12 +1282,8 @@ async def _get_broswer(
"""
Launch a browser instance.
"""
# Using the application path not the default one from the SDK
playwright_path = await browsers_install_path()

# Launch the browser
browser = await browser_type.launch(
downloads_path=playwright_path,
chromium_sandbox=True, # Enable the sandbox for security, we don't know what we are scraping
# See: https://github.com/microsoft/playwright/blob/99a36310570617222290c09b96a2026beb8b00f9/packages/playwright-core/src/server/chromium/chromium.ts
args=[
@@ -1282,12 +1306,14 @@ async def _install_pandoc() -> None:
# Get location of Pandoc driver
install_path = await pandoc_install_path(version)

# Download Pandoc if not installed
ensure_pandoc_installed(
delete_installer=True,
targetfolder=install_path,
version=version,
)
# Ensure only one worker is installing Pandoc
async with file_lock(install_path):
# Download Pandoc if not installed
ensure_pandoc_installed(
delete_installer=True,
targetfolder=install_path,
version=version,
)

# Add installation path to the environment
# See: https://github.com/JessicaTegner/pypandoc?tab=readme-ov-file#specifying-the-location-of-pandoc-binaries
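
> The `file_lock` wrappers around the Playwright and Pandoc installs exist so several worker processes can start concurrently without racing on one-time setup. A self-contained toy of that pattern (worker count, marker path, and sleep are invented; assumes the `app` package is importable):

```python
# Toy demo: three workers race on a one-time setup step, serialized by file_lock
import asyncio

from app.helpers.resources import file_lock

SETUP_MARKER = "install/marker"  # hypothetical path, stands in for the install dir


async def worker(worker_id: int) -> None:
    async with file_lock(SETUP_MARKER):
        # Only one worker at a time gets here; the others poll on the .lock file
        print(f"worker {worker_id} holds the lock")
        await asyncio.sleep(0.2)  # stand-in for the real install step


async def main() -> None:
    await asyncio.gather(*(worker(i) for i in range(3)))


asyncio.run(main())
```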
2 changes: 1 addition & 1 deletion cicd/test-unit-ci.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# Start the first command in the background
make test-static-server &
make test-static-server 1>/dev/null 2>&1 &

# Capture the PID of the background process
UNIT_RUN_PID=$!
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -53,6 +53,12 @@ py-modules = [
generate-hashes = true
strip-extras = true

[tool.pytest.ini_options]
asyncio_mode = "auto"
junit_suite_name = "scrape-it-now"
log_file = "test-reports/last-logs.txt"
log_file_level = "INFO"

[tool.deptry]
ignore_notebooks = true
pep621_dev_dependency_groups = ["dev"]
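
> With `asyncio_mode = "auto"`, the asyncio pytest plugin (assumed here to be pytest-asyncio) collects plain `async def` tests without an explicit marker; a minimal, purely illustrative test of that shape:

```python
# tests/test_example.py — hypothetical test relying on asyncio_mode = "auto"
import asyncio


async def test_sleep_returns_none() -> None:
    # No @pytest.mark.asyncio marker needed in auto mode
    assert await asyncio.sleep(0) is None
```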
6 changes: 3 additions & 3 deletions requirements-dev.txt
@@ -543,9 +543,9 @@ httpx==0.27.0 \
--hash=sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5 \
--hash=sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5
# via openai
idna==3.7 \
--hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \
--hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0
idna==3.10 \
--hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
--hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
# via
# anyio
# httpx
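
> A quick, optional sanity check for the bumped dependency; the hostname is just an example, and simple labels are expected to round-trip the same way before and after the upgrade:

```python
# Smoke test for the idna 3.10 bump: encode/decode an internationalized hostname
from importlib.metadata import version

import idna

assert idna.encode("bücher.example") == b"xn--bcher-kva.example"
assert idna.decode("xn--bcher-kva.example") == "bücher.example"
print(version("idna"))  # expected to print 3.10 once this PR is merged
```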