
breaking: Deploy to PyPi instead of the executable
clemlesne committed Nov 8, 2024
1 parent 0691e66 commit c379672
Showing 6 changed files with 161 additions and 83 deletions.
146 changes: 85 additions & 61 deletions .github/workflows/pipeline.yaml
@@ -14,16 +14,10 @@ on:
- hotfix/*
- main

env:
# See: https://github.com/upx/upx/releases
UPX_VERSION: "4.2.4"

jobs:
init:
name: Init
runs-on: ubuntu-22.04
permissions:
contents: read
runs-on: ubuntu-24.04
outputs:
VERSION: ${{ steps.version.outputs.version }}
VERSION_FULL: ${{ steps.version.outputs.version_full }}
@@ -46,8 +40,7 @@ jobs:
name: Test
permissions:
id-token: write
contents: read
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
needs:
- init
strategy:
@@ -56,6 +49,9 @@
step:
- test-static
# - test-unit
python-version:
- '3.12'
- '3.13'
steps:
- name: Checkout
uses: actions/[email protected]
@@ -102,37 +98,17 @@ jobs:
name: test-results
path: test-reports/*

build-app:
name: Build & archive app
permissions:
contents: write
packages: write
build-wheels:
name: Build wheels
runs-on: ${{ matrix.os }}
needs:
- init
- test
strategy:
fail-fast: false
matrix:
# TODO: Build for musl (Alpine Linux)
os:
# All versions of macOS supported by GitHub Actions
- macos-12
- macos-13
- macos-14
- macos-15
# All versions of Ubuntu supported by GitHub Actions
- ubuntu-20.04
- ubuntu-22.04
- ubuntu-24.04
# All versions of Windows supported by GitHub Actions
- windows-2019
- windows-2022
os: [ubuntu-24.04, windows-2022, macos-15]
steps:
- name: Configure Git (Windows)
if: runner.os == 'Windows'
run: git config --system core.longpaths true

- name: Checkout
uses: actions/[email protected]

@@ -142,28 +118,23 @@
cache: pip
python-version: "3.13"

- name: Set up QEMU
if: runner.os == 'Linux'
uses: docker/[email protected]
with:
platforms: all

- name: Set up make (Windows)
if: runner.os == 'Windows'
run: |
choco install make
make --version
- name: Set up UPX (Windows)
if: runner.os == 'Windows'
run: |
choco install upx --version ${{ env.UPX_VERSION }}
upx --version
- name: Set up dependencies
run: |
python3 -m pip install --upgrade pip wheel setuptools
make install-deps
- name: Set up dependencies (Windows)
if: runner.os == 'Windows'
run: |
python3 -m pip install pywin32-ctypes pefile
- name: Write version (Linux)
if: runner.os == 'Linux' || runner.os == 'macOS'
run: echo '__version__ = "${{ needs.init.outputs.VERSION_FULL }}"' > app/__init__.py
@@ -175,26 +146,79 @@
- name: Build to binary
run: make build

- name: Rename binary (Linux)
if: runner.os == 'Linux' || runner.os == 'macOS'
run: mv dist/scrape-it-now dist/scrape-it-now-${{ needs.init.outputs.VERSION }}-${{ matrix.os }}

- name: Rename binary (Windows)
if: runner.os == 'Windows'
run: mv dist\scrape-it-now.exe dist\scrape-it-now-${{ needs.init.outputs.VERSION }}-${{ matrix.os }}.exe

- name: Upload artifacts
uses: actions/[email protected]
with:
compression-level: 9
name: binary-${{ matrix.os }}
path: dist/*
name: wheels-${{ matrix.os }}-${{ strategy.job-index }}
path: dist/*.whl

publish-testpypi:
name: Publish to TestPyPI
environment:
name: testpypi
url: https://test.pypi.org/p/scrape-it-now
permissions:
id-token: write
runs-on: ubuntu-24.04
steps:
- name: Download artifacts
uses: actions/[email protected]
with:
merge-multiple: true
path: dist
pattern: wheels-*

- name: Publish to TestPyPI
uses: pypa/[email protected]
with:
repository-url: https://test.pypi.org/legacy/

publish-pypi:
name: Publish to PyPI
environment:
name: pypi
url: https://pypi.org/p/scrape-it-now
permissions:
id-token: write
runs-on: ubuntu-24.04
if: github.ref == 'refs/heads/main'
steps:
- name: Download artifacts
uses: actions/[email protected]
with:
merge-multiple: true
path: dist
pattern: wheels-*

- name: Publish to PyPI
uses: pypa/[email protected]

attest-binaries:
name: Attest binaries
permissions:
contents: write
id-token: write
runs-on: ubuntu-24.04
needs:
- build-wheels
steps:
- name: Download artifacts
uses: actions/[email protected]
with:
merge-multiple: true
path: dist
pattern: wheels-*

- name: Generate attestations
uses: actions/[email protected]
with:
subject-path: dist/*

attest-dependencies:
name: Attest - Dependencies
name: Attest dependencies
permissions:
contents: write
runs-on: ubuntu-22.04
id-token: write
runs-on: ubuntu-24.04
needs:
- test
steps:
@@ -207,8 +231,8 @@
directoryExclusionList: docs

attest-sbom:
name: Attest - SBOM
runs-on: ubuntu-22.04
name: Attest SBOM
runs-on: ubuntu-24.04
needs:
- init
- test
@@ -233,14 +257,14 @@
name: Publish release
permissions:
contents: write
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
needs:
- attest-binaries
- attest-dependencies
- attest-sbom
- build-app
- build-wheels
- init
# Only publish on non-scheduled default branch
if: (github.event_name != 'schedule') && (github.ref == 'refs/heads/main')
if: github.ref == 'refs/heads/main'
steps:
- name: Download artifacts
id: download
10 changes: 1 addition & 9 deletions Makefile
@@ -99,15 +99,7 @@ dev:

build:
@echo "➡️ Building app..."
pyinstaller \
--add-data resources:resources \
--clean \
--hidden-import=tiktoken_ext \
--hidden-import=tiktoken_ext.openai_public \
--icon resources/logo.ico \
--name scrape-it-now \
--onefile \
app/app.py
python -m cibuildwheel --output-dir dist

lint:
@echo "➡️ Fix with formatter..."
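The new `build` target hands wheel building to [cibuildwheel](https://cibuildwheel.pypa.io), which reads its configuration from `pyproject.toml`. A minimal sketch of what such a section could look like; it is not part of this commit, and the selectors are assumptions based on the Python versions declared in the `pyproject.toml` changes below:

```toml
# Hypothetical configuration, not included in this commit.
[tool.cibuildwheel]
# Build only the CPython versions the project declares support for
build = "cp312-* cp313-*"
# Skip 32-bit targets
skip = "*-win32 *_i686"
```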
14 changes: 9 additions & 5 deletions README.md
@@ -11,7 +11,6 @@ Web scraper made for AI and simplicity in mind. It runs as a CLI that can be par
Shared:

- 🏗️ Decoupled architecture with [Azure Queue Storage](https://learn.microsoft.com/en-us/azure/storage/queues) or local [sqlite](https://sqlite.org)
- 🔧 Executable as a CLI with a [standalone binary](http://github.com/clemlesne/scrape-it-now/releases/latest)
- ⚙️ Idempotent operations that can be run in parallel
- 💾 Scraped content is stored in [Azure Blob Storage](https://learn.microsoft.com/en-us/azure/storage/blobs) or local disk

@@ -38,11 +37,16 @@ Indexer:

## Installation

### From binary
### From PyPI

[Download the latest release from the releases page](http://github.com/clemlesne/scrape-it-now/releases/latest). Binaries are available for Linux, macOS and Windows.
```bash
# Install the package
python3 -m pip install scrape-it-now
# Run the CLI
scrape-it-now --help
```

For configuring the CLI (including authentication to the backend services), use environment variables, a `.env` file or command line options.
To configure the CLI (including authentication to the backend services), use environment variables, a `.env` file or command line options.

### From sources

@@ -421,6 +425,6 @@ Proxies are not implemented in the application. Network security cannot be achie
### Bundle with a container
As the application is packaged as a binary, it can easily be bundled with a container. At every start, the application will download the dependencies (browser, etc.) and cache them. You can pre-download them by running the command `scrape-it-now scrape install`.
As the application is published to PyPI, it can easily be bundled with a container. At every start, the application will download the dependencies (browser, etc.) and cache them. You can pre-download them by running the command `scrape-it-now scrape install`.
A good technique for performance would also be to parallelize the scraping and indexing jobs by running multiple containers of each. This can be achieved with [KEDA](https://keda.sh), by configuring a [queue scaler](https://keda.sh/docs/2.16/scalers/azure-storage-queue).
10 changes: 7 additions & 3 deletions app/app.py
@@ -83,7 +83,7 @@ def cli() -> None:
"""
🛰️ Scrape It Now!
A website to scrape? There's a simple way.
Web scraper made for AI and simplicity in mind. It runs as a CLI that can be parallelized and outputs high-quality markdown content.
"""
pass

@@ -581,8 +581,12 @@ def _job_name(job_name: str | None) -> str:
)


# If running in PyInstaller
if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
def cli_wrapper():
"""
Wrapper to run the CLI with a project script entrypoint.
This is the main entrypoint for the CLI. It is used to inject the system truststore into the SSL context.
"""
# Make sure all SSL certificates come from the system truststore
import truststore

Expand Down
42 changes: 37 additions & 5 deletions pyproject.toml
@@ -1,10 +1,31 @@
[project]
description = "A website to scrape? There's a simple way."
authors = [
{name = "Clémence Lesné", email = "[email protected]"}
]
maintainers = [
{name = "Clémence Lesné", email = "[email protected]"}
]
description = "Web scraper made for AI and simplicity in mind. It runs as a CLI that can be parallelized and outputs high-quality markdown content."
keywords = ["web", "scraper", "markdown", "ai", "parallel", "cli", "automation", "data-extraction", "web-crawling", "content-indexing"]
classifiers = [
# How mature is the project?
"Development Status :: 4 - Beta",
# Who the project is intended for
"Intended Audience :: Developers",
"Topic :: Internet :: WWW/HTTP :: Dynamic Content :: Content Management System",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: System :: Archiving :: Backup",
"Topic :: Text Processing :: Markup :: Markdown",
"Topic :: Utilities",
# Supported Python versions
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dynamic = ["version"]
license = {file = "LICENSE"}
name = "scrape-it-now"
readme = "README.md"
requires-python = ">=3.13"
requires-python = ">=3.12"
dependencies = [
"aiodns~=3.2",
"aiofiles~=24.1",
@@ -35,6 +56,7 @@ dependencies = [

[project.optional-dependencies]
dev = [
"cibuildwheel~=2.21", # Build wheels for all platforms
"deptry~=0.19", # Dependency tree testing
"pip-tools~=7.4", # Compile requirements.txt from pyproject.toml
"pyinstaller~=6.11", # Create standalone executable
@@ -46,8 +68,18 @@ dev = [
"ruff~=0.6", # Linter
]

[project.urls]
Documentation = "https://github.com/clemlesne/scrape-it-now"
Homepage = "https://github.com/clemlesne/scrape-it-now"
Issues = "https://github.com/clemlesne/scrape-it-now/issues"
Repository = "https://github.com/clemlesne/scrape-it-now"

[project.scripts]
scrape-it-now = "app.app:cli"
scrape-it-now = "app.app:cli_wrapper"

[build-system]
requires = ["setuptools~=75.3"]
build-backend = "setuptools.build_meta"

[tool.setuptools]
py-modules = [
@@ -75,7 +107,7 @@ DEP002 = [
]

[tool.ruff]
target-version = "py313"
target-version = "py312"

[tool.ruff.lint.isort]
combine-as-imports = true
@@ -84,4 +116,4 @@ combine-as-imports = true
docstring-code-format = true

[tool.pyright]
pythonVersion = "3.13"
pythonVersion = "3.12"
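The project keeps `version` in `dynamic` while the pipeline writes `__version__` into `app/__init__.py` before building. The `[tool.setuptools.dynamic]` table wiring the two together is not visible in this diff; a sketch of what it presumably contains, assuming the attribute lives in the `app` package:

```toml
# Assumed configuration; the actual table is collapsed in this diff.
[tool.setuptools.dynamic]
version = {attr = "app.__version__"}
```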
