Skip to content

Commit

Permalink
breaking: Deploy to PyPi instead of the executable
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Nov 9, 2024
1 parent 9babdbd commit 14396e9
Show file tree
Hide file tree
Showing 41 changed files with 452 additions and 100,626 deletions.
193 changes: 107 additions & 86 deletions .github/workflows/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,10 @@ on:
- hotfix/*
- main

env:
# See: https://github.com/upx/upx/releases
UPX_VERSION: "4.2.4"

jobs:
init:
name: Init
runs-on: ubuntu-22.04
permissions:
contents: read
runs-on: ubuntu-24.04
outputs:
VERSION: ${{ steps.version.outputs.version }}
VERSION_FULL: ${{ steps.version.outputs.version_full }}
Expand All @@ -46,8 +40,7 @@ jobs:
name: Test
permissions:
id-token: write
contents: read
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
needs:
- init
strategy:
Expand All @@ -56,6 +49,10 @@ jobs:
step:
- test-static
- test-unit
python-version:
- '3.11'
- '3.12'
- '3.13'
steps:
- name: Checkout
uses: actions/[email protected]
Expand All @@ -64,7 +61,7 @@ jobs:
uses: actions/[email protected]
with:
cache: pip
python-version: "3.12"
# Install the interpreter selected by the job matrix, so that every Python version declared under "strategy.matrix.python-version" is actually exercised (a hard-coded version would make all matrix legs run the same interpreter)
python-version: ${{ matrix.python-version }}

- name: Login to Azure
uses: Azure/[email protected]
Expand All @@ -88,6 +85,11 @@ jobs:
- name: Set up dependencies
run: make install-deps

# See: https://github.com/ungoogled-software/ungoogled-chromium/issues/2804#issuecomment-2081611387
- name: Configure AppArmor profile
run: |
sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0
- name: Configure environment variables
run: echo "${{ secrets.DOTENV_UNIT_TESTS }}" > .env

Expand All @@ -99,102 +101,72 @@ jobs:
if: always()
with:
compression-level: 9
name: test-results
name: test-${{ matrix.step }}-${{ matrix.python-version }}
path: test-reports/*

build-app:
name: Build & archive app
permissions:
contents: write
packages: write
runs-on: ${{ matrix.os }}
build-wheels:
name: Build wheels
runs-on: ubuntu-24.04
needs:
- init
- test
strategy:
fail-fast: false
matrix:
# TODO: Build for musl (Alpine Linux)
os:
# All versions of macOS supported by GitHub Actions
- macos-12
- macos-13
- macos-14
- macos-15
# All versions of Ubuntu supported by GitHub Actions
- ubuntu-20.04
- ubuntu-22.04
- ubuntu-24.04
# All versions of Windows supported by GitHub Actions
- windows-2019
- windows-2022
steps:
- name: Configure Git (Windows)
if: runner.os == 'Windows'
run: git config --system core.longpaths true

- name: Checkout
uses: actions/[email protected]

- name: Set up Python
uses: actions/[email protected]
with:
cache: pip
python-version: "3.12"
python-version: "3.13"

- name: Set up make (Windows)
if: runner.os == 'Windows'
- name: Set up build dependencies
run: |
choco install make
make --version
python3 -m pip install --upgrade pip build
- name: Set up UPX (Windows)
if: runner.os == 'Windows'
- name: Write version
run: |
choco install upx --version ${{ env.UPX_VERSION }}
upx --version
version=$(echo "${{ needs.init.outputs.VERSION }}" | sed 's/-/\.a/' | cut -d'.' -f1-4)
echo '__version__ = "'${version}'"' > src/scrape_it_now/__init__.py
- name: Set up dependencies
- name: Build wheels
run: |
python3 -m pip install --upgrade pip wheel setuptools
make install-deps
- name: Set up dependencies (Windows)
if: runner.os == 'Windows'
run: |
python3 -m pip install pywin32-ctypes pefile
- name: Write version (Linux)
if: runner.os == 'Linux' || runner.os == 'macOS'
run: echo '__version__ = "${{ needs.init.outputs.VERSION_FULL }}"' > app/__init__.py

- name: Write version (Windows)
if: runner.os == 'Windows'
run: echo '__version__ = "${{ needs.init.outputs.VERSION_FULL }}"' > app\__init__.py

- name: Build to binary
run: make build

- name: Rename binary (Linux)
if: runner.os == 'Linux' || runner.os == 'macOS'
run: mv dist/scrape-it-now dist/scrape-it-now-${{ needs.init.outputs.VERSION }}-${{ matrix.os }}

- name: Rename binary (Windows)
if: runner.os == 'Windows'
run: mv dist\scrape-it-now.exe dist\scrape-it-now-${{ needs.init.outputs.VERSION }}-${{ matrix.os }}.exe
make build
- name: Upload artifacts
uses: actions/[email protected]
with:
compression-level: 9
name: binary-${{ matrix.os }}
path: dist/*
name: wheels
path: dist/*.whl

attest-binaries:
name: Attest binaries
permissions:
attestations: write
id-token: write
runs-on: ubuntu-24.04
needs:
- build-wheels
steps:
- name: Download artifacts
uses: actions/[email protected]
with:
merge-multiple: true
path: dist
pattern: wheels

- name: Generate attestations
uses: actions/[email protected]
with:
subject-path: dist/*

attest-dependencies:
name: Attest - Dependencies
name: Attest dependencies
permissions:
contents: write
runs-on: ubuntu-22.04
id-token: write
runs-on: ubuntu-24.04
needs:
- test
steps:
Expand All @@ -207,8 +179,8 @@ jobs:
directoryExclusionList: docs

attest-sbom:
name: Attest - SBOM
runs-on: ubuntu-22.04
name: Attest SBOM
runs-on: ubuntu-24.04
needs:
- init
- test
Expand All @@ -229,30 +201,79 @@ jobs:
name: sbom
path: sbom-reports/*

publish-testpypi:
name: Publish to TestPyPI
environment:
name: testpypi
url: https://test.pypi.org/p/scrape-it-now
permissions:
id-token: write
runs-on: ubuntu-24.04
needs:
- attest-binaries
- attest-dependencies
- attest-sbom
- build-wheels
steps:
- name: Download artifacts
uses: actions/[email protected]
with:
merge-multiple: true
path: dist
pattern: wheels

- name: Publish to TestPyPI
uses: pypa/[email protected]
with:
repository-url: https://test.pypi.org/legacy/

publish-pypi:
name: Publish to PyPI
environment:
name: pypi
url: https://pypi.org/p/scrape-it-now
permissions:
id-token: write
runs-on: ubuntu-24.04
needs:
# Always publish first to TestPyPI as a safety precaution
- publish-testpypi
if: github.ref == 'refs/heads/main'
steps:
- name: Download artifacts
uses: actions/[email protected]
with:
merge-multiple: true
path: dist
pattern: wheels

- name: Publish to PyPI
uses: pypa/[email protected]

publish-release:
name: Publish release
permissions:
contents: write
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
needs:
- attest-binaries
- attest-dependencies
- attest-sbom
- build-app
- build-wheels
- init
# Only publish on non-scheduled default branch
if: (github.event_name != 'schedule') && (github.ref == 'refs/heads/main')
if: github.ref == 'refs/heads/main'
steps:
- name: Download artifacts
id: download
uses: actions/[email protected]
with:
merge-multiple: true
path: artifacts
path: dist

- name: Publish
uses: softprops/[email protected]
with:
files: artifacts/*
files: dist/*
generate_release_notes: true
make_latest: true
name: scrape-it-now v${{ needs.init.outputs.VERSION }}
Expand Down
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
scrapeitnow312
scrapeitnow313
17 changes: 4 additions & 13 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ upgrade:
git submodule update --init --recursive

@echo "➡️ Upgrading pip..."
python3 -m pip install --upgrade pip wheel setuptools
python3 -m pip install --upgrade pip wheel setuptools build

@echo "➡️ Upgrading pip-tools..."
python3 -m pip install --upgrade pip-tools
Expand All @@ -62,15 +62,15 @@ upgrade:
pyproject.toml

@echo "➡️ Updating DNS blocklist..."
curl -sSfL https://blocklistproject.github.io/Lists/alt-version/ads-nl.txt > resources/ads-nl.txt
curl -sSfL https://blocklistproject.github.io/Lists/alt-version/ads-nl.txt > src/scrape_it_now/resources/ads-nl.txt

test:
$(MAKE) test-static
$(MAKE) test-unit

test-static:
@echo "➡️ Test dependencies issues (deptry)..."
python3 -m deptry .
python3 -m deptry src

@echo "➡️ Test code smells (Ruff)..."
python3 -m ruff check --select I,PL,RUF,UP,ASYNC,A,DTZ,T20,ARG,PERF
Expand Down Expand Up @@ -99,16 +99,7 @@ dev:

build:
@echo "➡️ Building app..."
pyinstaller \
--add-data resources:resources \
--clean \
--hidden-import=tiktoken_ext \
--hidden-import=tiktoken_ext.openai_public \
--icon resources/logo.ico \
--name scrape-it-now \
--onefile \
--optimize 2 \
app/app.py
python3 -m build

lint:
@echo "➡️ Fix with formatter..."
Expand Down
23 changes: 14 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@

Web scraper made with AI and simplicity in mind. It runs as a CLI that can be parallelized and outputs high-quality markdown content.

<!-- github.com badges -->
[![Last release date](https://img.shields.io/github/release-date/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/releases)
[![Project license](https://img.shields.io/github/license/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/blob/main/LICENSE)
[![GitHub last release date](https://img.shields.io/github/release-date/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/releases)
[![GitHub project license](https://img.shields.io/github/license/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/blob/main/LICENSE)
[![PyPI package version](https://img.shields.io/pypi/v/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![PyPI supported Python versions](https://img.shields.io/pypi/pyversions/scrape-it-now)](https://pypi.org/project/scrape-it-now)

## Features

Shared:

- 🏗️ Decoupled architecture with [Azure Queue Storage](https://learn.microsoft.com/en-us/azure/storage/queues) or local [sqlite](https://sqlite.org)
- 🔧 Executable as a CLI with a [standalone binary](http://github.com/clemlesne/scrape-it-now/releases/latest)
- ⚙️ Idempotent operations that can be run in parallel
- 💾 Scraped content is stored in [Azure Blob Storage](https://learn.microsoft.com/en-us/azure/storage/blobs) or local disk

Expand All @@ -38,15 +38,20 @@ Indexer:

## Installation

### From binary
### From PyPI

[Download the latest release from the releases page](http://github.com/clemlesne/scrape-it-now/releases/latest). Binaries are available for Linux, macOS and Windows.
```bash
# Install the package
python3 -m pip install scrape-it-now
# Run the CLI
scrape-it-now --help
```

For configuring the CLI (including authentication to the backend services), use environment variables, a `.env` file or command line options.
To configure the CLI (including authentication to the backend services), use environment variables, a `.env` file or command line options.

### From sources

Application must be run with Python 3.12 or later. If this version is not installed, an easy way to install it is [pyenv](https://github.com/pyenv/pyenv).
Application must be run with Python 3.13 or later. If this version is not installed, an easy way to install it is [pyenv](https://github.com/pyenv/pyenv).

```bash
# Download the source code
Expand Down Expand Up @@ -421,6 +426,6 @@ Proxies are not implemented in the application. Network security cannot be achie
### Bundle with a container
As the application is packaged as a binary, it can easily be bundled with a container. At every start, the application will download the dependencies (browser, etc.) and cache them. You can pre-download them by running the command `scrape-it-now scrape install`.
As the application is published to PyPI, it can easily be bundled with a container. At every start, the application will download the dependencies (browser, etc.) and cache them. You can pre-download them by running the command `scrape-it-now scrape install`.
A good technique for performance would also be to parallelize the scraping and indexing jobs by running multiple containers of each. This can be achieved with [KEDA](https://keda.sh), by configuring a [queue scaler](https://keda.sh/docs/2.16/scalers/azure-storage-queue).
Loading

0 comments on commit 14396e9

Please sign in to comment.