From e4a13156b91ec4eadf410fed21e7e06efe737de4 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 29 Oct 2020 00:46:57 +0100 Subject: [PATCH 1/3] i fixed the docker --- .dockerignore | 3 + .env | 1 + Dockerfile | 135 +++++++++++++++++------------------ docker-compose.env.example | 27 ++++--- docker-compose.yml.example | 92 +++++++++++------------- scripts/docker-entrypoint.sh | 113 ++++++++++++----------------- 6 files changed, 174 insertions(+), 197 deletions(-) create mode 100644 .dockerignore create mode 100644 .env diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..7534368f0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +src-ui/node_modules +src-ui/dist +.git diff --git a/.env b/.env new file mode 100644 index 000000000..511a1386d --- /dev/null +++ b/.env @@ -0,0 +1 @@ +COMPOSE_PROJECT_NAME=paperless \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 2b95288e6..4284b35ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,74 +1,71 @@ -FROM alpine:3.11 +############################################################################### +### Front end ### +############################################################################### -LABEL maintainer="The Paperless Project https://github.com/the-paperless-project/paperless" \ - contributors="Guy Addadi , Pit Kleyersburg , \ - Sven Fischer " +FROM node:current AS frontend + +WORKDIR /usr/src/paperless/src-ui/ + +COPY src-ui/package* ./ +RUN npm install + +COPY src-ui . +RUN node_modules/.bin/ng build --prod --output-hashing none + +############################################################################### +### Back end ### +############################################################################### + +FROM python:3.8-slim + +WORKDIR /usr/src/paperless/ + +COPY Pipfile* ./ + +#Dependencies +RUN apt-get update \ + && apt-get -y --no-install-recommends install \ + build-essential \ + curl \ + ghostscript \ + gnupg \ + imagemagick \ + libmagic-dev \ + libpoppler-cpp-dev \ + libpq-dev \ + optipng \ + sudo \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-deu \ + tesseract-ocr-fra \ + tesseract-ocr-ita \ + tesseract-ocr-spa \ + tzdata \ + unpaper \ + && pip install --upgrade pipenv \ + && pipenv install --system --deploy \ + && apt-get -y purge build-essential \ + && apt-get -y autoremove --purge \ + && rm -rf /var/lib/apt/lists/* + +# # Copy application +COPY scripts/gunicorn.conf.py ./ +COPY src/ ./src/ +COPY --from=frontend /usr/src/paperless/src-ui/dist/paperless-ui/ ./src/documents/static/ + +RUN addgroup --gid 1000 paperless && \ + useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless && \ + chown -R paperless:paperless . + +WORKDIR /usr/src/paperless/src/ + +RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input + +VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/consume", "/usr/src/paperless/export"] -# Copy Pipfiles file, init script and gunicorn.conf -COPY Pipfile* /usr/src/paperless/ COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh -COPY scripts/gunicorn.conf /usr/src/paperless/ - -# Set export and consumption directories -ENV PAPERLESS_EXPORT_DIR=/export \ - PAPERLESS_CONSUMPTION_DIR=/consume - -RUN apk add --no-cache \ - bash \ - curl \ - ghostscript \ - gnupg \ - imagemagick \ - libmagic \ - libpq \ - optipng \ - poppler \ - python3 \ - shadow \ - sudo \ - tesseract-ocr \ - tzdata \ - unpaper && \ - apk add --no-cache --virtual .build-dependencies \ - g++ \ - gcc \ - jpeg-dev \ - musl-dev \ - poppler-dev \ - postgresql-dev \ - python3-dev \ - zlib-dev && \ -# Install python dependencies - python3 -m ensurepip && \ - rm -r /usr/lib/python*/ensurepip && \ - cd /usr/src/paperless && \ - pip3 install --upgrade pip pipenv && \ - pipenv install --system --deploy && \ -# Remove build dependencies - apk del .build-dependencies && \ -# Create the consumption directory - mkdir -p $PAPERLESS_CONSUMPTION_DIR && \ -# Create user - addgroup -g 1000 paperless && \ - adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \ - chown -Rh paperless:paperless /usr/src/paperless && \ - mkdir -p $PAPERLESS_EXPORT_DIR && \ -# Avoid setrlimit warnings -# See: https://gitlab.alpinelinux.org/alpine/aports/issues/11122 - echo 'Set disable_coredump false' >> /etc/sudo.conf && \ -# Setup entrypoint - chmod 755 /sbin/docker-entrypoint.sh - -WORKDIR /usr/src/paperless/src -# Mount volumes and set Entrypoint -VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] +RUN chmod 755 /sbin/docker-entrypoint.sh ENTRYPOINT ["/sbin/docker-entrypoint.sh"] -CMD ["--help"] -# Copy application -COPY src/ /usr/src/paperless/src/ -COPY data/ /usr/src/paperless/data/ -COPY media/ /usr/src/paperless/media/ - -# Collect static files -RUN sudo -HEu paperless /usr/src/paperless/src/manage.py collectstatic --clear --no-input +CMD ["--help"] diff --git a/docker-compose.env.example b/docker-compose.env.example index 7117c7dad..d5339db1f 100644 --- a/docker-compose.env.example +++ b/docker-compose.env.example @@ -1,3 +1,11 @@ +PAPERLESS_DBENGINE="django.db.backends.postgresql_psycopg2" +PAPERLESS_DBHOST="db" +PAPERLESS_DBNAME="paperless" +PAPERLESS_DBUSER="paperless" +PAPERLESS_DBPASS="paperless" + +PAPERLESS_CONSUMPTION_DIR="../consume" + # Environment variables to set for Paperless # Commented out variables will be replaced with a default within Paperless. # @@ -5,24 +13,23 @@ # paperless.conf.example here. Values like: # # * PAPERLESS_PASSPHRASE -# * PAPERLESS_CONSUMPTION_DIR # * PAPERLESS_CONSUME_MAIL_HOST # # ...are all explained in that file but can be defined here, since the Docker # installation doesn't make use of paperless.conf. # Use this variable to set a timezone for the Paperless Docker containers. If not specified, defaults to UTC. -# TZ=America/Los_Angeles +#TZ=America/Los_Angeles # Additional languages to install for text recognition. Note that this is # different from PAPERLESS_OCR_LANGUAGE (default=eng), which defines the # default language used when guessing the language from the OCR output. -# PAPERLESS_OCR_LANGUAGES=deu ita - -# Set Paperless to use SSL for the web interface. -# Enabling this will require ssl.key and ssl.cert files in paperless' data directory. -# PAPERLESS_USE_SSL=false +# The container installs English, German, Italian, Spanish and French by +# default. +#PAPERLESS_OCR_LANGUAGES=deu ita spa fra -# You can change the default user and group id to a custom one -# USERMAP_UID=1000 -# USERMAP_GID=1000 +# The UID and GID of the user used to run paperless in the container. Set this +# to your UID and GID on the host so that you have write access to the +# consumption directory. +#USERMAP_UID=1000 +#USERMAP_GID=1000 diff --git a/docker-compose.yml.example b/docker-compose.yml.example index f8c920f95..563bec742 100644 --- a/docker-compose.yml.example +++ b/docker-compose.yml.example @@ -1,56 +1,44 @@ -version: '2.1' - +version: "3.8" services: - webserver: - build: ./ - # uncomment the following line to start automatically on system boot - # restart: always - ports: - # You can adapt the port you want Paperless to listen on by - # modifying the part before the `:`. - - "8000:8000" - healthcheck: - test: ["CMD", "curl" , "-f", "http://localhost:8000"] - interval: 30s - timeout: 10s - retries: 5 - volumes: - - data:/usr/src/paperless/data - - media:/usr/src/paperless/media - # You have to adapt the local path you want the consumption - # directory to mount to by modifying the part before the ':'. - - ./consume:/consume - env_file: docker-compose.env - # The reason the line is here is so that the webserver that doesn't do - # any text recognition and doesn't have to install unnecessary - # languages the user might have set in the env-file by overwriting the - # value with nothing. - environment: - - PAPERLESS_OCR_LANGUAGES= - command: ["gunicorn", "-b", "0.0.0.0:8000"] + db: + image: postgres:13 + #restart: always + environment: + POSTGRES_DB: paperless + POSTGRES_USER: paperless + POSTGRES_PASSWORD: paperless + + webserver: + build: . + image: paperless_app + #restart: always + depends_on: + - db + ports: + - "8000:8000" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000"] + interval: 30s + timeout: 10s + retries: 5 + volumes: + - data:/usr/src/paperless/data + env_file: docker-compose.env + environment: + - PAPERLESS_OCR_LANGUAGES= + command: ["gunicorn", "-b", "0.0.0.0:8000"] - consumer: - build: ./ - # uncomment the following line to start automatically on system boot - # restart: always - depends_on: - webserver: - condition: service_healthy - volumes: - - data:/usr/src/paperless/data - - media:/usr/src/paperless/media - # This should be set to the same value as the consume directory - # in the webserver service above. - - ./consume:/consume - # Likewise, you can add a local path to mount a directory for - # exporting. This is not strictly needed for paperless to - # function, only if you're exporting your files: uncomment - # it and fill in a local path if you know you're going to - # want to export your documents. - # - /path/to/another/arbitrary/place:/export - env_file: docker-compose.env - command: ["document_consumer"] + consumer: + image: paperless_app + depends_on: + - webserver + - db + restart: on-failure:5 + volumes: + - data:/usr/src/paperless/data + - ./consume:/usr/src/paperless/consume + env_file: docker-compose.env + command: ["document_consumer"] volumes: - data: - media: + data: diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh index 2028413d1..3c9ec5c4a 100644 --- a/scripts/docker-entrypoint.sh +++ b/scripts/docker-entrypoint.sh @@ -1,4 +1,5 @@ #!/bin/bash + set -e # Source: https://github.com/sameersbn/docker-gitlab/ @@ -14,83 +15,72 @@ map_uidgid() { fi } -set_permissions() { - # Set permissions for consumption and export directory - for dir in PAPERLESS_CONSUMPTION_DIR PAPERLESS_EXPORT_DIR; do - # Extract the name of the current directory from $dir for the error message - cur_dir_name=$(echo "$dir" | awk -F'_' '{ print tolower($2); }') - chgrp paperless "${!dir}" || { - echo "Changing group of ${cur_dir_name} directory:" - echo " ${!dir}" - echo "failed." - echo "" - echo "Either try to set it on your host-mounted directory" - echo "directly, or make sure that the directory has \`g+wx\`" - echo "permissions and the files in it at least \`o+r\`." - } >&2 - chmod g+wx "${!dir}" || { - echo "Changing group permissions of ${cur_dir_name} directory:" - echo " ${!dir}" - echo "failed." - echo "" - echo "Either try to set it on your host-mounted directory" - echo "directly, or make sure that the directory has \`g+wx\`" - echo "permissions and the files in it at least \`o+r\`." - } >&2 - done - # Set permissions for application directory - chown -Rh paperless:paperless /usr/src/paperless -} - migrations() { # A simple lock file in case other containers use this startup LOCKFILE="/usr/src/paperless/data/db.sqlite3.migration" - # check for and create lock file in one command + # check for and create lock file in one command if (set -o noclobber; echo "$$" > "${LOCKFILE}") 2> /dev/null then trap 'rm -f "${LOCKFILE}"; exit $?' INT TERM EXIT - sudo -HEu paperless "/usr/src/paperless/src/manage.py" "migrate" + sudo -HEu paperless python3 manage.py migrate rm ${LOCKFILE} fi } initialize() { - map_uidgid - set_permissions - migrations -} + map_uidgid -install_languages() { - local langs="$1" - read -ra langs <<<"$langs" + for data_dir in index media media/documents media/thumbnails; do + if [[ ! -d "../data/$data_dir" ]] + then + echo "creating directory ../data/$data_dir" + mkdir ../data/$data_dir + fi + done - # Check that it is not empty - if [ ${#langs[@]} -eq 0 ]; then - return - fi + chown -R paperless:paperless ../ + + migrations - # Loop over languages to be installed - for lang in "${langs[@]}"; do - pkg="tesseract-ocr-data-$lang" +} +install_languages() { + echo "TEST" + local langs="$1" + read -ra langs <<<"$langs" + + # Check that it is not empty + if [ ${#langs[@]} -eq 0 ]; then + return + fi + apt-get update + + for lang in "${langs[@]}"; do + pkg="tesseract-ocr-$lang" # English is installed by default - if [[ "$lang" == "eng" ]]; then - continue - fi + #if [[ "$lang" == "eng" ]]; then + # continue + #fi - if apk info -e "$pkg" > /dev/null 2>&1; then - continue + if dpkg -s $pkg &> /dev/null; then + echo "package $pkg already installed!" + continue fi - if ! apk --no-cache info "$pkg" > /dev/null 2>&1; then - continue + + if ! apt-cache show $pkg &> /dev/null; then + echo "package $pkg not found! :(" + continue fi - apk --no-cache --update add "$pkg" + echo "Installing package $pkg..." + if ! apt-get -y install "$pkg" &> /dev/null; then + echo "Could not install $pkg" + exit 1 + fi done } - if [[ "$1" != "/"* ]]; then initialize @@ -101,21 +91,12 @@ if [[ "$1" != "/"* ]]; then if [[ "$1" = "gunicorn" ]]; then shift - EXTRA_PARAMS="" - SSL_KEY_PATH="/usr/src/paperless/data/ssl.key" - SSL_CERT_PATH="/usr/src/paperless/data/ssl.cert" - if [ "${PAPERLESS_USE_SSL}" = "true" ]; then - if [ -f "${SSL_KEY_PATH}" ] && [ -f "${SSL_CERT_PATH}" ]; then - EXTRA_PARAMS="--certfile=${SSL_CERT_PATH} --keyfile=${SSL_KEY_PATH}" - else - echo "Error: Could not find certfile in ${SSL_CERT_PATH} or keyfile in ${SSL_KEY_PATH}, but \$PAPERLESS_USE_SSL is true. Starting without SSL enabled." - fi - fi cd /usr/src/paperless/src/ && \ - exec sudo -HEu paperless /usr/bin/gunicorn -c /usr/src/paperless/gunicorn.conf ${EXTRA_PARAMS} "$@" paperless.wsgi - else - exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@" + exec sudo -HEu paperless gunicorn -c /usr/src/paperless/gunicorn.conf.py "$@" paperless.wsgi fi + + exec sudo -HEu paperless python3 manage.py "$@" + fi exec "$@" From 7e9b584905adbfd0ab9b3dcf353f6ddd7d9896b7 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 29 Oct 2020 00:54:42 +0100 Subject: [PATCH 2/3] missing migrations --- .../migrations/1003_auto_20201028_1751.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 src/documents/migrations/1003_auto_20201028_1751.py diff --git a/src/documents/migrations/1003_auto_20201028_1751.py b/src/documents/migrations/1003_auto_20201028_1751.py new file mode 100644 index 000000000..66dd329e1 --- /dev/null +++ b/src/documents/migrations/1003_auto_20201028_1751.py @@ -0,0 +1,32 @@ +# Generated by Django 3.1.2 on 2020-10-28 17:51 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1002_auto_20180823_1155'), + ] + + operations = [ + migrations.AlterModelOptions( + name='documenttype', + options={'ordering': ('name',)}, + ), + migrations.AlterField( + model_name='correspondent', + name='matching_algorithm', + field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'), + ), + migrations.AlterField( + model_name='documenttype', + name='matching_algorithm', + field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'), + ), + migrations.AlterField( + model_name='tag', + name='matching_algorithm', + field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'), + ), + ] From e6f9baf20f67a4a736f074621de6596bacf331eb Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Thu, 29 Oct 2020 01:11:57 +0100 Subject: [PATCH 3/3] Create codeql-analysis.yml --- .github/workflows/codeql-analysis.yml | 68 +++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 000000000..111e5bf91 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,68 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# ******** NOTE ******** + +name: "CodeQL" + +on: + push: + branches: [ master ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ master ] + schedule: + - cron: '42 3 * * 1' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + language: [ 'javascript', 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] + # Learn more... + # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v1 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1