From 724afa59c75853bf71e735650133e4d414558dfa Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Wed, 17 Feb 2016 18:45:04 +0100 Subject: [PATCH] Add Dockerfile for application and documentation This commit adds a `Dockerfile` to the root of the project, accompanied by a `docker-compose.yml.example` for simplified deployment. The `Dockerfile` is agnostic to whether it will be the webserver, the consumer, or if it is run for a one-off command (i.e. creation of a superuser, migration of the database, document export, ...). The containers entrypoint is the `scripts/docker-entrypoint.sh` script. This script verifies that the required permissions are set, remaps the default users and/or groups id if required and installs additional languages if the user wishes to. After initialization, it analyzes the command the user supplied: - If the command starts with a slash, it is expected that the user wants to execute a binary file and the command will be executed without further intervention. (Using `exec` to effectively replace the started shell-script and not have any reaping-issues.) - If the command does not start with a slash, the command will be passed directly to the `manage.py` script without further modification. (Again using `exec`.) The default command is set to `--help`. If the user wants to execute a command that is not meant for `manage.py` but doesn't start with a slash, the Docker `--entrypoint` parameter can be used to circumvent the mechanics of `docker-entrypoint.sh`. Further information can be found in `docs/setup.rst` and in `docs/migrating.rst`. For additional convenience, a `Dockerfile` has been added to the `docs/` directory which allows for easy building and serving of the documentation. This is documented in `docs/requirements.rst`. 
---
 .gitignore                                    |   1 +
 Dockerfile                                    |  43 +++++
 docker-compose.env                            |  15 ++
 docker-compose.yml.example                    |  31 ++++
 docs/Dockerfile                               |  18 ++
 docs/migrating.rst                            |  95 ++++++++++
 docs/requirements.rst                         |  13 ++
 docs/setup.rst                                | 167 +++++++++++++++++-
 scripts/docker-entrypoint.sh                  |  74 ++++++++
 .../management/commands/loaddata_stdin.py     |  23 +++
 10 files changed, 474 insertions(+), 6 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.env
 create mode 100644 docker-compose.yml.example
 create mode 100644 docs/Dockerfile
 create mode 100644 scripts/docker-entrypoint.sh
 create mode 100644 src/documents/management/commands/loaddata_stdin.py

diff --git a/.gitignore b/.gitignore
index 908fa9748..2c65f8dcd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,7 @@ db.sqlite3
 # Other stuff that doesn't belong
 virtualenv
 .vagrant
+docker-compose.yml
 
 # Used for development
 scripts/import-for-development
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..dade863ca
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.5.1
+LABEL maintainer="Pit Kleyersburg"
+
+# Install system dependencies (sudo lets the entrypoint drop to the paperless user)
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        sudo \
+        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python dependencies before copying the sources to keep this layer cached
+RUN mkdir -p /usr/src/paperless
+WORKDIR /usr/src/paperless
+COPY requirements.txt /usr/src/paperless/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application
+RUN mkdir -p /usr/src/paperless/src
+COPY src/ /usr/src/paperless/src/
+
+# Set consumption directory (the entrypoint reads PAPERLESS_CONSUME at start-up)
+ENV PAPERLESS_CONSUME=/consume
+RUN mkdir -p "$PAPERLESS_CONSUME"
+
+# Migrate database
+WORKDIR /usr/src/paperless/src
+RUN mkdir /usr/src/paperless/data
+RUN ./manage.py migrate
+
+# Create the unprivileged user and group (id 1000) that manage.py runs as
+RUN groupadd -g 1000 paperless \
+    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
+    && chown -Rh paperless:paperless /usr/src/paperless
+
+# Setup entrypoint
+COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
+RUN chmod 755 /sbin/docker-entrypoint.sh
+
+# Mount volumes (declared after the directories above have been populated)
+VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
+
+ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
+CMD ["--help"]
diff --git a/docker-compose.env b/docker-compose.env
new file mode 100644
index 000000000..13c74b6ab
--- /dev/null
+++ b/docker-compose.env
@@ -0,0 +1,15 @@
+# Environment variables to set for Paperless
+# Commented out variables will be replaced by a default within Paperless.
+
+# Passphrase Paperless uses to encrypt and decrypt your documents
+PAPERLESS_PASSPHRASE=CHANGE_ME
+
+# The number of threads to use for text recognition
+# PAPERLESS_OCR_THREADS=4
+
+# Additional languages to install for text recognition
+# PAPERLESS_OCR_LANGUAGES=deu ita
+
+# You can change the default user and group id to a custom one
+# USERMAP_UID=1000
+# USERMAP_GID=1000
diff --git a/docker-compose.yml.example b/docker-compose.yml.example
new file mode 100644
index 000000000..f8e9b5b93
--- /dev/null
+++ b/docker-compose.yml.example
@@ -0,0 +1,31 @@
+version: '2'
+
+services:
+    webserver:
+        image: paperless
+        ports:
+            # You can adapt the port you want Paperless to listen on by
+            # modifying the part before the `:`.
+            - "8000:8000"
+        volumes:
+            - paperless-data:/usr/src/paperless/data
+            - paperless-media:/usr/src/paperless/media
+        env_file: docker-compose.env
+        environment:
+            - PAPERLESS_OCR_LANGUAGES=
+        command: ["runserver", "0.0.0.0:8000"]
+
+    consumer:
+        image: paperless
+        volumes:
+            - paperless-data:/usr/src/paperless/data
+            - paperless-media:/usr/src/paperless/media
+            # You have to adapt the local path you want the consumption
+            # directory to mount to by modifying the part before the ':'.
+            - /path/to/arbitrary/place:/consume
+        env_file: docker-compose.env
+        command: ["document_consumer"]
+
+volumes:
+    paperless-data:
+    paperless-media:
diff --git a/docs/Dockerfile b/docs/Dockerfile
new file mode 100644
index 000000000..ee63aebb4
--- /dev/null
+++ b/docs/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.5.1
+LABEL maintainer="Pit Kleyersburg"
+
+# Install Sphinx and Pygments (no pip cache is kept in the image)
+RUN pip install --no-cache-dir Sphinx Pygments
+
+# Setup directories, copy data (the build context must be the project root)
+RUN mkdir /build
+COPY . /build
+WORKDIR /build/docs
+
+# Build documentation
+RUN make html
+
+# Start webserver (python's http.server serves the working directory on port 8000)
+WORKDIR /build/docs/_build/html
+EXPOSE 8000/tcp
+CMD ["python3", "-m", "http.server"]
diff --git a/docs/migrating.rst b/docs/migrating.rst
index 46083533a..1e03bb3cb 100644
--- a/docs/migrating.rst
+++ b/docs/migrating.rst
@@ -30,6 +30,20 @@ as part of the update:
 Note that it's possible (even likely) that while ``git pull`` may update some
 files, the ``migrate`` step may not update anything. This is totally normal.
 
+If you are :ref:`using Docker <setup-installation-docker>` the update process
+requires only one additional step:
+
+.. code-block:: shell-session
+
+    $ cd /path/to/project
+    $ git pull
+    $ docker build -t paperless .
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver migrate
+
+If ``git pull`` doesn't report any changes, there is no need to continue with
+the remaining steps.
+
 .. _migrating-backup:
 
 
@@ -53,6 +67,65 @@ with Django's ``dumpdata`` command, which produces JSON output.
     $ ./manage.py document_export /path/to/arbitrary/place/
     $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
 
+If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags
+as JSON is almost as easy:
+
+.. code-block:: shell-session
+
+    $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
+
+Exporting the documents though is a little more involved, since docker-compose
+doesn't support mounting additional volumes with the ``run`` command. You have
+three general options:
+
+1.
Use the consumption directory if you happen to already have it mounted to a + host directory. + + .. code-block:: console + + $ # Stop the consumer so that it doesn't consume the exported documents + $ docker-compose stop consumer + $ # Export into the consumption directory + $ docker-compose run --rm consumer document_exporter /consume + +2. Add another volume to ``docker-compose.yml`` for exports and use + ``docker-compose run``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - /consume + + - /path/to/arbitrary/place:/export + + .. code-block:: shell-session + + $ docker-compose run --rm consumer document_exporter /export + +3. Use ``docker run`` directly, supplying the necessary commandline options: + + .. code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export + .. _migrating-restoring: @@ -77,3 +150,25 @@ exported documents into the consumption directory and start up the consumer. $ cp /path/to/exported/docs/* /path/to/consumption/dir/ $ ./manage.py document_consumer +Importing your data if you are :ref:`using Docker <setup-installation-docker>` +is almost as simple: + +.. 
code-block:: shell-session + + $ # Stop and remove your current containers + $ docker-compose stop + $ docker-compose rm -f + + $ # Recreate them, add the superuser + $ docker-compose up -d + $ docker-compose run --rm webserver createsuperuser + + $ # Load the tags + $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - + + $ # Load your exported documents into the consumption directory + $ # (How you do this highly depends on how you have set this up) + $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ + +After loading the documents into the consumption directory the consumer will +immediately start consuming the documents. diff --git a/docs/requirements.rst b/docs/requirements.rst index 1c4f989db..ee287d835 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to: $ pip install sphinx and then cd into the ``docs`` directory and type ``make html``. + +If you are using Docker, you can use the following commands to build the +documentation and run a webserver serving it on `port 8001`_: + +.. code:: bash + + $ pwd + /path/to/paperless + + $ docker build -t paperless:docs -f docs/Dockerfile . + $ docker run --rm -it -p "8001:8000" paperless:docs + +.. _port 8001: http://127.0.0.1:8001 diff --git a/docs/setup.rst b/docs/setup.rst index 24a9b9fa2..796de88e6 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -37,11 +37,18 @@ or just download the tarball and go that route: Installation & Configuration ---------------------------- -You can go two routes with setting up and running Paperless. The *Vagrant* -route is quick & easy, but means you're running a VM which comes with memory -consumption etc. Alternatively the standard, "bare metal" approach is a little -more complicated. +You can go multiple routes with setting up and running Paperless. 
The `Vagrant +route`_ is quick & easy, but means you're running a VM which comes with memory +consumption etc. We also `support Docker`_, which you can use natively under +Linux and in a VM with `Docker Machine`_ (this guide was written for native +Docker usage under Linux, you might have to adapt it for Docker Machine.) +Alternatively the standard, `bare metal`_ approach is a little more complicated. +.. _Vagrant route: setup-installation-vagrant_ +.. _support Docker: setup-installation-docker_ +.. _bare metal: setup-installation-standard_ + +.. _Docker Machine: https://docs.docker.com/machine/ .. _setup-installation-standard: @@ -118,6 +125,150 @@ Vagrant Method .. _Paperless server: http://172.28.128.4:8000 +.. _setup-installation-docker: + +Docker Method +............. + +1. Install `Docker`_. + + .. caution:: + + As mentioned earlier, this guide assumes that you use Docker natively + under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows, + you will have to adapt IP addresses, volume-mounting, command execution + and maybe more. + +2. Install `docker-compose`_. [#compose]_ + + .. caution:: + + If you want to use the included ``docker-compose.yml.example`` file, you + need to have at least Docker version **1.10.0** and docker-compose + version **1.6.0**. + + See the `Docker installation guide`_ on how to install the current + version of Docker for your operating system or Linux distribution of + choice. To get an up-to-date version of docker-compose, follow the + `docker-compose installation guide`_ if your package repository doesn't + include it. + + .. _Docker installation guide: https://docs.docker.com/engine/installation/ + .. _docker-compose installation guide: https://docs.docker.com/compose/install/ + +3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``. +4. 
Modify ``docker-compose.env`` and adapt the following environment variables: + + ``PAPERLESS_PASSPHRASE`` + This is the passphrase Paperless uses to encrypt/decrypt the original + document. + + ``PAPERLESS_OCR_THREADS`` + This is the number of threads the OCR process will spawn to process + document pages in parallel. If the variable is not set, Python determines + the core-count of your CPU and uses that value. + + ``PAPERLESS_OCR_LANGUAGES`` + If you want the OCR to recognize other languages in addition to the default + English, set this parameter to a space separated list of three-letter + language-codes after `ISO 639-2/T`_. For a list of available languages -- + including their three letter codes -- see the `Debian packagelist`_. + + ``USERMAP_UID`` and ``USERMAP_GID`` + If you want to mount the consumption volume (directory ``/consume`` within + the containers) to a host-directory -- which you probably want to do -- + access rights might be an issue. The default user and group ``paperless`` + in the containers have an id of 1000. The containers will enforce that the + owning group of the consumption directory will be ``paperless`` to be able + to delete consumed documents. If your host-system has a group with an id of + 1000 and you don't want this group to have access rights to the consumption + directory, you can use ``USERMAP_GID`` to change the id in the container + and thus the one of the consumption directory. Furthermore, you can change + the id of the default user as well using ``USERMAP_UID``. + +5. Run ``docker-compose up -d``. This will create and start the necessary + containers. +6. To be able to login, you will need a super user. To create it, execute the + following command: + + .. code-block:: shell-session + + $ docker-compose run --rm webserver createsuperuser + + This will prompt you to set a username (default ``paperless``), an optional + e-mail address and finally a password. +7. 
The default ``docker-compose.yml`` exports the webserver on your local port + 8000. If you haven't adapted this, you should now be able to visit your + `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the + user and password you just created. +8. Add files to consumption directory the way you prefer to. Following are two + possible options: + + 1. Mount the consumption directory to a local host path by modifying your + ``docker-compose.yml``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - - /consume + + - /local/path/you/choose:/consume + + .. danger:: + + While the consumption container will ensure at startup that it can + **delete** a consumed file from a host-mounted directory, it might not + be able to **read** the document in the first place if the access + rights to the file are incorrect. + + Make sure that the documents you put into the consumption directory + will either be readable by everyone (``chmod o+r file.pdf``) or + readable by the default user or group id 1000 (or the one you have set + with ``USERMAP_UID`` or ``USERMAP_GID`` respectively). + + 2. Use ``docker cp`` to copy your files directly into the container: + + .. code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume + + ``docker cp`` is a one-shot-command, just like ``cp``. This means that + every time you want to consume a new document, you will have to execute + ``docker cp`` again. 
You can of course automate this process, but option 1 + is generally the preferred one. + + .. danger:: + + ``docker cp`` will change the owning user and group of a copied file + to the acting user at the destination, which will be ``root``. + + You therefore need to ensure that the documents you want to copy into + the container are readable by everyone (``chmod o+r file.pdf``) before + copying them. + + +.. _Docker: https://www.docker.com/ +.. _docker-compose: https://docs.docker.com/compose/install/ +.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- + +.. [#compose] You of course don't have to use docker-compose, but it + simplifies deployment immensely. If you know your way around Docker, feel + free to tinker around without using compose! + + .. _making-things-a-little-more-permanent: Making Things a Little more Permanent @@ -126,5 +277,9 @@ Making Things a Little more Permanent Once you've tested things and are happy with the work flow, you can automate the process of starting the webserver and consumer automatically. If you're running on a bare metal system that's using Systemd, you can use the service unit files -in the ``scripts`` directory to set this up. If you're on a SysV or other -startup system (like the Vagrant box), then you're currently on your own. +in the ``scripts`` directory to set this up. If you're on another startup +system or are using a Vagrant box, then you're currently on your own. If you are +using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to +have the containers automatically start with the Docker daemon. + +.. 
_restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh
new file mode 100644
index 000000000..9001574a1
--- /dev/null
+++ b/scripts/docker-entrypoint.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+set -e
+
+# Remap the uid/gid of the paperless user and group to USERMAP_UID and
+# USERMAP_GID; an unset USERMAP_GID falls back to USERMAP_UID, then the old gid.
+# Source: https://github.com/sameersbn/docker-gitlab/
+map_uidgid() {
+    USERMAP_ORIG_UID=$(id -u paperless)
+    USERMAP_ORIG_GID=$(id -g paperless)
+    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
+    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
+    if [[ "${USERMAP_UID}" != "${USERMAP_ORIG_UID}" || "${USERMAP_GID}" != "${USERMAP_ORIG_GID}" ]]; then
+        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
+        groupmod -g "${USERMAP_GID}" paperless
+        # The pattern pairs the original uid with the *new* gid: groupmod is
+        # expected to have updated the user's primary gid in /etc/passwd already.
+        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
+    fi
+}
+
+# Hand the consumption and application directories to the paperless user/group.
+set_permissions() {
+    # The paperless group has to traverse the consumption directory and delete
+    # consumed documents from it, which needs execute and write permission.
+    chgrp paperless "$PAPERLESS_CONSUME"
+    chmod g+wx "$PAPERLESS_CONSUME"
+
+    # Set permissions for application directory
+    chown -Rh paperless:paperless /usr/src/paperless
+}
+
+# Run the start-up steps that must happen before privileges are dropped.
+initialize() {
+    map_uidgid
+    set_permissions
+}
+
+# Install the tesseract language packs named in the space separated list $1.
+# Packs that are already installed or unknown to apt are skipped silently.
+install_languages() {
+    local langs="$1"
+    local lang pkg
+    read -ra langs <<<"$langs"
+
+    # Check that it is not empty
+    if [ ${#langs[@]} -eq 0 ]; then
+        return
+    fi
+
+    # Update apt-lists
+    apt-get update
+
+    # Loop over languages to be installed
+    for lang in "${langs[@]}"; do
+        pkg="tesseract-ocr-$lang"
+        # Already installed: nothing to do (both output streams are discarded)
+        if dpkg -s "$pkg" > /dev/null 2>&1; then
+            continue
+        fi
+
+        # Not available in the package lists: skip instead of failing
+        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
+            continue
+        fi
+
+        apt-get install -y "$pkg"
+    done
+
+    # Remove apt lists
+    rm -rf /var/lib/apt/lists/*
+}
+
+
+# Commands that do not start with a slash are passed to manage.py and run as
+# the paperless user; anything else is executed exactly as given.
+if [[ "$1" != "/"* ]]; then
+    initialize
+
+    # Install additional languages if specified
+    if [ ! 
-z "$PAPERLESS_OCR_LANGUAGES" ]; then
+        install_languages "$PAPERLESS_OCR_LANGUAGES"
+    fi
+
+    exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
+fi
+
+exec "$@"
+
diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py
new file mode 100644
index 000000000..b6848f1eb
--- /dev/null
+++ b/src/documents/management/commands/loaddata_stdin.py
@@ -0,0 +1,33 @@
+"""
+Source:
+    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
+
+License:
+    MIT
+    Copyright (c) 2016 Baptiste Mispelon
+"""
+import sys
+
+from django.core.management.commands.loaddata import Command as LoadDataCommand
+
+
+class Command(LoadDataCommand):
+    """``loaddata`` variant that reads a JSON fixture from stdin when the
+    fixture label is ``-``; every other label is handled by the parent."""
+
+    def parse_name(self, fixture_name):
+        """Return the 3-tuple describing ``fixture_name``; ``-`` maps to a
+        JSON fixture using the ``stdin`` pseudo compression format."""
+        # Register a pseudo compression format whose opener hands back stdin.
+        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
+        if fixture_name == '-':
+            return '-', 'json', 'stdin'
+        # Regular names must still be parsed, otherwise callers receive None.
+        return super(Command, self).parse_name(fixture_name)
+
+    def find_fixtures(self, fixture_label):
+        """Return the fixture entries for ``fixture_label``; ``-`` yields one
+        fixed entry instead of going through the parent's lookup."""
+        if fixture_label == '-':
+            return [('-', None, '-')]
+        return super(Command, self).find_fixtures(fixture_label)