Skip to content
This repository has been archived by the owner on Feb 16, 2023. It is now read-only.

Commit

Permalink
added a task scheduler for recurring tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonas Winkler committed Nov 9, 2020
1 parent 5871ce3 commit 9d22d9c
Show file tree
Hide file tree
Showing 15 changed files with 240 additions and 189 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ target/
.virtualenv
virtualenv
/venv
docker-compose.yml
docker-compose.env

# Used for development
Expand Down
8 changes: 2 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ COPY Pipfile* ./
#Dependencies
RUN apt-get update \
&& DEBIAN_FRONTEND="noninteractive" apt-get -y --no-install-recommends install \
anacron \
build-essential \
curl \
ghostscript \
Expand Down Expand Up @@ -60,7 +59,6 @@ RUN apt-get update \
COPY scripts/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
COPY scripts/gunicorn.conf.py ./
COPY scripts/supervisord.conf /etc/supervisord.conf
COPY scripts/paperless-cron /etc/cron.daily/
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh

# copy app
Expand All @@ -71,16 +69,14 @@ COPY --from=frontend /usr/src/paperless/src-ui/dist/paperless-ui/ ./src/document
RUN addgroup --gid 1000 paperless \
&& useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \
&& chown -R paperless:paperless . \
&& chmod 755 /sbin/docker-entrypoint.sh \
&& chmod +x /etc/cron.daily/paperless-cron \
&& rm /etc/cron.daily/apt-compat /etc/cron.daily/dpkg
&& chmod 755 /sbin/docker-entrypoint.sh

WORKDIR /usr/src/paperless/src/

RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input

VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/consume", "/usr/src/paperless/export"]
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
CMD ["python3", "manage.py", "--help"]
CMD ["supervisord", "-c", "/etc/supervisord.conf"]

LABEL maintainer="Jonas Winkler <[email protected]>"
4 changes: 3 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ gunicorn = "*"
whitenoise = "*"
fuzzywuzzy = "*"
python-Levenshtein = "*"
django-extensions = ""
django-extensions = "*"
watchdog = "*"
pathvalidate = "*"
django-q = "*"
redis = "*"

[dev-packages]
coveralls = "*"
Expand Down
177 changes: 111 additions & 66 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions docker-compose.env.example
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# Database settings for paperless
# If you want to use sqlite instead, remove this setting.
PAPERLESS_DBHOST="db"

# The UID and GID of the user used to run paperless in the container. Set this
# to your UID and GID on the host so that you have write access to the
# consumption directory.
Expand Down
12 changes: 9 additions & 3 deletions docker-compose.yml.example → docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
version: "3.4"
services:
broker:
image: redis:latest
#restart: always

db:
image: postgres:13
#restart: always
Expand All @@ -11,13 +15,12 @@ services:
POSTGRES_PASSWORD: paperless

webserver:
build: .
image: paperless-ng
image: paperless-ng:latest
#restart: always
depends_on:
- db
ports:
- "8000:8000"
- 8000:8000
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000"]
interval: 30s
Expand All @@ -29,6 +32,9 @@ services:
- ./export:/usr/src/paperless/export
- ./consume:/usr/src/paperless/consume
env_file: docker-compose.env
environment:
PAPERLESS_REDIS: redis://broker:6379
PAPERLESS_DBHOST: db
command: ["supervisord", "-c", "/etc/supervisord.conf"]


Expand Down
10 changes: 10 additions & 0 deletions paperless.conf.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@
# As this file contains passwords it should only be readable by the user
# running paperless.

###############################################################################
#### Message Broker ####
###############################################################################

# This is required for processing scheduled tasks such as email fetching and
# index optimization, and for training the automatic document matcher.
# Defaults to redis://localhost:6379.
#PAPERLESS_REDIS="redis://localhost:6379"


###############################################################################
#### Database Settings ####
###############################################################################
Expand Down
5 changes: 0 additions & 5 deletions scripts/paperless-cron

This file was deleted.

5 changes: 3 additions & 2 deletions scripts/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:anacron]
command=anacron -d
[program:scheduler]
command=python3 manage.py qcluster
user=paperless

stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
Expand Down
31 changes: 2 additions & 29 deletions src/documents/management/commands/document_create_classifier.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import logging

from django.core.management.base import BaseCommand
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from paperless import settings
from ...mixins import Renderable
from ...tasks import train_classifier


class Command(Renderable, BaseCommand):
Expand All @@ -18,27 +14,4 @@ def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)

def handle(self, *args, **options):
classifier = DocumentClassifier()

try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
pass

try:
if classifier.train():
logging.getLogger(__name__).info(
"Saving updated classifier model to {}...".format(settings.MODEL_FILE)
)
classifier.save_classifier()
else:
logging.getLogger(__name__).debug(
"Training data unchanged."
)

except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)
train_classifier()
15 changes: 3 additions & 12 deletions src/documents/management/commands/document_index.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from django.core.management import BaseCommand
from whoosh.writing import AsyncWriter

import documents.index as index
from documents.mixins import Renderable
from documents.models import Document
from documents.tasks import index_reindex, index_optimize


class Command(Renderable, BaseCommand):
Expand All @@ -22,13 +20,6 @@ def handle(self, *args, **options):
self.verbosity = options["verbosity"]

if options['command'] == 'reindex':
documents = Document.objects.all()

ix = index.open_index(recreate=True)

with AsyncWriter(ix) as writer:
for document in documents:
index.update_document(writer, document)

index_reindex()
elif options['command'] == 'optimize':
index.open_index().optimize()
index_optimize()
60 changes: 0 additions & 60 deletions src/documents/management/commands/document_rerun_ocr.py

This file was deleted.

28 changes: 28 additions & 0 deletions src/documents/migrations/1001_auto_20201109_1636.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36

from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule


def add_schedules(apps, schema_editor):
    """Register the recurring background tasks with the django-q scheduler.

    Forward step of this data migration. Creates one schedule each for
    classifier training (hourly), index optimization (daily) and mail
    checking (every 10 minutes).
    """
    recurring_tasks = (
        ('documents.tasks.train_classifier',
         {'name': "Train the classifier",
          'schedule_type': Schedule.HOURLY}),
        ('documents.tasks.index_optimize',
         {'name': "Optimize the index",
          'schedule_type': Schedule.DAILY}),
        ('documents.tasks.consume_mail',
         {'name': "Check E-Mail",
          'schedule_type': Schedule.MINUTES,
          'minutes': 10}),
    )
    for task_path, schedule_options in recurring_tasks:
        schedule(task_path, **schedule_options)


def remove_schedules(apps, schema_editor):
    """Reverse step: delete the schedules that run the tasks add_schedules registers.

    Only schedules pointing at the three paperless task functions are removed.
    Deleting every Schedule row would also wipe schedules that were created
    by other means and do not belong to this migration.
    """
    Schedule.objects.filter(
        func__in=[
            'documents.tasks.train_classifier',
            'documents.tasks.index_optimize',
            'documents.tasks.consume_mail',
        ]
    ).delete()


class Migration(migrations.Migration):
    """Data migration that installs the recurring task schedules."""

    # Needs the latest documents migration and the django_q schema that
    # provides the Schedule model used by the schedule helpers.
    dependencies = [
        ('documents', '1000_update_paperless_all'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        migrations.RunPython(
            code=add_schedules,
            reverse_code=remove_schedules,
        ),
    ]
57 changes: 57 additions & 0 deletions src/documents/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import logging

from django.conf import settings
from django_q.tasks import async_task, result
from whoosh.writing import AsyncWriter

from documents import index
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.mail import MailFetcher
from documents.models import Document


def consume_mail():
    """Create a MailFetcher and run a single pull with it."""
    fetcher = MailFetcher()
    fetcher.pull()


def index_optimize():
    """Open the existing search index and run its optimize step."""
    search_index = index.open_index()
    search_index.optimize()


def index_reindex():
    """Recreate the search index and add every stored document to it."""
    # recreate=True: open a recreated index rather than the existing one.
    fresh_index = index.open_index(recreate=True)

    with AsyncWriter(fresh_index) as index_writer:
        for doc in Document.objects.all():
            index.update_document(index_writer, doc)


def train_classifier():
    """Retrain the document classifier and save the model if it changed.

    A previously saved model is loaded first, since training may turn out to
    be unnecessary. A missing or incompatible model file is tolerated and
    training proceeds with the freshly constructed classifier.

    Errors raised while training or saving are logged together with their
    traceback and are not propagated to the caller.
    """
    logger = logging.getLogger(__name__)
    classifier = DocumentClassifier()

    try:
        # load the classifier, since we might not have to train it again.
        classifier.reload()
    except (FileNotFoundError, IncompatibleClassifierVersionError):
        # No usable model could be loaded; continue with the new instance.
        pass

    try:
        if classifier.train():
            logger.info(
                "Saving updated classifier model to {}...".format(
                    settings.MODEL_FILE)
            )
            classifier.save_classifier()
        else:
            logger.debug(
                "Training data unchanged."
            )

    except Exception as e:
        # Deliberately best-effort: a failed training run must not crash the
        # caller. logger.exception keeps the ERROR level and message but also
        # records the traceback, which was previously lost.
        logger.exception(
            "Classifier error: " + str(e)
        )
12 changes: 12 additions & 0 deletions src/paperless/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def __get_boolean(key, default="NO"):
"rest_framework",
"django_filters",

"django_q",

]

REST_FRAMEWORK = {
Expand Down Expand Up @@ -242,6 +244,16 @@ def __get_boolean(key, default="NO"):
},
}

###############################################################################
# Task queue #
###############################################################################

# django-q cluster configuration. The Redis URL is taken from the
# PAPERLESS_REDIS environment variable, falling back to a local instance.
Q_CLUSTER = dict(
    name='paperless',
    catch_up=False,
    redis=os.environ.get("PAPERLESS_REDIS", "redis://localhost:6379"),
)

###############################################################################
# Paperless Specific Settings #
###############################################################################
Expand Down

0 comments on commit 9d22d9c

Please sign in to comment.