From a70b40f618501cd13b2d506e099e523b3269779b Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 30 Jan 2016 01:18:52 +0000 Subject: [PATCH] Broke the consumer script into separate files and started on a mail consumer --- .gitignore | 5 +- src/documents/consumers/__init__.py | 3 + src/documents/consumers/base.py | 157 ++++++++++++ src/documents/consumers/file.py | 106 ++++++++ src/documents/consumers/mail.py | 69 +++++ .../management/commands/document_consumer.py | 242 ++---------------- .../management/commands/document_exporter.py | 2 +- src/paperless/settings.py | 20 +- 8 files changed, 376 insertions(+), 228 deletions(-) create mode 100644 src/documents/consumers/__init__.py create mode 100644 src/documents/consumers/base.py create mode 100644 src/documents/consumers/file.py create mode 100644 src/documents/consumers/mail.py diff --git a/.gitignore b/.gitignore index 5fbfdbac6..908fa9748 100644 --- a/.gitignore +++ b/.gitignore @@ -67,8 +67,9 @@ db.sqlite3 # Other stuff that doesn't belong virtualenv +.vagrant +# Used for development scripts/import-for-development +environment -# Vagrant -.vagrant diff --git a/src/documents/consumers/__init__.py b/src/documents/consumers/__init__.py new file mode 100644 index 000000000..d54da1d91 --- /dev/null +++ b/src/documents/consumers/__init__.py @@ -0,0 +1,3 @@ +from .base import Consumer +from .file import FileConsumer, FileConsumerError +from .mail import MailConsumer, MailConsumerError diff --git a/src/documents/consumers/base.py b/src/documents/consumers/base.py new file mode 100644 index 000000000..4a72f906a --- /dev/null +++ b/src/documents/consumers/base.py @@ -0,0 +1,157 @@ +import datetime +import glob +import langdetect +import os +import random +import re +import subprocess + +import pyocr + +from PIL import Image + +from django.conf import settings +from django.utils import timezone + +from paperless.db import GnuPG + +from ..models import Tag, Document +from ..languages import ISO639 + + +class 
class OCRError(Exception):
    """Raised when a document could not be OCRed in a usable language."""


class Consumer(object):
    """
    Shared pipeline for turning a source document into a stored `Document`.

    The steps are: render the document to greyscale PNGs, OCR them, store
    the text plus an encrypted copy of the original, then clean up.
    Subclasses decide where documents come from and must implement
    `_guess_file_attributes()`.
    """

    SCRATCH = settings.SCRATCH_DIR
    CONVERT = settings.CONVERT_BINARY

    # Evaluated at import time; raises IndexError if pyocr finds no OCR tool.
    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def __init__(self, verbosity=1):
        """
        Remember the verbosity used by `_render()` and make sure the scratch
        directory exists.
        """

        self.verbosity = verbosity

        try:
            os.makedirs(self.SCRATCH)
        except FileExistsError:
            pass

    def _get_greyscale(self, doc):
        """
        Run the configured convert binary on `doc` and return the sorted list
        of PNG paths it produced in the scratch directory.

        The list is empty if the conversion produced nothing; the exit status
        of the converter is not inspected.
        """

        self._render(" Generating greyscale image", 2)

        # A random prefix keeps the pages of different documents apart.
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        ))

        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))

    def _get_ocr(self, pngs):
        """
        OCR `pngs` in the default language, guess the real language from the
        result, and re-OCR in that language if it differs.

        Returns the OCRed text.  Raises OCRError when the language cannot be
        determined, or the second OCR pass fails, and
        settings.FORGIVING_OCR is not enabled.
        """

        self._render(" OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        # langdetect raises instead of returning when the text has nothing to
        # work with (e.g. a blank page).  Treat that like any other failed
        # detection rather than letting it take down the consumer loop.
        try:
            guessed_language = langdetect.detect(raw_text)
        except langdetect.lang_detect_exception.LangDetectException:
            guessed_language = None

        self._render(" Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError

    def _ocr(self, pngs, lang):
        """
        OCR every image in `pngs` using language `lang` and return the
        concatenated text with each run of whitespace collapsed to one space.
        """

        self._render(" Parsing for {}".format(lang), 2)

        pages = []
        for png in pngs:
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f, lang=lang))

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", "".join(pages))

    def _guess_file_attributes(self, doc):
        """
        Return a `(sender, title, file_type)` tuple for `doc`.

        Must be overridden by subclasses.
        """
        raise NotImplementedError(
            "At the very least a consumer should determine the file type.")

    def _store(self, text, doc):
        """
        Create the `Document` record for `doc` with `text` as its content,
        attach every tag that matches the lower-cased text, and write an
        encrypted copy of `doc` to the document's `source_path`.
        """

        sender, title, file_type = self._guess_file_attributes(doc)

        lower_text = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]

        # Both timestamps are taken from the source file's mtime.
        mtime = os.stat(doc).st_mtime

        self._render(" Saving record to database", 2)

        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=timezone.make_aware(
                datetime.datetime.fromtimestamp(mtime)),
            modified=timezone.make_aware(
                datetime.datetime.fromtimestamp(mtime))
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted:
            with open(document.source_path, "wb") as encrypted:
                self._render(" Encrypting", 3)
                encrypted.write(GnuPG.encrypted(unencrypted))

    def _cleanup(self, pngs, doc):
        """
        Delete the scratch images in `pngs` and the source document `doc`.

        `pngs` is the list returned by `_get_greyscale()`; an empty list is
        fine and only `doc` is removed.
        """

        for f in list(pngs) + [doc]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """Print `text` if this consumer's verbosity is at least `verbosity`."""
        if self.verbosity >= verbosity:
            print(text)
class FileConsumerError(Exception):
    """Raised when the consumption directory is unset or does not exist."""


class FileConsumer(Consumer):
    """
    Consumer that picks documents up from `settings.CONSUMPTION_DIR`.

    Only regular files whose names end in one of the extensions accepted by
    PARSER_REGEX_TITLE are considered.
    """

    CONSUME = settings.CONSUMPTION_DIR

    PARSER_REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
    PARSER_REGEX_SENDER_TITLE = re.compile(
        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """
        Set up the base consumer and validate the consumption directory.

        Raises FileConsumerError if CONSUMPTION_DIR is not set or does not
        exist.
        """

        Consumer.__init__(self, *args, **kwargs)

        # Path -> mtime seen on the previous pass; see _is_ready().
        self.stats = {}
        # Paths whose OCR failed; they are skipped on every later pass.
        self._ignore = []

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Make one pass over the consumption directory, consuming every
        supported file that has stopped changing since the previous pass.
        """

        for doc in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, doc)

            if not os.path.isfile(doc):
                continue

            if not self.PARSER_REGEX_TITLE.match(doc):
                continue

            if doc in self._ignore:
                continue

            # Leave files alone until their mtime has been stable across two
            # passes; they may still be in the middle of being uploaded.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        Returns True when the file's mtime equals the one recorded on the
        previous call for the same path (and forgets the record); otherwise
        records the current mtime and returns False.
        """

        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False

    def _guess_file_attributes(self, doc):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "<sender> - <title>.<suffix>"

        Returns `(sender, title, file_type)`; `sender` is a `Sender` instance
        (created if necessary) or None when the name has no sender part.
        """

        # First we attempt "<sender> - <title>.<suffix>"
        m = self.PARSER_REGEX_SENDER_TITLE.match(doc)
        if m:
            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title, file_type

        # That didn't work, so we assume sender is None
        m = self.PARSER_REGEX_TITLE.match(doc)
        return None, m.group(1), m.group(2)
class MailConsumerError(Exception):
    """Raised when the mail server rejects the login or the mailbox."""


class MailConsumer(Consumer):
    """
    Consumer that reads messages from an IMAP mailbox described by
    `settings.MAIL_CONSUMPTION`.

    The consumer is disabled (and `consume()` only updates `last_checked`)
    when no HOST is configured.  Handling of attachments is not implemented
    yet: fetched messages are simply printed.
    """

    def __init__(self, *args, **kwargs):

        Consumer.__init__(self, *args, **kwargs)

        config = settings.MAIL_CONSUMPTION

        self._connection = None
        self._host = config["HOST"]
        self._port = config["PORT"]
        self._username = config["USERNAME"]
        self._password = config["PASSWORD"]
        self._inbox = config["INBOX"]
        # Older settings files may not define USE_SSL; default to SSL, which
        # is what this consumer has always used.
        self._use_ssl = config.get("USE_SSL", True)

        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def _connect(self):
        """Open the IMAP connection, honouring the USE_SSL and PORT settings."""

        if self._use_ssl:
            client, default_port = imaplib.IMAP4_SSL, imaplib.IMAP4_SSL_PORT
        else:
            client, default_port = imaplib.IMAP4, imaplib.IMAP4_PORT

        # PORT comes from the environment, so it is a string or None.
        port = int(self._port) if self._port else default_port

        self._connection = client(self._host, port)

    def _login(self):
        """
        Authenticate and select the configured mailbox.

        Raises MailConsumerError if either step does not answer "OK".
        """

        login = self._connection.login(self._username, self._password)
        if not login[0] == "OK":
            raise MailConsumerError("Can't log into mail: {}".format(login[1]))

        inbox = self._connection.select(self._inbox)
        if not inbox[0] == "OK":
            raise MailConsumerError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """Yield the raw RFC822 payload of every message in the mailbox."""
        for num in self._connection.search(None, "ALL")[1][0].split():
            __, data = self._connection.fetch(num, "(RFC822)")
            # self._connection.store(num, "+FLAGS", "\\Deleted")
            yield data[0][1]

    def consume(self):
        """Fetch messages if mail consumption is enabled, then record when."""

        if self._enabled:
            self.get_messages()

        self.last_checked = datetime.datetime.now()

    def get_messages(self):
        """Connect, log in, process every message, and always log out."""

        self._connect()

        try:
            self._login()

            for message in self._fetch():
                print(message)  # Now we have to do something with the attachment

            self._connection.expunge()
            self._connection.close()
        finally:
            # Don't leave the session open on the server if anything above
            # failed part-way through.
            self._connection.logout()

    def _guess_file_attributes(self, doc):
        # Placeholder until attachments are actually parsed.
        return None, None, "jpg"
def loop(self):
    """
    Run one pass of the consumer loop.

    Files are consumed on every pass.  Mail is only checked once at least
    MAIL_DELTA has elapsed since the mail consumer's `last_checked`.
    """

    self.file_consumer.consume()

    # The mail consumer refreshes `last_checked` itself, so this fires at
    # most once per MAIL_DELTA rather than on every pass.
    now = datetime.datetime.now()
    if self.mail_consumer.last_checked + self.MAIL_DELTA <= now:
        self.mail_consumer.consume()
+MAIL_CONSUMPTION = { + "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"), + "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"), + "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"), + "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"), + "USE_SSL": True, # If True, use SSL/TLS to connect + "INBOX": "INBOX" # The name of the inbox on the server +} + +# This is used to encrypt the original documents and decrypt them later when you +# want to download them. Set it and change the permissions on this file to +# 0600, or set it to `None` and you'll be prompted for the passphrase at +# runtime. The default looks for an environment variable. +# DON'T FORGET TO SET THIS as leaving it blank may cause some strange things with +# GPG, including an interesting case where it may "encrypt" zero-byte files. PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")