Skip to content
This repository has been archived by the owner on Feb 16, 2023. It is now read-only.

Commit

Permalink
Broke the consumer script into separate files and started on a mail consumer
Browse files Browse the repository at this point in the history
  • Loading branch information
danielquinn committed Jan 30, 2016
1 parent 84d5f8c commit a70b40f
Show file tree
Hide file tree
Showing 8 changed files with 376 additions and 228 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ db.sqlite3

# Other stuff that doesn't belong
virtualenv
.vagrant

# Used for development
scripts/import-for-development
environment

# Vagrant
.vagrant
3 changes: 3 additions & 0 deletions src/documents/consumers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public names of the consumers package.
#
# OCRError must be re-exported here: file.py pulls it in with
# ``from . import Consumer, OCRError``, which only works if the package
# namespace already contains that name by the time .file is imported.
from .base import Consumer, OCRError
from .file import FileConsumer, FileConsumerError
from .mail import MailConsumer, MailConsumerError
157 changes: 157 additions & 0 deletions src/documents/consumers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import datetime
import glob
import langdetect
import os
import random
import re
import subprocess

import pyocr

from PIL import Image

from django.conf import settings
from django.utils import timezone

from paperless.db import GnuPG

from ..models import Tag, Document
from ..languages import ISO639


class OCRError(Exception):
    """
    Raised when OCR or language detection fails and the failure is not being
    forgiven (see the FORGIVING_OCR setting).
    """


class Consumer(object):
    """
    Base class for document consumers.

    Implements the shared pipeline steps: rendering a document into greyscale
    PNGs with the binary configured as CONVERT_BINARY, OCRing those images,
    storing the result as a tagged, encrypted ``Document``, and deleting the
    temporary and source files afterwards.

    Subclasses must implement ``_guess_file_attributes()``.
    """

    SCRATCH = settings.SCRATCH_DIR
    CONVERT = settings.CONVERT_BINARY

    # NOTE: evaluated at import time; indexing [0] fails with IndexError if
    # pyocr reports no available OCR tool.
    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def __init__(self, verbosity=1):
        """
        :param verbosity: Messages passed to ``_render()`` are printed only
            if their level is less than or equal to this value.
        """

        self.verbosity = verbosity

        # An already-existing scratch directory is fine.
        os.makedirs(self.SCRATCH, exist_ok=True)

    def _get_greyscale(self, doc):
        """
        Convert ``doc`` into 300dpi, 8-bit greyscale PNG(s) in the scratch
        directory and return the sorted list of generated file paths.

        The convert binary's exit status is not checked, so the returned list
        is empty if nothing was written.
        """

        self._render(" Generating greyscale image", 2)

        # A random numeric prefix identifies every page image produced for
        # this document.
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        ))

        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))

    def _get_ocr(self, pngs):
        """
        OCR ``pngs`` and return the extracted text.

        A first pass uses DEFAULT_OCR_LANGUAGE.  The language of that text is
        then guessed, and if it maps (via ISO639) to a different language the
        images are OCRed again in that language.

        :raises OCRError: if the language cannot be detected, or the second
            OCR pass fails, while settings.FORGIVING_OCR is falsy.
        """

        self._render(" OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        # langdetect raises rather than returning a value when the text has
        # nothing to work with (e.g. an empty OCR result), so fold that case
        # into the ordinary "detection failed" path below.
        try:
            guessed_language = langdetect.detect(raw_text)
        except langdetect.lang_detect_exception.LangDetectException:
            guessed_language = None
        else:
            self._render(
                " Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError

    def _ocr(self, pngs, lang):
        """
        Run the OCR tool over every image in ``pngs`` using language ``lang``
        and return the concatenated text with whitespace runs collapsed.
        """

        self._render(" Parsing for {}".format(lang), 2)

        pages = []
        for png in pngs:
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f, lang=lang))

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", "".join(pages))

    def _guess_file_attributes(self, doc):
        """
        Return a ``(sender, title, file_type)`` tuple for ``doc``.  Must be
        implemented by subclasses.
        """
        raise NotImplementedError(
            "At the very least a consumer should determine the file type.")

    def _store(self, text, doc):
        """
        Create a ``Document`` record for ``doc`` with ``text`` as its content,
        attach every tag that matches the lower-cased text, and write an
        encrypted copy of the file to the document's ``source_path``.
        """

        sender, title, file_type = self._guess_file_attributes(doc)

        lower_text = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]

        # The file's modification time is used for both timestamps.
        file_time = timezone.make_aware(
            datetime.datetime.fromtimestamp(os.stat(doc).st_mtime))

        self._render(" Saving record to database", 2)

        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=file_time,
            modified=file_time
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted, \
                open(document.source_path, "wb") as encrypted:
            self._render(" Encrypting", 3)
            encrypted.write(GnuPG.encrypted(unencrypted))

    def _cleanup(self, pngs, doc):
        """
        Delete the temporary images in ``pngs`` and the source file ``doc``.

        ``pngs`` is expected to be the list returned by ``_get_greyscale()``.
        It may be empty, in which case only ``doc`` is removed.
        """

        for f in list(pngs) + [doc]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """
        Print ``text`` if this consumer's verbosity is at least ``verbosity``.
        """
        if self.verbosity >= verbosity:
            print(text)
106 changes: 106 additions & 0 deletions src/documents/consumers/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import os
import re

from django.conf import settings
from django.template.defaultfilters import slugify

from ..models import Sender
from . import Consumer, OCRError


class FileConsumerError(Exception):
    """
    Raised when the consumption directory is not configured or does not
    exist.
    """


class FileConsumer(Consumer):
    """
    Consumer that picks up documents dropped into settings.CONSUMPTION_DIR.

    Only regular files whose names end in a supported suffix are considered.
    A file is consumed once its modification time has been seen unchanged on
    two consecutive ``consume()`` calls.
    """

    CONSUME = settings.CONSUMPTION_DIR

    # "<title>.<suffix>"
    PARSER_REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
    # "<sender> - <title>.<suffix>"
    PARSER_REGEX_SENDER_TITLE = re.compile(
        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """
        :raises FileConsumerError: if CONSUMPTION_DIR is unset or does not
            exist.
        """

        super().__init__(*args, **kwargs)

        # Last seen modification time per path, used by _is_ready()
        self.stats = {}
        # Paths that failed OCR and must not be retried by this instance
        self._ignore = []

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Make one pass over the consumption directory, processing every
        supported file that is ready and has not previously failed OCR.
        """

        for doc in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, doc)

            if not os.path.isfile(doc):
                continue

            if not self.PARSER_REGEX_TITLE.match(doc):
                continue

            if doc in self._ignore:
                continue

            # Leave files alone until they have stopped changing, as they
            # may still be in the middle of being uploaded.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                # The document stays where it is, but its scratch images
                # will never be used again, so don't leave them behind.
                for png in pngs:
                    os.unlink(png)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        Returns True when the file's modification time equals the one
        recorded on the previous call; otherwise records the current time and
        returns False.
        """

        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False

    def _guess_file_attributes(self, doc):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "<sender> - <title>.<suffix>"

        Returns ``(sender, title, file_type)``.  ``sender`` is a ``Sender``
        instance (created if necessary), or None when the file name carries
        no sender part.
        """

        # First we attempt "<sender> - <title>.<suffix>"
        m = self.PARSER_REGEX_SENDER_TITLE.match(doc)
        if m:
            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title, file_type

        # That didn't work, so we assume sender is None
        m = self.PARSER_REGEX_TITLE.match(doc)
        return None, m.group(1), m.group(2)
69 changes: 69 additions & 0 deletions src/documents/consumers/mail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import datetime
import imaplib

from django.conf import settings

from . import Consumer


class MailConsumerError(Exception):
    """Raised when the mail server rejects a request made by the consumer."""


class MailConsumer(Consumer):
    """
    Consumer that reads messages from an IMAP mailbox over SSL.

    Connection details come from the settings.MAIL_CONSUMPTION dict (keys
    HOST, PORT, USERNAME, PASSWORD and INBOX).  The consumer is disabled when
    HOST is empty.  Handling of the fetched messages is not implemented yet.
    """

    def __init__(self, *args, **kwargs):

        super().__init__(*args, **kwargs)

        self._connection = None
        self._host = settings.MAIL_CONSUMPTION["HOST"]
        self._port = settings.MAIL_CONSUMPTION["PORT"]
        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]

        # Without a host there is nothing to connect to
        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def _connect(self):
        """Open the SSL connection to the configured host and port."""
        self._connection = imaplib.IMAP4_SSL(self._host, self._port)

    def _login(self):
        """
        Authenticate and select the configured mailbox.

        :raises MailConsumerError: if either step is not answered with "OK".
        """

        login = self._connection.login(self._username, self._password)
        if not login[0] == "OK":
            raise MailConsumerError("Can't log into mail: {}".format(login[1]))

        # Select the mailbox named in the settings rather than assuming it
        # is always called "INBOX".
        inbox = self._connection.select(self._inbox)
        if not inbox[0] == "OK":
            raise MailConsumerError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """
        Yield the second element of the first response part for every message
        in the selected mailbox (presumably the raw RFC822 message).

        :raises MailConsumerError: if the search, or the fetch of any
            message, is not answered with "OK".
        """

        typ, found = self._connection.search(None, "ALL")
        if not typ == "OK":
            raise MailConsumerError(
                "Can't search the inbox: {}".format(found))

        for num in found[0].split():
            typ, data = self._connection.fetch(num, "(RFC822)")
            if not typ == "OK":
                raise MailConsumerError(
                    "Can't fetch message {}: {}".format(num, data))
            # self._connection.store(num, "+FLAGS", "\\Deleted")
            yield data[0][1]

    def consume(self):
        """
        Check the mailbox (if enabled) and record the time of the check.
        """

        if self._enabled:
            self.get_messages()

        self.last_checked = datetime.datetime.now()

    def get_messages(self):
        """
        Connect, log in, and process every message in the mailbox.

        The session is always logged out, even when logging in or fetching
        fails.
        """

        self._connect()

        try:
            self._login()

            for message in self._fetch():
                print(message)  # Now we have to do something with the attachment

            self._connection.expunge()
            self._connection.close()
        finally:
            self._connection.logout()

    def _guess_file_attributes(self, doc):
        """
        Placeholder: no sender, no title, and a fixed "jpg" file type.
        """
        return None, None, "jpg"
Loading

0 comments on commit a70b40f

Please sign in to comment.