This repository has been archived by the owner on Feb 16, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 353
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Broke the consumer script into separate files and started on a mail consumer
- Loading branch information
1 parent
84d5f8c
commit a70b40f
Showing
8 changed files
with
376 additions
and
228 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Public API of the consumers package.
#
# ``.base`` must be imported first: ``.file`` pulls ``Consumer`` and
# ``OCRError`` from this (still partially initialised) package via
# ``from . import Consumer, OCRError``, so both names have to be bound here
# before ``.file`` is loaded.
from .base import Consumer, OCRError
from .file import FileConsumer, FileConsumerError
from .mail import MailConsumer, MailConsumerError
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import datetime | ||
import glob | ||
import langdetect | ||
import os | ||
import random | ||
import re | ||
import subprocess | ||
|
||
import pyocr | ||
|
||
from PIL import Image | ||
|
||
from django.conf import settings | ||
from django.utils import timezone | ||
|
||
from paperless.db import GnuPG | ||
|
||
from ..models import Tag, Document | ||
from ..languages import ISO639 | ||
|
||
|
||
class OCRError(Exception):
    """Raised when usable text cannot be extracted from a document by OCR."""
|
||
|
||
class Consumer(object):
    """
    Base class holding the steps shared by all document consumers.

    The steps are: render a document to greyscale PNGs in the scratch
    directory, OCR those images (re-running OCR when a different language is
    detected), store a ``Document`` record plus an encrypted copy of the
    source file, and finally delete the temporary images and the source.

    Subclasses must implement ``_guess_file_attributes()``.
    """

    SCRATCH = settings.SCRATCH_DIR
    CONVERT = settings.CONVERT_BINARY

    # First OCR tool pyocr reports as available; raises IndexError at import
    # time when none is installed.
    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    # Matches the "-<page>.png" suffix of per-page scratch images.
    _PAGE_SUFFIX = re.compile(r"-(\d+)\.png$")

    def __init__(self, verbosity=1):
        """
        :param verbosity: messages passed to ``_render()`` with a level above
            this value are suppressed.
        """

        self.verbosity = verbosity

        # exist_ok avoids the try/except dance; unlike the old code this still
        # raises if SCRATCH exists but is not a directory.
        os.makedirs(self.SCRATCH, exist_ok=True)

    @classmethod
    def _page_number(cls, path):
        """
        Return the page index encoded in ``<id>-<page>.png``, or 0 when the
        path carries no page suffix (e.g. a plain ``<id>.png``).
        """
        m = cls._PAGE_SUFFIX.search(path)
        return int(m.group(1)) if m else 0

    def _get_greyscale(self, doc):
        """
        Convert ``doc`` into greyscale PNG file(s) inside ``SCRATCH``.

        :param doc: path of the document to convert.
        :return: list of paths of the generated images, ordered by page
            number.
        """

        self._render(" Generating greyscale image", 2)

        # A random numeric prefix keeps the images of different documents
        # apart inside the shared scratch directory.
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        exit_code = subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        ))
        if exit_code != 0:
            # Previously the exit status was silently dropped; keep going (to
            # stay backward compatible) but tell the user something is off.
            self._render(
                "{} exited with status {} while converting {}".format(
                    self.CONVERT, exit_code, doc),
                0
            )

        # Everything starting with our prefix belongs to this document.
        # Sort numerically on the page suffix: a plain lexicographic sort
        # would put "-10.png" before "-2.png" and scramble the page order.
        pages = glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))
        return sorted(pages, key=lambda p: (self._page_number(p), p))

    def _get_ocr(self, pngs):
        """
        OCR ``pngs`` and return the extracted text.

        A first pass is run with ``DEFAULT_OCR_LANGUAGE``.  The language of
        that text is then guessed; if it maps (via ``ISO639``) to a different
        OCR language, a second pass is run with that language.

        :param pngs: image paths as returned by ``_get_greyscale()``.
        :return: the OCRed text with whitespace collapsed.
        :raises OCRError: when the guessed language is unknown or the second
            pass fails, unless ``settings.FORGIVING_OCR`` is enabled, in which
            case the first-pass text is returned instead.
        """

        self._render(" OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        # NOTE(review): langdetect.detect() may raise on text without any
        # detectable features (e.g. empty OCR output) -- confirm and decide
        # whether that should be mapped to OCRError as well.
        guessed_language = langdetect.detect(raw_text)

        self._render(" Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError("Language detection failed")

        ocr_language = ISO639[guessed_language]

        # The first pass already used the right language: nothing more to do.
        if ocr_language == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ocr_language)
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError("OCR for {} failed".format(guessed_language))

    def _ocr(self, pngs, lang):
        """
        Run the OCR tool over every image in ``pngs`` using language ``lang``.

        :return: the concatenated text of all images, with every run of
            whitespace replaced by a single space.
        """

        self._render(" Parsing for {}".format(lang), 2)

        chunks = []
        for png in pngs:
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                chunks.append(self.OCR.image_to_string(f, lang=lang))

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", "".join(chunks))

    def _guess_file_attributes(self, doc):
        """
        Return a ``(sender, title, file_type)`` tuple for ``doc``.

        Must be overridden by subclasses.
        """
        raise NotImplementedError(
            "At the very least a consumer should determine the file type.")

    def _store(self, text, doc):
        """
        Create the ``Document`` record for ``doc`` and write an encrypted
        copy of the file to the document's ``source_path``.

        :param text: OCRed content of the document.
        :param doc: path of the original, unencrypted file.
        """

        sender, title, file_type = self._guess_file_attributes(doc)

        lower_text = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]

        # Both timestamps are taken from the file's modification time.
        modified_at = timezone.make_aware(
            datetime.datetime.fromtimestamp(os.stat(doc).st_mtime))

        self._render(" Saving record to database", 2)

        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=modified_at,
            modified=modified_at
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted, \
                open(document.source_path, "wb") as encrypted:
            self._render(" Encrypting", 3)
            encrypted.write(GnuPG.encrypted(unencrypted))

    def _cleanup(self, pngs, doc):
        """
        Delete the scratch images in ``pngs`` and the consumed file ``doc``.

        ``pngs`` is deleted as given rather than re-globbed from ``pngs[0]``,
        so an empty list no longer raises IndexError and ``doc`` is still
        removed.
        """

        for f in list(pngs) + [doc]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """Print ``text`` if the consumer's verbosity is at least ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import os | ||
import re | ||
|
||
from django.conf import settings | ||
from django.template.defaultfilters import slugify | ||
|
||
from ..models import Sender | ||
from . import Consumer, OCRError | ||
|
||
|
||
class FileConsumerError(Exception):
    """Raised when the file consumer cannot operate, e.g. the consumption
    directory is not configured or does not exist."""
|
||
|
||
class FileConsumer(Consumer):
    """
    Consumer that picks up documents dropped into ``settings.CONSUMPTION_DIR``.

    Each call to ``consume()`` scans the directory once.  A file is only
    processed after two consecutive scans have seen the same modification
    time for it, so ``consume()`` is presumably meant to be called
    repeatedly by a polling loop -- verify against the caller.
    """

    CONSUME = settings.CONSUMPTION_DIR

    # "<anything>/<title>.<suffix>"
    PARSER_REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
    # "<anything>/<sender> - <title>.<suffix>"; anchored with "$" like the
    # title-only pattern so that both agree on what the suffix is.
    PARSER_REGEX_SENDER_TITLE = re.compile(
        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """
        :raises FileConsumerError: if ``CONSUMPTION_DIR`` is unset or does
            not exist.
        """

        Consumer.__init__(self, *args, **kwargs)

        # path -> modification time seen on the previous scan
        self.stats = {}
        # paths whose OCR failed; they are skipped on later scans
        self._ignore = []

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Scan the consumption directory once and process every regular file
        that has a supported suffix, has not failed OCR before, and is ready
        (see ``_is_ready()``).
        """

        for doc in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, doc)

            if not os.path.isfile(doc):
                continue

            if not self.PARSER_REGEX_TITLE.match(doc):
                continue

            if doc in self._ignore:
                continue

            # Skip files that are NOT ready yet.  The check used to be
            # inverted, which consumed (and deleted) files the first time
            # they were seen -- possibly mid-upload -- and ignored stable
            # ones.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                # Remember the failure so the same file isn't retried on
                # every scan.
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        :return: True if the file's modification time equals the one recorded
            by the previous call for the same path (the record is then
            dropped); otherwise the current time is recorded and False is
            returned.
        """

        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False

    def _guess_file_attributes(self, doc):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "<sender> - <title>.<suffix>"

        :return: ``(sender, title, file_type)``; ``sender`` is a ``Sender``
            instance, or None when the name carries no "<sender> - " part.
        :raises FileConsumerError: if ``doc`` matches neither naming pattern.
        """

        # First we attempt "<sender> - <title>.<suffix>"
        m = self.PARSER_REGEX_SENDER_TITLE.match(doc)
        if m:
            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title, file_type

        # That didn't work, so we assume sender is None
        m = self.PARSER_REGEX_TITLE.match(doc)
        if m is None:
            # consume() filters on this pattern already; this guards direct
            # callers against an opaque AttributeError on None.
            raise FileConsumerError(
                "Can't determine title and file type of {}".format(doc))
        return None, m.group(1), m.group(2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import datetime | ||
import imaplib | ||
|
||
from django.conf import settings | ||
|
||
from . import Consumer | ||
|
||
|
||
class MailConsumerError(Exception):
    """Raised when the mail consumer cannot log in or open its mailbox."""
|
||
|
||
class MailConsumer(Consumer):
    """
    Work-in-progress consumer that reads messages from an IMAP mailbox
    configured through ``settings.MAIL_CONSUMPTION``.

    Fetched messages are currently only printed; attachment handling is not
    implemented yet.
    """

    def __init__(self, *args, **kwargs):

        Consumer.__init__(self, *args, **kwargs)

        self._connection = None
        self._host = settings.MAIL_CONSUMPTION["HOST"]
        self._port = settings.MAIL_CONSUMPTION["PORT"]
        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]

        # Mail consumption is switched off when no host is configured.
        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def _connect(self):
        """Open an SSL connection to the configured IMAP server."""
        self._connection = imaplib.IMAP4_SSL(self._host, self._port)

    def _login(self):
        """
        Authenticate and select the configured mailbox.

        :raises MailConsumerError: if the server does not answer "OK" to
            either step.
        """

        login = self._connection.login(self._username, self._password)
        if login[0] != "OK":
            raise MailConsumerError("Can't log into mail: {}".format(login[1]))

        # Use the configured mailbox (it used to be read from settings but
        # never used); fall back to the previous hard-coded name when the
        # setting is empty.
        inbox = self._connection.select(self._inbox or "INBOX")
        if inbox[0] != "OK":
            raise MailConsumerError(
                "Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """
        Yield the raw RFC822 payload of every message in the selected
        mailbox.

        :raises MailConsumerError: if the server does not answer "OK" to the
            search or to a fetch.
        """

        status, found = self._connection.search(None, "ALL")
        if status != "OK":
            raise MailConsumerError(
                "Can't search the inbox: {}".format(found))

        for num in found[0].split():
            status, data = self._connection.fetch(num, "(RFC822)")
            if status != "OK":
                raise MailConsumerError(
                    "Can't fetch message {}: {}".format(num, data))
            # TODO: flag consumed messages as deleted once attachments are
            # actually handled:
            #   self._connection.store(num, "+FLAGS", "\\Deleted")
            yield data[0][1]

    def consume(self):
        """Fetch messages if mail consumption is enabled and record when."""

        if self._enabled:
            self.get_messages()

        self.last_checked = datetime.datetime.now()

    def get_messages(self):
        """
        Connect, log in, and process every message in the mailbox.

        The connection is always logged out, even when login or fetching
        fails, so a failing poll does not leak a connection.
        """

        self._connect()

        try:
            self._login()

            for message in self._fetch():
                print(message)  # Now we have to do something with the attachment

            # Only reached with a mailbox selected, which close() requires.
            self._connection.expunge()
            self._connection.close()
        finally:
            self._connection.logout()

    def _guess_file_attributes(self, doc):
        """Placeholder: no sender, no title, and a fixed "jpg" file type."""
        return None, None, "jpg"
Oops, something went wrong.