diff --git a/src/documents/admin.py b/src/documents/admin.py
index 2ac8f2f3f..d706e54a6 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -34,7 +34,7 @@ def queryset(self, request, queryset):
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("sender__name", "title", "content",)
- list_display = ("edit", "created", "sender", "title", "tags_", "pdf")
+ list_display = ("edit", "created", "sender", "title", "tags_", "document")
list_filter = (MonthListFilter, "tags", "sender")
list_editable = ("sender", "title",)
list_per_page = 25
@@ -44,14 +44,14 @@ def edit(self, obj):
static("documents/img/edit.png"))
edit.allow_tags = True
- def pdf(self, obj):
+ def document(self, obj):
return '' \
'' \
''.format(
reverse("fetch", kwargs={"pk": obj.pk}),
static("documents/img/application-pdf.png")
)
- pdf.allow_tags = True
+ document.allow_tags = True
def tags_(self, obj):
r = ""
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index 8595619f6..e2669fa4b 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -31,9 +31,9 @@ class Command(BaseCommand):
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png
2. Use tesseract on the png
- 3. Encrypt and store the PDF in the MEDIA_ROOT
+ 3. Encrypt and store the document in the MEDIA_ROOT
4. Store the OCR'd text in the database
- 5. Delete the pdf and image(s)
+ 5. Delete the document and image(s)
"""
LOOP_TIME = 10 # Seconds
@@ -44,10 +44,12 @@ class Command(BaseCommand):
OCR = pyocr.get_available_tools()[0]
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
- MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
+ MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
- PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
- PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
+ PARSER_REGEX_TITLE = re.compile(
+ r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
+ PARSER_REGEX_SENDER_TITLE = re.compile(
+ r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE)
def __init__(self, *args, **kwargs):
@@ -74,35 +76,35 @@ def handle(self, *args, **options):
def loop(self):
- for pdf in os.listdir(self.CONSUME):
+ for doc in os.listdir(self.CONSUME):
- pdf = os.path.join(self.CONSUME, pdf)
+ doc = os.path.join(self.CONSUME, doc)
- if not os.path.isfile(pdf):
+ if not os.path.isfile(doc):
continue
- if not re.match(self.PARSER_REGEX_TITLE, pdf):
+ if not re.match(self.PARSER_REGEX_TITLE, doc):
continue
- if pdf in self._ignore:
+ if doc in self._ignore:
continue
- if self._is_ready(pdf):
+ if self._is_ready(doc):
continue
- self._render("Consuming {}".format(pdf), 1)
+ self._render("Consuming {}".format(doc), 1)
- pngs = self._get_greyscale(pdf)
+ pngs = self._get_greyscale(doc)
try:
text = self._get_ocr(pngs)
except OCRError:
- self._ignore.append(pdf)
- self._render("OCR FAILURE: {}".format(pdf), 0)
+ self._ignore.append(doc)
+ self._render("OCR FAILURE: {}".format(doc), 0)
continue
- self._store(text, pdf)
- self._cleanup(pngs, pdf)
+ self._store(text, doc)
+ self._cleanup(pngs, doc)
def _setup(self):
@@ -116,29 +118,29 @@ def _setup(self):
raise CommandError("Consumption directory {} does not exist".format(
self.CONSUME))
- for d in (self.SCRATCH, self.MEDIA_PDF):
+ for d in (self.SCRATCH, self.MEDIA_DOCS):
try:
os.makedirs(d)
except FileExistsError:
pass
- def _is_ready(self, pdf):
+ def _is_ready(self, doc):
"""
- Detect whether `pdf` is ready to consume or if it's still being written
+ Detect whether `doc` is ready to consume or if it's still being written
to by the scanner.
"""
- t = os.stat(pdf).st_mtime
+ t = os.stat(doc).st_mtime
- if self.stats.get(pdf) == t:
- del(self.stats[pdf])
+ if self.stats.get(doc) == t:
+ del(self.stats[doc])
return True
- self.stats[pdf] = t
+ self.stats[doc] = t
return False
- def _get_greyscale(self, pdf):
+ def _get_greyscale(self, doc):
self._render(" Generating greyscale image", 2)
@@ -147,14 +149,14 @@ def _get_greyscale(self, pdf):
subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8",
- "-type", "grayscale", pdf, png
+ "-type", "grayscale", doc, png
)).wait()
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
def _get_ocr(self, pngs):
- self._render(" OCRing the PDF", 2)
+ self._render(" OCRing the document", 2)
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
@@ -203,19 +205,22 @@ def _ocr(self, pngs, lang):
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
- def _store(self, text, pdf):
+ def _store(self, text, doc):
- sender, title = self._parse_file_name(pdf)
- relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]
+ sender, title, file_type = self._parse_file_name(doc)
- stats = os.stat(pdf)
+ lower_text = text.lower()
+ relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
+
+ stats = os.stat(doc)
self._render(" Saving record to database", 2)
- doc = Document.objects.create(
+ document = Document.objects.create(
sender=sender,
title=title,
content=text,
+ file_type=file_type,
created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware(
@@ -225,38 +230,38 @@ def _store(self, text, pdf):
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self._render(" Tagging with {}".format(tag_names), 2)
- doc.tags.add(*relevant_tags)
+ document.tags.add(*relevant_tags)
- with open(pdf, "rb") as unencrypted:
- with open(doc.pdf_path, "wb") as encrypted:
+ with open(doc, "rb") as unencrypted:
+ with open(document.source_path, "wb") as encrypted:
self._render(" Encrypting", 3)
encrypted.write(GnuPG.encrypted(unencrypted))
- def _parse_file_name(self, pdf):
+ def _parse_file_name(self, doc):
"""
We use a crude naming convention to make handling the sender and title
easier:
- "sender - title.pdf"
+ " - ."
"""
- # First we attempt "sender - title.pdf"
- m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
+ # First we attempt " - ."
+ m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
if m:
- sender_name, title = m.group(1), m.group(2)
+ sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
sender, __ = Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)})
- return sender, title
+ return sender, title, file_type
# That didn't work, so we assume sender is None
- m = re.match(self.PARSER_REGEX_TITLE, pdf)
- return None, m.group(1)
+ m = re.match(self.PARSER_REGEX_TITLE, doc)
+ return None, m.group(1), m.group(2)
- def _cleanup(self, pngs, pdf):
+ def _cleanup(self, pngs, doc):
png_glob = os.path.join(
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
- for f in list(glob.glob(png_glob)) + [pdf]:
+ for f in list(glob.glob(png_glob)) + [doc]:
self._render(" Deleting {}".format(f), 2)
os.unlink(f)
diff --git a/src/documents/migrations/0008_document_file_type.py b/src/documents/migrations/0008_document_file_type.py
new file mode 100644
index 000000000..6d4bd2775
--- /dev/null
+++ b/src/documents/migrations/0008_document_file_type.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9 on 2016-01-29 22:58
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('documents', '0007_auto_20160126_2114'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='document',
+ name='file_type',
+ field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF')], default='pdf', editable=False, max_length=4),
+ preserve_default=False,
+ ),
+ ]
diff --git a/src/documents/models.py b/src/documents/models.py
index b299ad146..960644854 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -111,10 +111,22 @@ def save(self, *args, **kwargs):
class Document(models.Model):
+ TYPE_PDF = "pdf"
+ TYPE_PNG = "png"
+ TYPE_JPG = "jpg"
+ TYPE_GIF = "gif"
+ TYPE_TIF = "tiff"
+ TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
+
sender = models.ForeignKey(
Sender, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)
+ file_type = models.CharField(
+ max_length=4,
+ editable=False,
+ choices=tuple([(t, t.upper()) for t in TYPES])
+ )
tags = models.ManyToManyField(Tag, related_name="documents")
created = models.DateTimeField(default=timezone.now, editable=False)
modified = models.DateTimeField(auto_now=True, editable=False)
@@ -131,20 +143,19 @@ def __str__(self):
return str(created)
@property
- def pdf_path(self):
+ def source_path(self):
return os.path.join(
settings.MEDIA_ROOT,
"documents",
- "pdf",
- "{:07}.pdf.gpg".format(self.pk)
+ "{:07}.{}.gpg".format(self.pk, self.file_type)
)
@property
- def pdf(self):
- return open(self.pdf_path, "rb")
+ def source_file(self):
+ return open(self.source_path, "rb")
@property
def parseable_file_name(self):
if self.sender and self.title:
- return "{} - {}.pdf".format(self.sender, self.title)
- return os.path.basename(self.pdf_path)
+ return "{} - {}.{}".format(self.sender, self.title, self.file_types)
+ return os.path.basename(self.source_path)
diff --git a/src/documents/views.py b/src/documents/views.py
index 74590769c..517b4192e 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -16,9 +16,19 @@ def render_to_response(self, context, **response_kwargs):
Override the default to return the unencrypted PDF as raw data.
"""
+ content_types = {
+ Document.TYPE_PDF: "application/pdf",
+ Document.TYPE_PNG: "image/png",
+ Document.TYPE_JPG: "image/jpeg",
+ Document.TYPE_GIF: "image/gif",
+ Document.TYPE_TIF: "image/tiff",
+ }
+
response = HttpResponse(
- GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
+ GnuPG.decrypted(self.object.source_file),
+ content_type=content_types[self.object.file_type]
+ )
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
- slugify(str(self.object)) + ".pdf")
+ slugify(str(self.object)) + "." + self.object.file_type)
return response