#12: Support image documents

jonaswinkler · Jan 29, 2016 · ace9389 · ace9389
1 parent d6f4ef2
commit ace9389
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 57 deletions.
diff --git a/src/documents/admin.py b/src/documents/admin.py
@@ -34,7 +34,7 @@ def queryset(self, request, queryset):
 class DocumentAdmin(admin.ModelAdmin):
 
     search_fields = ("sender__name", "title", "content",)
-    list_display = ("edit", "created", "sender", "title", "tags_", "pdf")
+    list_display = ("edit", "created", "sender", "title", "tags_", "document")
     list_filter = (MonthListFilter, "tags", "sender")
     list_editable = ("sender", "title",)
     list_per_page = 25
@@ -44,14 +44,14 @@ def edit(self, obj):
             static("documents/img/edit.png"))
     edit.allow_tags = True
 
-    def pdf(self, obj):
+    def document(self, obj):
         return '<a href="{}">' \
                  '<img src="{}" width="22" height="22" alt="PDF icon">' \
                '</a>'.format(
                     reverse("fetch", kwargs={"pk": obj.pk}),
                     static("documents/img/application-pdf.png")
                 )
-    pdf.allow_tags = True
+    document.allow_tags = True
 
     def tags_(self, obj):
         r = ""

diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
@@ -31,9 +31,9 @@ class Command(BaseCommand):
     Loop over every file found in CONSUMPTION_DIR and:
       1. Convert it to a greyscale png
       2. Use tesseract on the png
-      3. Encrypt and store the PDF in the MEDIA_ROOT
+      3. Encrypt and store the document in the MEDIA_ROOT
       4. Store the OCR'd text in the database
-      5. Delete the pdf and image(s)
+      5. Delete the document and image(s)
     """
 
     LOOP_TIME = 10  # Seconds
@@ -44,10 +44,12 @@ class Command(BaseCommand):
 
     OCR = pyocr.get_available_tools()[0]
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
-    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
+    MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
 
-    PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
-    PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
+    PARSER_REGEX_TITLE = re.compile(
+        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
+    PARSER_REGEX_SENDER_TITLE = re.compile(
+        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
 
     def __init__(self, *args, **kwargs):
 
@@ -74,35 +76,35 @@ def handle(self, *args, **options):
 
     def loop(self):
 
-        for pdf in os.listdir(self.CONSUME):
+        for doc in os.listdir(self.CONSUME):
 
-            pdf = os.path.join(self.CONSUME, pdf)
+            doc = os.path.join(self.CONSUME, doc)
 
-            if not os.path.isfile(pdf):
+            if not os.path.isfile(doc):
                 continue
 
-            if not re.match(self.PARSER_REGEX_TITLE, pdf):
+            if not re.match(self.PARSER_REGEX_TITLE, doc):
                 continue
 
-            if pdf in self._ignore:
+            if doc in self._ignore:
                 continue
 
-            if self._is_ready(pdf):
+            if not self._is_ready(doc):
                 continue
 
-            self._render("Consuming {}".format(pdf), 1)
+            self._render("Consuming {}".format(doc), 1)
 
-            pngs = self._get_greyscale(pdf)
+            pngs = self._get_greyscale(doc)
 
             try:
                 text = self._get_ocr(pngs)
             except OCRError:
-                self._ignore.append(pdf)
-                self._render("OCR FAILURE: {}".format(pdf), 0)
+                self._ignore.append(doc)
+                self._render("OCR FAILURE: {}".format(doc), 0)
                 continue
 
-            self._store(text, pdf)
-            self._cleanup(pngs, pdf)
+            self._store(text, doc)
+            self._cleanup(pngs, doc)
 
     def _setup(self):
 
@@ -116,29 +118,29 @@ def _setup(self):
             raise CommandError("Consumption directory {} does not exist".format(
                 self.CONSUME))
 
-        for d in (self.SCRATCH, self.MEDIA_PDF):
+        for d in (self.SCRATCH, self.MEDIA_DOCS):
             try:
                 os.makedirs(d)
             except FileExistsError:
                 pass
 
-    def _is_ready(self, pdf):
+    def _is_ready(self, doc):
         """
-        Detect whether `pdf` is ready to consume or if it's still being written
+        Detect whether `doc` is ready to consume or if it's still being written
         to by the scanner.
         """
 
-        t = os.stat(pdf).st_mtime
+        t = os.stat(doc).st_mtime
 
-        if self.stats.get(pdf) == t:
-            del(self.stats[pdf])
+        if self.stats.get(doc) == t:
+            del(self.stats[doc])
             return True
 
-        self.stats[pdf] = t
+        self.stats[doc] = t
 
         return False
 
-    def _get_greyscale(self, pdf):
+    def _get_greyscale(self, doc):
 
         self._render("  Generating greyscale image", 2)
 
@@ -147,14 +149,14 @@ def _get_greyscale(self, pdf):
 
         subprocess.Popen((
             self.CONVERT, "-density", "300", "-depth", "8",
-            "-type", "grayscale", pdf, png
+            "-type", "grayscale", doc, png
         )).wait()
 
         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
 
     def _get_ocr(self, pngs):
 
-        self._render("  OCRing the PDF", 2)
+        self._render("  OCRing the document", 2)
 
         raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
 
@@ -203,19 +205,22 @@ def _ocr(self, pngs, lang):
         # Strip out excess white space to allow matching to go smoother
         return re.sub(r"\s+", " ", r)
 
-    def _store(self, text, pdf):
+    def _store(self, text, doc):
 
-        sender, title = self._parse_file_name(pdf)
-        relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]
+        sender, title, file_type = self._parse_file_name(doc)
 
-        stats = os.stat(pdf)
+        lower_text = text.lower()
+        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
+
+        stats = os.stat(doc)
 
         self._render("  Saving record to database", 2)
 
-        doc = Document.objects.create(
+        document = Document.objects.create(
             sender=sender,
             title=title,
             content=text,
+            file_type=file_type,
             created=timezone.make_aware(
                 datetime.datetime.fromtimestamp(stats.st_mtime)),
             modified=timezone.make_aware(
@@ -225,38 +230,38 @@ def _store(self, text, pdf):
         if relevant_tags:
             tag_names = ", ".join([t.slug for t in relevant_tags])
             self._render("    Tagging with {}".format(tag_names), 2)
-            doc.tags.add(*relevant_tags)
+            document.tags.add(*relevant_tags)
 
-        with open(pdf, "rb") as unencrypted:
-            with open(doc.pdf_path, "wb") as encrypted:
+        with open(doc, "rb") as unencrypted:
+            with open(document.source_path, "wb") as encrypted:
                 self._render("  Encrypting", 3)
                 encrypted.write(GnuPG.encrypted(unencrypted))
 
-    def _parse_file_name(self, pdf):
+    def _parse_file_name(self, doc):
         """
         We use a crude naming convention to make handling the sender and title
         easier:
-          "sender - title.pdf"
+          "<sender> - <title>.<suffix>"
         """
 
-        # First we attempt "sender - title.pdf"
-        m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
+        # First we attempt "<sender> - <title>.<suffix>"
+        m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
         if m:
-            sender_name, title = m.group(1), m.group(2)
+            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
             sender, __ = Sender.objects.get_or_create(
                 name=sender_name, defaults={"slug": slugify(sender_name)})
-            return sender, title
+            return sender, title, file_type.lower().replace("jpeg", "jpg")
 
         # That didn't work, so we assume sender is None
-        m = re.match(self.PARSER_REGEX_TITLE, pdf)
-        return None, m.group(1)
+        m = re.match(self.PARSER_REGEX_TITLE, doc)
+        return None, m.group(1), m.group(2).lower().replace("jpeg", "jpg")
 
-    def _cleanup(self, pngs, pdf):
+    def _cleanup(self, pngs, doc):
 
         png_glob = os.path.join(
             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
 
-        for f in list(glob.glob(png_glob)) + [pdf]:
+        for f in list(glob.glob(png_glob)) + [doc]:
             self._render("  Deleting {}".format(f), 2)
             os.unlink(f)
 

diff --git a/src/documents/migrations/0008_document_file_type.py b/src/documents/migrations/0008_document_file_type.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9 on 2016-01-29 22:58
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0007_auto_20160126_2114'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='file_type',
+            field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF')], default='pdf', editable=False, max_length=4),
+            preserve_default=False,
+        ),
+    ]
diff --git a/src/documents/models.py b/src/documents/models.py
@@ -111,10 +111,22 @@ def save(self, *args, **kwargs):
 
 class Document(models.Model):
 
+    TYPE_PDF = "pdf"
+    TYPE_PNG = "png"
+    TYPE_JPG = "jpg"
+    TYPE_GIF = "gif"
+    TYPE_TIF = "tiff"
+    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
+
     sender = models.ForeignKey(
         Sender, blank=True, null=True, related_name="documents")
     title = models.CharField(max_length=128, blank=True, db_index=True)
     content = models.TextField(db_index=True)
+    file_type = models.CharField(
+        max_length=4,
+        editable=False,
+        choices=tuple([(t, t.upper()) for t in TYPES])
+    )
     tags = models.ManyToManyField(Tag, related_name="documents")
     created = models.DateTimeField(default=timezone.now, editable=False)
     modified = models.DateTimeField(auto_now=True, editable=False)
@@ -131,20 +143,19 @@ def __str__(self):
         return str(created)
 
     @property
-    def pdf_path(self):
+    def source_path(self):
         return os.path.join(
             settings.MEDIA_ROOT,
             "documents",
-            "pdf",
-            "{:07}.pdf.gpg".format(self.pk)
+            "{:07}.{}.gpg".format(self.pk, self.file_type)
         )
 
     @property
-    def pdf(self):
-        return open(self.pdf_path, "rb")
+    def source_file(self):
+        return open(self.source_path, "rb")
 
     @property
     def parseable_file_name(self):
         if self.sender and self.title:
-            return "{} - {}.pdf".format(self.sender, self.title)
-        return os.path.basename(self.pdf_path)
+            return "{} - {}.{}".format(self.sender, self.title, self.file_type)
+        return os.path.basename(self.source_path)
diff --git a/src/documents/views.py b/src/documents/views.py
@@ -16,9 +16,19 @@ def render_to_response(self, context, **response_kwargs):
         Override the default to return the unencrypted PDF as raw data.
         """
 
+        content_types = {
+            Document.TYPE_PDF: "application/pdf",
+            Document.TYPE_PNG: "image/png",
+            Document.TYPE_JPG: "image/jpeg",
+            Document.TYPE_GIF: "image/gif",
+            Document.TYPE_TIF: "image/tiff",
+        }
+
         response = HttpResponse(
-            GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
+            GnuPG.decrypted(self.object.source_file),
+            content_type=content_types[self.object.file_type]
+        )
         response["Content-Disposition"] = 'attachment; filename="{}"'.format(
-            slugify(str(self.object)) + ".pdf")
+            slugify(str(self.object)) + "." + self.object.file_type)
 
         return response