From 46f8f492f57f1b5b43b6674197f71f87660f7c2f Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Sun, 14 Feb 2016 17:40:37 +0100 Subject: [PATCH 1/2] Safely and non-randomly create scratch directory Creating the scratch files in `_get_greyscale` using a random integer is, for one, inherently unsafe and can cause a collision. For another, it should be unnecessary, given that the files will be cleaned up after the OCR run. Since we don't know if OCR runs might be parallel in the future, this commit implements thread-safe and deterministic directory creation. Additionally, it fixes the call to `_cleanup` by `consume`: in the current implementation, `_cleanup` will not be called if the last consumed document failed with an `OCRError`. --- src/documents/consumer.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index c432ee261..d7ee0e9ee 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,15 +1,16 @@ import datetime -import glob +import tempfile from multiprocessing.pool import Pool import itertools + import langdetect import os -import random import re import subprocess import pyocr +import shutil from PIL import Image @@ -111,34 +112,35 @@ def consume(self): Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) - pngs = self._get_greyscale(doc) + tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) + pngs = self._get_greyscale(tempdir, doc) try: text = self._get_ocr(pngs) + self._store(text, doc) except OCRError: self._ignore.append(doc) Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) continue + finally: + self._cleanup(tempdir, doc) - self._store(text, doc) - self._cleanup(pngs, doc) - - def _get_greyscale(self, doc): + def _get_greyscale(self, tempdir, doc): Log.debug( "Generating greyscale image from {}".format(doc), Log.COMPONENT_CONSUMER ) - i = 
random.randint(1000000, 9999999) - png = os.path.join(self.SCRATCH, "{}.png".format(i)) + png = os.path.join(tempdir, "convert.png") subprocess.Popen(( self.CONVERT, "-density", "300", "-depth", "8", "-type", "grayscale", doc, png )).wait() - return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) + pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")] + return sorted(filter(lambda f: os.path.isfile(f), pngs)) @staticmethod def _guess_language(text): @@ -303,14 +305,14 @@ def _store(self, text, doc): Log.debug("Encrypting", Log.COMPONENT_CONSUMER) encrypted.write(GnuPG.encrypted(unencrypted)) - def _cleanup(self, pngs, doc): - - png_glob = os.path.join( - self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) + def _cleanup(self, tempdir, doc): + # Remove temporary directory recursively + Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER) + shutil.rmtree(tempdir) - for f in list(glob.glob(png_glob)) + [doc]: - Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER) - os.unlink(f) + # Remove doc + Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) + os.unlink(doc) def _is_ready(self, doc): """ From 6f95b052872a669b3f710d9b86783c9fd1f398db Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 00:10:05 +0000 Subject: [PATCH 2/2] Support appropriate sorting for long documents --- src/documents/consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index d7ee0e9ee..5ca42813b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -132,7 +132,7 @@ def _get_greyscale(self, tempdir, doc): Log.COMPONENT_CONSUMER ) - png = os.path.join(tempdir, "convert.png") + png = os.path.join(tempdir, "convert-%04d.jpg") subprocess.Popen(( self.CONVERT, "-density", "300", "-depth", "8",