From 98b147b62249d9fbb96764785740869cb48cfe62 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sun, 14 Feb 2021 17:08:29 +0100 Subject: [PATCH] better sanity checker that logs messages in the log files and does not fail on warnings. --- docs/changelog.rst | 6 +- .../commands/document_sanity_checker.py | 16 +-- src/documents/sanity_checker.py | 100 ++++++++------- src/documents/tasks.py | 14 ++- src/documents/tests/test_management.py | 27 ++-- src/documents/tests/test_sanity_check.py | 119 +++++++++++++++--- src/documents/tests/test_tasks.py | 36 ++++-- 7 files changed, 204 insertions(+), 114 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index fcd14c59d..e5048a097 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -10,8 +10,10 @@ paperless-ng 1.1.2 * Always show top left corner of thumbnails, even for extra wide documents. -* Added a management command for executing the sanity checker directly. See :ref:`utilities-sanity-checker`. - +* Added a management command for executing the sanity checker directly. + See :ref:`utilities-sanity-checker`. + The sanity checker will also report errors in the log files. + * Fixed an issue with the metadata tab not reporting anything in case of missing files. * Reverted a change from 1.1.0 that caused huge memory usage due to redis caching. diff --git a/src/documents/management/commands/document_sanity_checker.py b/src/documents/management/commands/document_sanity_checker.py index b5982ed90..c04a5efef 100644 --- a/src/documents/management/commands/document_sanity_checker.py +++ b/src/documents/management/commands/document_sanity_checker.py @@ -1,8 +1,5 @@ -import logging from django.core.management.base import BaseCommand -from documents.sanity_checker import check_sanity, SanityError, SanityWarning - -logger = logging.getLogger("paperless.management.sanity_checker") +from documents.sanity_checker import check_sanity class Command(BaseCommand): @@ -15,13 +12,4 @@ def handle(self, *args, **options): messages = check_sanity(progress=True) - if len(messages) == 0: - logger.info("No issues found.") - else: - for msg in messages: - if type(msg) == SanityError: - logger.error(str(msg)) - elif type(msg) == SanityWarning: - logger.warning(str(msg)) - else: - logger.info((str(msg))) + messages.log_messages() diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py index f42062154..2e1777841 100644 --- a/src/documents/sanity_checker.py +++ b/src/documents/sanity_checker.py @@ -1,4 +1,5 @@ import hashlib +import logging import os from django.conf import settings @@ -7,40 +8,48 @@ from documents.models import Document -class SanityMessage: - message = None +class SanityCheckMessages: + def __init__(self): + self._messages = [] -class SanityWarning(SanityMessage): - def __init__(self, message): - self.message = message + def error(self, message): + self._messages.append({"level": logging.ERROR, "message": message}) - def __str__(self): - return f"Warning: {self.message}" + def warning(self, message): + self._messages.append({"level": logging.WARNING, "message": message}) + def info(self, message): + self._messages.append({"level": logging.INFO, "message": message}) -class SanityError(SanityMessage): - def __init__(self, message): - self.message = message + def log_messages(self): + logger = logging.getLogger("paperless.sanity_checker") - def __str__(self): - return f"ERROR: {self.message}" + if len(self._messages) == 0: + logger.info("Sanity checker detected no issues.") + else: + for msg in self._messages: + logger.log(msg['level'], msg['message']) + + def __len__(self): + return len(self._messages) + + def __getitem__(self, item): + return self._messages[item] + def has_error(self): + return any([msg['level'] == logging.ERROR for msg in self._messages]) -class SanityFailedError(Exception): + def has_warning(self): + return any([msg['level'] == logging.WARNING for msg in self._messages]) - def __init__(self, messages): - self.messages = messages - def __str__(self): - message_string = "\n".join([str(m) for m in self.messages]) - return ( - f"The following issuse were found by the sanity checker:\n" - f"{message_string}\n\n===============\n\n") +class SanityCheckFailedException(Exception): + pass def check_sanity(progress=False): - messages = [] + messages = SanityCheckMessages() present_files = [] for root, subdirs, files in os.walk(settings.MEDIA_ROOT): @@ -59,8 +68,7 @@ def check_sanity(progress=False): for doc in docs: # Check sanity of the thumbnail if not os.path.isfile(doc.thumbnail_path): - messages.append(SanityError( - f"Thumbnail of document {doc.pk} does not exist.")) + messages.error(f"Thumbnail of document {doc.pk} does not exist.") else: if os.path.normpath(doc.thumbnail_path) in present_files: present_files.remove(os.path.normpath(doc.thumbnail_path)) @@ -68,15 +76,14 @@ def check_sanity(progress=False): with doc.thumbnail_file as f: f.read() except OSError as e: - messages.append(SanityError( + messages.error( f"Cannot read thumbnail file of document {doc.pk}: {e}" - )) + ) # Check sanity of the original file # TODO: extract method if not os.path.isfile(doc.source_path): - messages.append(SanityError( - f"Original of document {doc.pk} does not exist.")) + messages.error(f"Original of document {doc.pk} does not exist.") else: if os.path.normpath(doc.source_path) in present_files: present_files.remove(os.path.normpath(doc.source_path)) @@ -84,31 +91,31 @@ def check_sanity(progress=False): with doc.source_file as f: checksum = hashlib.md5(f.read()).hexdigest() except OSError as e: - messages.append(SanityError( - f"Cannot read original file of document {doc.pk}: {e}")) + messages.error( + f"Cannot read original file of document {doc.pk}: {e}") else: if not checksum == doc.checksum: - messages.append(SanityError( + messages.error( f"Checksum mismatch of document {doc.pk}. " f"Stored: {doc.checksum}, actual: {checksum}." - )) + ) # Check sanity of the archive file. if doc.archive_checksum and not doc.archive_filename: - messages.append(SanityError( + messages.error( f"Document {doc.pk} has an archive file checksum, but no " f"archive filename." - )) + ) elif not doc.archive_checksum and doc.archive_filename: - messages.append(SanityError( + messages.error( f"Document {doc.pk} has an archive file, but its checksum is " f"missing." - )) + ) elif doc.has_archive_version: if not os.path.isfile(doc.archive_path): - messages.append(SanityError( + messages.error( f"Archived version of document {doc.pk} does not exist." - )) + ) else: if os.path.normpath(doc.archive_path) in present_files: present_files.remove(os.path.normpath(doc.archive_path)) @@ -116,26 +123,23 @@ def check_sanity(progress=False): with doc.archive_file as f: checksum = hashlib.md5(f.read()).hexdigest() except OSError as e: - messages.append(SanityError( + messages.error( f"Cannot read archive file of document {doc.pk}: {e}" - )) + ) else: if not checksum == doc.archive_checksum: - messages.append(SanityError( + messages.error( f"Checksum mismatch of archived document " f"{doc.pk}. " - f"Stored: {doc.checksum}, actual: {checksum}." - )) + f"Stored: {doc.archive_checksum}, " + f"actual: {checksum}." + ) # other document checks if not doc.content: - messages.append(SanityWarning( - f"Document {doc.pk} has no content." - )) + messages.info(f"Document {doc.pk} has no content.") for extra_file in present_files: - messages.append(SanityWarning( - f"Orphaned file in media dir: {extra_file}" - )) + messages.warning(f"Orphaned file in media dir: {extra_file}") return messages diff --git a/src/documents/tasks.py b/src/documents/tasks.py index f74a3d420..ddfd2eac3 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -9,8 +9,7 @@ from documents.classifier import DocumentClassifier, load_classifier from documents.consumer import Consumer, ConsumerError from documents.models import Document, Tag, DocumentType, Correspondent -from documents.sanity_checker import SanityFailedError - +from documents.sanity_checker import SanityCheckFailedException logger = logging.getLogger("paperless.tasks") @@ -94,8 +93,15 @@ def consume_file(path, def sanity_check(): messages = sanity_checker.check_sanity() - if len(messages) > 0: - raise SanityFailedError(messages) + messages.log_messages() + + if messages.has_error(): + raise SanityCheckFailedException( + "Sanity check failed with errors. See log.") + elif messages.has_warning(): + return "Sanity check exited with warnings. See log." + elif len(messages) > 0: + return "Sanity check exited with infos. See log." else: return "No issues detected." diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 56710c6e5..40eaaf277 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -159,33 +159,20 @@ def test_create_classifier(self, m): class TestSanityChecker(DirectoriesMixin, TestCase): - def test_no_errors(self): + def test_no_issues(self): with self.assertLogs() as capture: call_command("document_sanity_checker") self.assertEqual(len(capture.output), 1) - self.assertIn("No issues found.", capture.output[0]) + self.assertIn("Sanity checker detected no issues.", capture.output[0]) - @mock.patch("documents.management.commands.document_sanity_checker.logger.warning") - @mock.patch("documents.management.commands.document_sanity_checker.logger.error") - def test_warnings(self, error, warning): - doc = Document.objects.create(title="test", filename="test.pdf", checksum="d41d8cd98f00b204e9800998ecf8427e") - Path(doc.source_path).touch() - Path(doc.thumbnail_path).touch() - - call_command("document_sanity_checker") - - error.assert_not_called() - warning.assert_called() - - @mock.patch("documents.management.commands.document_sanity_checker.logger.warning") - @mock.patch("documents.management.commands.document_sanity_checker.logger.error") - def test_errors(self, error, warning): + def test_errors(self): doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc") Path(doc.source_path).touch() Path(doc.thumbnail_path).touch() - call_command("document_sanity_checker") + with self.assertLogs() as capture: + call_command("document_sanity_checker") - warning.assert_not_called() - error.assert_called() + self.assertEqual(len(capture.output), 1) + self.assertIn("Checksum mismatch of document", capture.output[0]) diff --git a/src/documents/tests/test_sanity_check.py b/src/documents/tests/test_sanity_check.py index fae18aaca..6f73e4962 100644 --- a/src/documents/tests/test_sanity_check.py +++ b/src/documents/tests/test_sanity_check.py @@ -1,3 +1,4 @@ +import logging import os import shutil from pathlib import Path @@ -7,10 +8,59 @@ from django.test import TestCase from documents.models import Document -from documents.sanity_checker import check_sanity, SanityFailedError +from documents.sanity_checker import check_sanity, SanityCheckMessages from documents.tests.utils import DirectoriesMixin +class TestSanityCheckMessages(TestCase): + + def test_no_messages(self): + messages = SanityCheckMessages() + self.assertEqual(len(messages), 0) + self.assertFalse(messages.has_error()) + self.assertFalse(messages.has_warning()) + with self.assertLogs() as capture: + messages.log_messages() + self.assertEqual(len(capture.output), 1) + self.assertEqual(capture.records[0].levelno, logging.INFO) + self.assertEqual(capture.records[0].message, "Sanity checker detected no issues.") + + def test_info(self): + messages = SanityCheckMessages() + messages.info("Something might be wrong") + self.assertEqual(len(messages), 1) + self.assertFalse(messages.has_error()) + self.assertFalse(messages.has_warning()) + with self.assertLogs() as capture: + messages.log_messages() + self.assertEqual(len(capture.output), 1) + self.assertEqual(capture.records[0].levelno, logging.INFO) + self.assertEqual(capture.records[0].message, "Something might be wrong") + + def test_warning(self): + messages = SanityCheckMessages() + messages.warning("Something is wrong") + self.assertEqual(len(messages), 1) + self.assertFalse(messages.has_error()) + self.assertTrue(messages.has_warning()) + with self.assertLogs() as capture: + messages.log_messages() + self.assertEqual(len(capture.output), 1) + self.assertEqual(capture.records[0].levelno, logging.WARNING) + self.assertEqual(capture.records[0].message, "Something is wrong") + + def test_error(self): + messages = SanityCheckMessages() + messages.error("Something is seriously wrong") + self.assertEqual(len(messages), 1) + self.assertTrue(messages.has_error()) + self.assertFalse(messages.has_warning()) + with self.assertLogs() as capture: + messages.log_messages() + self.assertEqual(len(capture.output), 1) + self.assertEqual(capture.records[0].levelno, logging.ERROR) + self.assertEqual(capture.records[0].message, "Something is seriously wrong") + class TestSanityCheck(DirectoriesMixin, TestCase): def make_test_data(self): @@ -33,72 +83,105 @@ def test_success(self): def test_no_thumbnail(self): doc = self.make_test_data() os.remove(doc.thumbnail_path) - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Thumbnail of document .* does not exist") def test_thumbnail_no_access(self): doc = self.make_test_data() os.chmod(doc.thumbnail_path, 0o000) - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Cannot read thumbnail file of document") os.chmod(doc.thumbnail_path, 0o777) def test_no_original(self): doc = self.make_test_data() os.remove(doc.source_path) - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Original of document .* does not exist.") def test_original_no_access(self): doc = self.make_test_data() os.chmod(doc.source_path, 0o000) - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Cannot read original file of document") os.chmod(doc.source_path, 0o777) def test_original_checksum_mismatch(self): doc = self.make_test_data() doc.checksum = "WOW" doc.save() - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Checksum mismatch of document") def test_no_archive(self): doc = self.make_test_data() os.remove(doc.archive_path) - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Archived version of document .* does not exist.") def test_archive_no_access(self): doc = self.make_test_data() os.chmod(doc.archive_path, 0o000) - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Cannot read archive file of document") os.chmod(doc.archive_path, 0o777) def test_archive_checksum_mismatch(self): doc = self.make_test_data() doc.archive_checksum = "WOW" doc.save() - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Checksum mismatch of archived document") def test_empty_content(self): doc = self.make_test_data() doc.content = "" doc.save() - self.assertEqual(len(check_sanity()), 1) + messages = check_sanity() + self.assertFalse(messages.has_error()) + self.assertFalse(messages.has_warning()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Document .* has no content.") def test_orphaned_file(self): doc = self.make_test_data() Path(self.dirs.originals_dir, "orphaned").touch() - self.assertEqual(len(check_sanity()), 1) - - def test_error_tostring(self): - Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf", archive_filename="0000001.pdf") - string = str(SanityFailedError(check_sanity())) - self.assertIsNotNone(string) + messages = check_sanity() + self.assertFalse(messages.has_error()) + self.assertTrue(messages.has_warning()) + self.assertEqual(len(messages), 1) + self.assertRegex(messages[0]['message'], "Orphaned file in media dir") def test_archive_filename_no_checksum(self): doc = self.make_test_data() doc.archive_checksum = None doc.save() - self.assertEqual(len(check_sanity()), 2) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 2) + self.assertRegex(messages[0]['message'], "has an archive file, but its checksum is missing.") def test_archive_checksum_no_filename(self): doc = self.make_test_data() doc.archive_filename = None doc.save() - self.assertEqual(len(check_sanity()), 2) + messages = check_sanity() + self.assertTrue(messages.has_error()) + self.assertEqual(len(messages), 2) + self.assertRegex(messages[0]['message'], "has an archive file checksum, but no archive filename.") diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 8c5a1b5df..dab8ebfb9 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -2,12 +2,12 @@ from unittest import mock from django.conf import settings -from django.test import TestCase, override_settings +from django.test import TestCase from django.utils import timezone from documents import tasks from documents.models import Document, Tag, Correspondent, DocumentType -from documents.sanity_checker import SanityError, SanityFailedError +from documents.sanity_checker import SanityCheckMessages, SanityCheckFailedException from documents.tests.utils import DirectoriesMixin @@ -74,13 +74,33 @@ def test_train_classifier(self): self.assertNotEqual(mtime2, mtime3) @mock.patch("documents.tasks.sanity_checker.check_sanity") - def test_sanity_check(self, m): - m.return_value = [] - tasks.sanity_check() + def test_sanity_check_success(self, m): + m.return_value = SanityCheckMessages() + self.assertEqual(tasks.sanity_check(), "No issues detected.") m.assert_called_once() - m.reset_mock() - m.return_value = [SanityError("")] - self.assertRaises(SanityFailedError, tasks.sanity_check) + + @mock.patch("documents.tasks.sanity_checker.check_sanity") + def test_sanity_check_error(self, m): + messages = SanityCheckMessages() + messages.error("Some error") + m.return_value = messages + self.assertRaises(SanityCheckFailedException, tasks.sanity_check) + m.assert_called_once() + + @mock.patch("documents.tasks.sanity_checker.check_sanity") + def test_sanity_check_warning(self, m): + messages = SanityCheckMessages() + messages.warning("Some warning") + m.return_value = messages + self.assertEqual(tasks.sanity_check(), "Sanity check exited with warnings. See log.") + m.assert_called_once() + + @mock.patch("documents.tasks.sanity_checker.check_sanity") + def test_sanity_check_info(self, m): + messages = SanityCheckMessages() + messages.info("Some info") + m.return_value = messages + self.assertEqual(tasks.sanity_check(), "Sanity check exited with infos. See log.") m.assert_called_once() def test_bulk_update_documents(self):