From 8f3e9884c77515777a7747fca3d350947f768429 Mon Sep 17 00:00:00 2001 From: DanAtShenTech Date: Mon, 15 Apr 2019 19:47:22 -0400 Subject: [PATCH 1/7] Pass None to async_transcribe_document() options parameter is required but no value is necessary when saving a document --- src/wagtail_textract/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wagtail_textract/models.py b/src/wagtail_textract/models.py index 7091a33..7478cf1 100644 --- a/src/wagtail_textract/models.py +++ b/src/wagtail_textract/models.py @@ -18,7 +18,7 @@ def save(self, **kwargs): transcribe = kwargs.pop('transcribe', True) super(TranscriptionMixin, self).save(**kwargs) if transcribe: - async_transcribe_document(self) + async_transcribe_document(self, None) class Document(TranscriptionMixin, WagtailDocument): From 63144cc5365a0083610f170831844ad25792b1a1 Mon Sep 17 00:00:00 2001 From: DanAtShenTech Date: Mon, 15 Apr 2019 19:58:42 -0400 Subject: [PATCH 2/7] Add --slice, --dry-run, --verbosity arguments Output is determined by --verbosity: only output the transcribed text if verbosity is 3. Order documents by title to ensure iterating in the same order each time. When verbosity is >= 1, output the document number and the slice information. Output count of documents that will be transcribed and blank lines for readability. 
--- .../commands/transcribe_documents.py | 43 ++++++++++++++++--- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/src/wagtail_textract/management/commands/transcribe_documents.py b/src/wagtail_textract/management/commands/transcribe_documents.py index 7e836ec..8ef13ec 100644 --- a/src/wagtail_textract/management/commands/transcribe_documents.py +++ b/src/wagtail_textract/management/commands/transcribe_documents.py @@ -1,15 +1,48 @@ from django.core.management.base import BaseCommand - from wagtail.documents.models import get_document_model - from wagtail_textract.handlers import async_transcribe_document class Command(BaseCommand): """Extract text from all Documents.""" + help = 'Extract text from Documents' + def add_arguments(self, parser): + # Named (optional) arguments + parser.add_argument('-s', '--slice', type=str, help="Transcribe a subset of documents using Python's basic slicing syntax") + parser.add_argument('-d', '--dry-run', action='store_true', dest='dry_run', help="Show what actions will be undertaken with a given transcribe command and its associated parameters") + def handle(self, *args, **options): """Extract text from all Documents.""" - for document in get_document_model().objects.all(): - self.stdout.write("Transcribing %s" % document) - async_transcribe_document(document) + ctr = 1 + slice_ctr = 0 + if options['slice']: + slices = [x for x in options['slice'].split(':') if x] + if len(slices) == 2: + docs = get_document_model().objects.all().order_by('title')[int(slices[0]):int(slices[1])] + slice_ctr = int(slices[0]) + elif options['slice'].startswith(':') and len(slices) == 1: + docs = get_document_model().objects.all().order_by('title')[:int(slices[0])] + elif options['slice'].endswith(':') and len(slices) == 1: + docs = get_document_model().objects.all().order_by('title')[int(slices[0]):] + slice_ctr = int(slices[0]) + else: + docs = get_document_model().objects.all().order_by('title') + else: + docs = 
get_document_model().objects.all().order_by('title') + + if options['dry_run']: + self.stdout.write("\n{:,} documents will be transcribed\n\n".format( docs.count())) + else: + self.stdout.write("\nStarting Transcription of {:,} documents\n\n".format( docs.count())) + for document in docs: + if options['verbosity'] >= 1: + print("{:,} (-s {}:{}) - {}".format(ctr, slice_ctr, slice_ctr + 1, document)) + if not options['dry_run']: + async_transcribe_document(document, options) + ctr += 1 + slice_ctr += 1 + if not options['dry_run']: + self.stdout.write("\n{:,} documents being processed asynchonously\n\n--- AWAITING COMPLETION ---\n\n".format( docs.count())) + else: + self.stdout.write("") From 28c8c2d53ed6dde53f5ab2659558d3ac1554289f Mon Sep 17 00:00:00 2001 From: DanAtShenTech Date: Mon, 15 Apr 2019 20:09:31 -0400 Subject: [PATCH 3/7] Implement verbosity argument Only output statement about falling back to tesseract if verbosity >= 2. Only output transcribed text if verbosity >= 3. --- src/wagtail_textract/handlers.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/wagtail_textract/handlers.py b/src/wagtail_textract/handlers.py index 4f7c64b..9ea2166 100644 --- a/src/wagtail_textract/handlers.py +++ b/src/wagtail_textract/handlers.py @@ -6,12 +6,13 @@ loop = asyncio.get_event_loop() -def transcribe_document(document): +def transcribe_document(document, options): """Store the Document file's text in the transcription field.""" try: text = textract.process(document.file.path).strip() if not text: - logger.debug('No text found, falling back to tesseract.') + if 'verbosity' in options and options['verbosity'] >= 2: + print('No text found - falling back to tesseract: {} ({})'.format(document, document.filename)) text = textract.process( document.file.path, method='tesseract', @@ -20,7 +21,7 @@ def transcribe_document(document): except Exception as err: text = None logger.error( - 'Text extraction error with file {file}: 
{message}'.format( + '\n\nText extraction error with file {file}: {message}\n\n'.format( file=document.filename, message=str(err), ) @@ -29,11 +30,13 @@ def transcribe_document(document): if text: document.transcription = text.decode() document.save(transcribe=False) - print("Saved transcription: %s" % text) + if 'verbosity' in options and options['verbosity'] == 3: + print("Saved transcription for {}:\n{}\n".format(document, text)) else: - logger.error('No text found.') + logger.error('No text found: {} ({})'.format(document, document.filename)) -def async_transcribe_document(document): +def async_transcribe_document(document, options): """Defer transcription to an asyncio executor.""" - loop.run_in_executor(None, transcribe_document, document) + loop.run_in_executor(None, transcribe_document, document, options) + From f3067957b276f16f6256844bbfbd4ac4c0343153 Mon Sep 17 00:00:00 2001 From: DanAtShenTech Date: Mon, 15 Apr 2019 23:43:39 -0400 Subject: [PATCH 4/7] Document --slice, --verbosity, and --dry-run arguments Arguments added to the transcribe_documents management command. --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 815049e..0862244 100644 --- a/README.md +++ b/README.md @@ -73,13 +73,26 @@ Create a `tessdata` directory in your project directory, and download the Transcription is done automatically after Document save, in an [`asyncio`][7] executor to prevent blocking the response during processing. -To transcribe all existing Documents, run the management command:: +### Transcribe existing documents + +To transcribe all existing Documents, run the management command: ./manage.py transcribe_documents -This may take a long time, obviously. +Transcribing every document may take a long time. To transcribe a subset of documents, include a slice notation that will be applied to the document queryset (the alternative `-s` syntax may also be used, e.g. 
`-s 4:7`): + + ./manage.py transcribe_documents --slice 4:7 + +To control the amount of text written to the terminal while transcribing, set `--verbosity` to a level between 0 and 3 (the alternative `-v` syntax may also be used, e.g. `-v 2`): + + ./manage.py transcribe_documents --slice 4:7 --verbosity 2 + +Verbosity level 0 outputs only the number of documents to be transcribed. Verbosity level 1 also outputs the name of each document, the number of the document in the subject queryset, and the slice notation that would cause a particular document to be transcribed. Verbosity level 2 also outputs a message when Tesseract is invoked. Verbosity level 3 adds to the output of levels 0 through 2 by outputting the text that was transcribed for a document. +To do a dry run without actually starting transcription, include the `--dry-run` flag (the alternative `-d` syntax may also be used): + ./manage.py transcribe_documents --slice 4:7 --verbosity 2 --dry-run + ## Usage in custom view Here is a code example for a search view (outside Wagtail's admin interface) From bd1dbc7bb125206d2d5f812bef64da847a6f2e46 Mon Sep 17 00:00:00 2001 From: DanAtShenTech Date: Tue, 23 Apr 2019 09:45:24 -0400 Subject: [PATCH 5/7] Update requirement for six library: >=1.11,<2.0 >=1.11,<2.0 is the current Wagtail requirement for six --- travis-textract-requirements/python.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis-textract-requirements/python.txt b/travis-textract-requirements/python.txt index 98fb9b0..3a43ca9 100644 --- a/travis-textract-requirements/python.txt +++ b/travis-textract-requirements/python.txt @@ -12,5 +12,5 @@ xlrd==1.0.0 EbookLib==0.16 SpeechRecognition==3.7.1 https://github.com/mattgwwalker/msg-extractor/zipball/master -six==1.10.0 +six>=1.11,<2.0 pocketsphinx==0.1.3 From 550c7f35e9de5480e4cc7a6e21175370a00a2710 Mon Sep 17 00:00:00 2001 From: Dan Swain Date: Fri, 24 Jul 2020 08:25:17 -0400 Subject: [PATCH 6/7] Update tox.ini through Wagtail 2.9 
--- tox.ini | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index d144a6e..669c12a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,15 +1,21 @@ [tox] envlist = py{34,35,36}-dj{20}-wt{20,21,22} - py{35,36}-dj{21}-wt{23} - py{35,36,37}-dj{21}-wt{24} + py{35,36}-dj{20,21}-wt{23} + py{35,36,37}-dj{20,21}-wt{24} + py{35,36,37}-dj{20,21,22}-wt{25,26} + py{35,36,37,38}-dj{20,21,22}-wt{27} + py{36,37,38}-dj{21,22,30}-wt{28} + py{36,37,38}-dj{22,30}-wt{29} [testenv] basepython = py34: python3.4 - py35: python3.5 + py35: python3.5 py36: python3.6 py37: python3.7 + py38: python3.8 + deps = pytest @@ -17,12 +23,18 @@ deps = coverage codecov dj20: Django>=2.0,<2.1 - dj21: Django>=2.1,<2.2 + dj21: Django>=2.1,<2.2 + dj22: Django>=2.2,<2.3 + dj30: Django>=3.0,<3.1 wt20: wagtail>=2.0,<2.1 wt21: wagtail>=2.1,<2.2 wt22: wagtail>=2.2,<2.3 wt23: wagtail>=2.3,<2.4 wt24: wagtail>=2.4,<2.5 + wt25: wagtail>=2.5,<2.6 + wt26: wagtail>=2.6,<2.7 + wt27: wagtail>=2.7,<2.8 + wt28: wagtail>=2.8,<2.9 whitelist_externals = make From d378fcfbfb272ab3bdd240054d264c90a11b595b Mon Sep 17 00:00:00 2001 From: Dan Swain Date: Fri, 24 Jul 2020 08:39:52 -0400 Subject: [PATCH 7/7] Update tox.ini - remove Py 3.4 support --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 669c12a..c6e8c2c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] envlist = - py{34,35,36}-dj{20}-wt{20,21,22} + py{35,36}-dj{20}-wt{20,21,22} py{35,36}-dj{20,21}-wt{23} py{35,36,37}-dj{20,21}-wt{24} py{35,36,37}-dj{20,21,22}-wt{25,26} @@ -10,7 +10,6 @@ envlist = [testenv] basepython = - py34: python3.4 py35: python3.5 py36: python3.6 py37: python3.7