Merge pull request #5 from jina-ai/docs-manifest

jina-ai · Apr 4, 2022 · 5c4b722
2 parents 63cc428 + 778c13c
commit 5c4b722
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 3 deletions.
diff --git a/pdf_segmenter.py b/pdf_segmenter.py
@@ -54,12 +54,14 @@ def _parse_pdf(self, doc: Document):
         pdf_img = None
         pdf_text = None
         try:
-            if doc.uri:
-                pdf_img = fitz.open(doc.uri)
-                pdf_text = pdfplumber.open(doc.uri)
+            # A document whose URI was loaded into a blob may carry both; prefer the blob.
+            # The branch order matters: see `tests/unit/test_exec.py::test_order_blob_uri`.
             if doc.blob:
                 pdf_img = fitz.open(stream=doc.blob, filetype='pdf')
                 pdf_text = pdfplumber.open(io.BytesIO(doc.blob))
+            elif doc.uri:
+                pdf_img = fitz.open(doc.uri)
+                pdf_text = pdfplumber.open(doc.uri)
         except Exception as ex:
             self.logger.error(f'Failed to open due to: {ex}')
         return pdf_img, pdf_text

diff --git a/tests/unit/test_exec.py b/tests/unit/test_exec.py
@@ -84,3 +84,15 @@ def test_io_img(executor_from_config, test_dir, doc_generator_img):
                     assert tensor.shape == (660, 1024, 3)
                 if idx == 1:
                     assert tensor.shape == (626, 1191, 3)
+
+
+def test_order_blob_uri(executor_from_config):
+    pdf = 'tests/data/cats_are_awesome.pdf'
+    doc = Document(uri=pdf)
+    doc.load_uri_to_blob()
+    docs = DocumentArray(doc)
+
+    # The doc was created from a URI and then loaded into a blob; this is why the branch order in the segmenter's `_parse_pdf` method matters.
+    executor_from_config.craft(docs)
+
+    assert len(docs[0].chunks) > 0