Skip to content

Commit

Permalink
Merge pull request #5 from jina-ai/docs-manifest
Browse files Browse the repository at this point in the history
  • Loading branch information
cristianmtr authored Apr 4, 2022
2 parents 63cc428 + 778c13c commit 5c4b722
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
8 changes: 5 additions & 3 deletions pdf_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@ def _parse_pdf(self, doc: Document):
pdf_img = None
pdf_text = None
try:
if doc.uri:
pdf_img = fitz.open(doc.uri)
pdf_text = pdfplumber.open(doc.uri)
# when a document has both a blob and a URI, we should prioritize the blob
# order is important; see test `tests/unit/test_exec.py::test_order_blob_uri`
if doc.blob:
pdf_img = fitz.open(stream=doc.blob, filetype='pdf')
pdf_text = pdfplumber.open(io.BytesIO(doc.blob))
elif doc.uri:
pdf_img = fitz.open(doc.uri)
pdf_text = pdfplumber.open(doc.uri)
except Exception as ex:
self.logger.error(f'Failed to open due to: {ex}')
return pdf_img, pdf_text
Expand Down
12 changes: 12 additions & 0 deletions tests/unit/test_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,15 @@ def test_io_img(executor_from_config, test_dir, doc_generator_img):
assert tensor.shape == (660, 1024, 3)
if idx == 1:
assert tensor.shape == (626, 1191, 3)


def test_order_blob_uri(executor_from_config):
    """Craft a PDF document created from a URI after `load_uri_to_blob()`.

    The document is built from a file path, `load_uri_to_blob()` is called
    on it, and it is then passed through the executor's `craft` endpoint.
    The test passes when the crafted document ends up with at least one
    chunk.
    """
    source = Document(uri='tests/data/cats_are_awesome.pdf')
    source.load_uri_to_blob()
    batch = DocumentArray(source)

    # This scenario is why the branch order matters in the segmenter's
    # `_parse_pdf` method.
    executor_from_config.craft(batch)

    crafted = batch[0]
    assert len(crafted.chunks) > 0

0 comments on commit 5c4b722

Please sign in to comment.