Skip to content

Commit

Permalink
Don't re-download existing images
Browse files Browse the repository at this point in the history
  • Loading branch information
wragge committed Sep 27, 2022
1 parent 5b16e58 commit 88cb726
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 117 deletions.
54 changes: 30 additions & 24 deletions 00_articles.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -165,39 +165,42 @@
" * a list of image file names\n",
" \"\"\"\n",
" images = []\n",
" if output_dir:\n",
" output_path = Path(output_dir)\n",
" output_path.mkdir(exist_ok=True, parents=True)\n",
" else:\n",
" output_path = \"\"\n",
" \n",
" # Get position of article on the page(s)\n",
" boxes = get_article_boxes(article_id)\n",
" for box in boxes:\n",
" cropped_file = Path(\n",
" output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n",
" )\n",
" if not cropped_file.exists():\n",
" # Construct the url we need to download the full page image\n",
" page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n",
"\n",
" # Construct the url we need to download the full page image\n",
" page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n",
" # Download the page image\n",
" response = requests.get(page_url)\n",
"\n",
" # Download the page image\n",
" response = requests.get(page_url)\n",
" # Open download as an image for editing\n",
" img = Image.open(BytesIO(response.content))\n",
"\n",
" # Open download as an image for editing\n",
" img = Image.open(BytesIO(response.content))\n",
" # Use coordinates of the bounding box to crop article\n",
" points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n",
"\n",
" # Use coordinates of the bounding box to crop article\n",
" points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n",
" # Crop image to article box\n",
" cropped = img.crop(points)\n",
"\n",
" # Crop image to article box\n",
" cropped = img.crop(points)\n",
" # Resize if necessary\n",
" if size:\n",
" cropped.thumbnail((size, size), Image.LANCZOS)\n",
"\n",
" # Resize if necessary\n",
" if size:\n",
" cropped.thumbnail((size, size), Image.LANCZOS)\n",
" # Save cropped image\n",
" \n",
"\n",
" # Save cropped image\n",
" if output_dir:\n",
" output_path = Path(output_dir)\n",
" output_path.mkdir(exist_ok=True, parents=True)\n",
" else:\n",
" output_path = \"\"\n",
" cropped_file = Path(\n",
" output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n",
" )\n",
" cropped.save(cropped_file)\n",
" cropped.save(cropped_file)\n",
" images.append(cropped_file.name)\n",
" # print(f'Downloaded: {images}')\n",
" return images\n",
Expand Down Expand Up @@ -478,7 +481,10 @@
"metadata": {},
"outputs": [],
"source": [
"assert Path(images[0]).exists()"
"assert Path(images[0]).exists()\n",
"\n",
"# Delete the image\n",
"Path(images[0]).unlink()"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions nbdev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ project:

website:
title: "trove_newspaper_images"
site-url: "https://wragge.github.io/trove_newspaper_images/"
site-url: "https://wragge.github.io/trove_newspaper_images"
description: "Tool to download Trove newspaper articles as images."
repo-branch: master
repo-url: "https://github.com/wragge/trove_newspaper_images/tree/master/"
repo-url: "https://github.com/wragge/trove_newspaper_images"
98 changes: 32 additions & 66 deletions settings.ini
Original file line number Diff line number Diff line change
@@ -1,76 +1,42 @@
[DEFAULT]
# All sections below are required unless otherwise specified
host = github
lib_name = trove_newspaper_images
# For Enterprise Git, add the variables repo_name and company_name
# repo_name = analytics
# company_name = nike
# All sections below are required unless otherwise specified.
# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples.

user = wragge
description = Tool to download Trove newspaper articles as images.
keywords = Trove
author = Tim Sherratt
author_email = [email protected]
copyright = Tim Sherratt
branch = master
version = 0.2.0
### Python library ###
repo = trove_newspaper_images
lib_name = %(repo)s
version = 0.2.1
min_python = 3.8
audience = Developers
language = English
# Set to True if you want to create a more fancy sidebar.json than the default
custom_sidebar = False
# Add licenses and see current list in `setup.py`
license = mit
# From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive
status = 4

# Optional. Same format as setuptools requirements
requirements = requests beautifulsoup4 lxml pillow
dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort
# Optional. Same format as setuptools console_scripts
console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main
# Optional. Same format as setuptools dependency-links
# dep_links =

###
# You probably won't need to change anything under here,
# unless you have some special requirements
###

# Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root
nbs_path = .
### nbdev ###
doc_path = _docs

# Whether to look for library notebooks recursively in the `nbs_path` dir
lib_path = trove_newspaper_images
nbs_path = .
recursive = False
tst_flags = notest
put_version_in_init = False

# Anything shown as '%(...)s' is substituted with that setting automatically
doc_host = https://%(user)s.github.io
#For Enterprise Git pages use:
#doc_host = https://pages.github.%(company_name)s.com.


doc_baseurl = /%(lib_name)s/
# For Enterprise Github pages docs use:
# doc_baseurl = /%(repo_name)s/%(lib_name)s/

git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/
# For Enterprise Github use:
#git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/



lib_path = %(lib_name)s
### Docs ###
branch = master
custom_sidebar = False
doc_host = https://%(user)s.github.io
doc_baseurl = /%(repo)s
git_url = https://github.com/%(user)s/%(repo)s
title = %(lib_name)s

#Optional advanced parameters
#Monospace docstrings: adds <pre> tags around the docstrings, preserving newlines/indentation.
#monospace_docstrings = False
#Test flags: introduce here the test flags you want to use separated by |
#tst_flags =
#Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
#custom_sidebar =
#Cell spacing: if you want cell blocks in code separated by more than one new line
#cell_spacing =
#Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
#jekyll_styles = note,warning,tip,important
### PyPI ###
audience = Developers
author = Tim Sherratt
author_email = [email protected]
copyright = 2022 onwards, %(author)s
description = Tool to download Trove newspaper articles as images.
keywords = nbdev jupyter notebook python
language = English
status = 3
user = wragge

### Optional ###
requirements = requests beautifulsoup4 lxml pillow
dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort
console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main
4 changes: 2 additions & 2 deletions trove_newspaper_images/_modidx.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Autogenerated by nbdev

d = { 'settings': { 'branch': 'master',
'doc_baseurl': '/trove_newspaper_images/',
'doc_baseurl': '/trove_newspaper_images',
'doc_host': 'https://wragge.github.io',
'git_url': 'https://github.com/wragge/trove_newspaper_images/tree/master/',
'git_url': 'https://github.com/wragge/trove_newspaper_images',
'lib_path': 'trove_newspaper_images'},
'syms': { 'trove_newspaper_images.articles': { 'trove_newspaper_images.articles.download_images': ( 'articles.html#download_images',
'trove_newspaper_images/articles.py'),
Expand Down
49 changes: 26 additions & 23 deletions trove_newspaper_images/articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,39 +117,42 @@ def download_images(article_id, output_dir="", size=None):
* a list of image file names
"""
images = []
if output_dir:
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True, parents=True)
else:
output_path = ""

# Get position of article on the page(s)
boxes = get_article_boxes(article_id)
for box in boxes:
cropped_file = Path(
output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg'
)
if not cropped_file.exists():
# Construct the url we need to download the full page image
page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'

# Construct the url we need to download the full page image
page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'
# Download the page image
response = requests.get(page_url)

# Download the page image
response = requests.get(page_url)
# Open download as an image for editing
img = Image.open(BytesIO(response.content))

# Open download as an image for editing
img = Image.open(BytesIO(response.content))
# Use coordinates of the bounding box to crop article
points = (box["left"], box["top"], box["right"], box["bottom"])

# Use coordinates of the bounding box to crop article
points = (box["left"], box["top"], box["right"], box["bottom"])
# Crop image to article box
cropped = img.crop(points)

# Crop image to article box
cropped = img.crop(points)
# Resize if necessary
if size:
cropped.thumbnail((size, size), Image.LANCZOS)

# Resize if necessary
if size:
cropped.thumbnail((size, size), Image.LANCZOS)
# Save cropped image


# Save cropped image
if output_dir:
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True, parents=True)
else:
output_path = ""
cropped_file = Path(
output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg'
)
cropped.save(cropped_file)
cropped.save(cropped_file)
images.append(cropped_file.name)
# print(f'Downloaded: {images}')
return images
Expand Down

0 comments on commit 88cb726

Please sign in to comment.