Skip to content

Commit

Permalink
add masking
Browse files Browse the repository at this point in the history
  • Loading branch information
wragge committed Apr 16, 2024
1 parent 88cb726 commit 7301c36
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 43 deletions.
150 changes: 130 additions & 20 deletions 00_articles.ipynb

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
trove-newspaper-images
================
# trove-newspaper-images


<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

Expand Down Expand Up @@ -54,6 +54,13 @@ specify a directory for the downloaded images. For example:
trove_newspaper_images.download 107024751 --output_dir images
```

Add the `--masked` flag to try to remove content from neighbouring
articles.

``` shell
trove_newspaper_images.download 107024751 --masked
```

------------------------------------------------------------------------

Created by [Tim Sherratt](https://timsherratt.org)
Expand Down
8 changes: 7 additions & 1 deletion index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@
"\n",
"```shell\n",
"trove_newspaper_images.download 107024751 --output_dir images\n",
"```\n",
"\n",
"Add the `--masked` parameter to try and remove content from neighbouring articles.\n",
"\n",
"```shell\n",
"trove_newspaper_images.download 107024751 --masked\n",
"```"
]
},
Expand All @@ -95,7 +101,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "python3",
"language": "python",
"name": "python3"
}
Expand Down
4 changes: 2 additions & 2 deletions settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
### Python library ###
repo = trove_newspaper_images
lib_name = %(repo)s
version = 0.2.1
version = 0.3.1
min_python = 3.8
license = mit

Expand All @@ -15,7 +15,7 @@ lib_path = trove_newspaper_images
nbs_path = .
recursive = False
tst_flags = notest
put_version_in_init = False
put_version_in_init = True

### Docs ###
branch = master
Expand Down
2 changes: 1 addition & 1 deletion trove_newspaper_images/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.1"
2 changes: 1 addition & 1 deletion trove_newspaper_images/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
'trove_newspaper_images.articles.get_box': ( 'articles.html#get_box',
'trove_newspaper_images/articles.py'),
'trove_newspaper_images.articles.main': ( 'articles.html#main',
'trove_newspaper_images/articles.py')}}}
'trove_newspaper_images/articles.py')}}}
68 changes: 52 additions & 16 deletions trove_newspaper_images/articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,30 @@ def get_box(zones):
page_id = zones[0]["data-page-id"]

# Loop through zones to find the outer boundaries of the article
z_boxes = []
for zone in zones:
if int(zone["data-y"]) < top:
top = int(zone["data-y"])
if int(zone["data-x"]) < left:
left = int(zone["data-x"])
if (int(zone["data-x"]) + int(zone["data-w"])) > right:
right = int(zone["data-x"]) + int(zone["data-w"])
if (int(zone["data-y"]) + int(zone["data-h"])) > bottom:
bottom = int(zone["data-y"]) + int(zone["data-h"])
z_left = int(zone["data-x"])
z_top = int(zone["data-y"])
z_right = int(zone["data-x"]) + int(zone["data-w"])
z_bottom = int(zone["data-y"]) + int(zone["data-h"])
z_boxes.append(
{"left": z_left, "top": z_top, "right": z_right, "bottom": z_bottom}
)
if z_top < top:
top = z_top
if z_left < left:
left = z_left
if z_right > right:
right = z_right
if z_bottom > bottom:
bottom = z_bottom
return {
"page_id": page_id,
"left": left,
"top": top,
"right": right,
"bottom": bottom,
"zones": z_boxes,
}


Expand Down Expand Up @@ -102,7 +111,7 @@ def get_article_boxes(article_id):
return boxes


def download_images(article_id, output_dir="", size=None):
def download_images(article_id, output_dir="", size=None, masked=False):
"""
Extract an image of a newspaper article from the page image(s), download and save it, and return the image filename(s).
Expand All @@ -111,6 +120,7 @@ def download_images(article_id, output_dir="", size=None):
* article_id -- identifier for a Trove newspaper article
* output_dir -- a directory to save images in (will be created if it doesn't exist)
* size -- maximum dimensions of image
* masked -- `True` or `False`, remove content that isn't part of the article
Returns:
Expand All @@ -122,7 +132,7 @@ def download_images(article_id, output_dir="", size=None):
output_path.mkdir(exist_ok=True, parents=True)
else:
output_path = ""

# Get position of article on the page(s)
boxes = get_article_boxes(article_id)
for box in boxes:
Expand All @@ -134,24 +144,49 @@ def download_images(article_id, output_dir="", size=None):
page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'

# Download the page image
response = requests.get(page_url)
response = requests.get(page_url, stream=True)

# Open download as an image for editing
img = Image.open(BytesIO(response.content))

# Use coordinates of the bounding box to crop article
points = (box["left"], box["top"], box["right"], box["bottom"])

# Crop image to article box
cropped = img.crop(points)
if masked:

# Create a new empty image the same size as the original
new_img = Image.new("RGB", img.size, "#fdfdfd")

# Process each zone separately
for zone in box["zones"]:

# Get zone coords
z_points = (
zone["left"],
zone["top"],
zone["right"],
zone["bottom"],
)

# Crop the zone from the original image
zone_crop = img.crop(z_points)

# Paste the zone into the new image
new_img.paste(zone_crop, z_points)

# Crop the new image to the article box
cropped = new_img.crop(points)

else:

# Crop image to article box
cropped = img.crop(points)

# Resize if necessary
if size:
cropped.thumbnail((size, size), Image.LANCZOS)

# Save cropped image


cropped.save(cropped_file)
images.append(cropped_file.name)
# print(f'Downloaded: {images}')
Expand All @@ -166,5 +201,6 @@ def main():
parser.add_argument("article_id", help="article identifier")
parser.add_argument("--output_dir", help="directory to save images")
parser.add_argument("--size", help="maximum image dimensions")
parser.add_argument('--masked', action="store_true", help="mask image")
args = parser.parse_args()
download_images(args.article_id, args.output_dir, args.size)
download_images(args.article_id, args.output_dir, args.size, args.masked)

0 comments on commit 7301c36

Please sign in to comment.