diff --git a/00_articles.ipynb b/00_articles.ipynb index fd1cec8..ca38c0f 100644 --- a/00_articles.ipynb +++ b/00_articles.ipynb @@ -165,39 +165,42 @@ " * a list of image file names\n", " \"\"\"\n", " images = []\n", + " if output_dir:\n", + " output_path = Path(output_dir)\n", + " output_path.mkdir(exist_ok=True, parents=True)\n", + " else:\n", + " output_path = \"\"\n", + " \n", " # Get position of article on the page(s)\n", " boxes = get_article_boxes(article_id)\n", " for box in boxes:\n", + " cropped_file = Path(\n", + " output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n", + " )\n", + " if not cropped_file.exists():\n", + " # Construct the url we need to download the full page image\n", + " page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n", "\n", - " # Construct the url we need to download the full page image\n", - " page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n", + " # Download the page image\n", + " response = requests.get(page_url)\n", "\n", - " # Download the page image\n", - " response = requests.get(page_url)\n", + " # Open download as an image for editing\n", + " img = Image.open(BytesIO(response.content))\n", "\n", - " # Open download as an image for editing\n", - " img = Image.open(BytesIO(response.content))\n", + " # Use coordinates of the bounding box to crop article\n", + " points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n", "\n", - " # Use coordinates of the bounding box to crop article\n", - " points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n", + " # Crop image to article box\n", + " cropped = img.crop(points)\n", "\n", - " # Crop image to article box\n", - " cropped = img.crop(points)\n", + " # Resize if necessary\n", + " if size:\n", + " cropped.thumbnail((size, size), Image.LANCZOS)\n", "\n", - " # Resize if necessary\n", - " if size:\n", - " cropped.thumbnail((size, size), Image.LANCZOS)\n", + " # Save cropped image\n", + " \n", "\n", - " # Save cropped image\n", - " if output_dir:\n", - " output_path = Path(output_dir)\n", - " output_path.mkdir(exist_ok=True, parents=True)\n", - " else:\n", - " output_path = \"\"\n", - " cropped_file = Path(\n", - " output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n", - " )\n", - " cropped.save(cropped_file)\n", + " cropped.save(cropped_file)\n", " images.append(cropped_file.name)\n", " # print(f'Downloaded: {images}')\n", " return images\n", @@ -478,7 +481,10 @@ "metadata": {}, "outputs": [], "source": [ - "assert Path(images[0]).exists()" + "assert Path(images[0]).exists()\n", + "\n", + "# Delete the image\n", + "Path(images[0]).unlink()" ] }, { diff --git a/nbdev.yml b/nbdev.yml index 32aade2..abbb14b 100644 --- a/nbdev.yml +++ b/nbdev.yml @@ -3,7 +3,7 @@ project: website: title: "trove_newspaper_images" - site-url: "https://wragge.github.io/trove_newspaper_images/" + site-url: "https://wragge.github.io/trove_newspaper_images" description: "Tool to download Trove newspaper articles as images." repo-branch: master - repo-url: "https://github.com/wragge/trove_newspaper_images/tree/master/" + repo-url: "https://github.com/wragge/trove_newspaper_images" diff --git a/settings.ini b/settings.ini index c6f5253..8b0306b 100644 --- a/settings.ini +++ b/settings.ini @@ -1,76 +1,42 @@ [DEFAULT] -# All sections below are required unless otherwise specified -host = github -lib_name = trove_newspaper_images -# For Enterprise Git add variable repo_name and company name -# repo_name = analytics -# company_name = nike +# All sections below are required unless otherwise specified. +# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. -user = wragge -description = Tool to download Trove newspaper articles as images. -keywords = Trove -author = Tim Sherratt -author_email = tim@timsherratt.org -copyright = Tim Sherratt -branch = master -version = 0.2.0 +### Python library ### +repo = trove_newspaper_images +lib_name = %(repo)s +version = 0.2.1 min_python = 3.8 -audience = Developers -language = English -# Set to True if you want to create a more fancy sidebar.json than the default -custom_sidebar = False -# Add licenses and see current list in `setup.py` license = mit -# From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive -status = 4 -# Optional. Same format as setuptools requirements -requirements = requests beautifulsoup4 lxml pillow -dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort -# Optional. Same format as setuptools console_scripts -console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main -# Optional. Same format as setuptools dependency-links -# dep_links = - -### -# You probably won't need to change anything under here, -# unless you have some special requirements -### - -# Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root -nbs_path = . +### nbdev ### doc_path = _docs - -# Whether to look for library notebooks recursively in the `nbs_path` dir +lib_path = trove_newspaper_images +nbs_path = . recursive = False +tst_flags = notest +put_version_in_init = False -# Anything shown as '%(...)s' is substituted with that setting automatically -doc_host = https://%(user)s.github.io -#For Enterprise Git pages use: -#doc_host = https://pages.github.%(company_name)s.com. - - -doc_baseurl = /%(lib_name)s/ -# For Enterprise Github pages docs use: -# doc_baseurl = /%(repo_name)s/%(lib_name)s/ - -git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/ -# For Enterprise Github use: -#git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/ - - - -lib_path = %(lib_name)s +### Docs ### +branch = master +custom_sidebar = False +doc_host = https://%(user)s.github.io +doc_baseurl = /%(repo)s +git_url = https://github.com/%(user)s/%(repo)s title = %(lib_name)s -#Optional advanced parameters -#Monospace docstings: adds
tags around the doc strings, preserving newlines/indentation. -#monospace_docstrings = False -#Test flags: introduce here the test flags you want to use separated by | -#tst_flags = -#Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True) -#custom_sidebar = -#Cell spacing: if you want cell blocks in code separated by more than one new line -#cell_spacing = -#Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here -#jekyll_styles = note,warning,tip,important +### PyPI ### +audience = Developers +author = Tim Sherratt +author_email = tim@timsherratt.org +copyright = 2022 onwards, %(author)s +description = Tool to download Trove newspaper articles as images. +keywords = nbdev jupyter notebook python +language = English +status = 3 +user = wragge + +### Optional ### +requirements = requests beautifulsoup4 lxml pillow +dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort +console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main \ No newline at end of file diff --git a/trove_newspaper_images/_modidx.py b/trove_newspaper_images/_modidx.py index 10b3144..6fe423f 100644 --- a/trove_newspaper_images/_modidx.py +++ b/trove_newspaper_images/_modidx.py @@ -1,9 +1,9 @@ # Autogenerated by nbdev d = { 'settings': { 'branch': 'master', - 'doc_baseurl': '/trove_newspaper_images/', + 'doc_baseurl': '/trove_newspaper_images', 'doc_host': 'https://wragge.github.io', - 'git_url': 'https://github.com/wragge/trove_newspaper_images/tree/master/', + 'git_url': 'https://github.com/wragge/trove_newspaper_images', 'lib_path': 'trove_newspaper_images'}, 'syms': { 'trove_newspaper_images.articles': { 'trove_newspaper_images.articles.download_images': ( 'articles.html#download_images', 'trove_newspaper_images/articles.py'), diff --git a/trove_newspaper_images/articles.py b/trove_newspaper_images/articles.py index 3c8ad8d..ca2f836 100644 --- a/trove_newspaper_images/articles.py +++ b/trove_newspaper_images/articles.py @@ -117,39 +117,42 @@ def download_images(article_id, output_dir="", size=None): * a list of image file names """ images = [] + if output_dir: + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True, parents=True) + else: + output_path = "" + # Get position of article on the page(s) boxes = get_article_boxes(article_id) for box in boxes: + cropped_file = Path( + output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg' + ) + if not cropped_file.exists(): + # Construct the url we need to download the full page image + page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7' - # Construct the url we need to download the full page image - page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7' + # Download the page image + response = requests.get(page_url) - # Download the page image - response = requests.get(page_url) + # Open download as an image for editing + img = Image.open(BytesIO(response.content)) - # Open download as an image for editing - img = Image.open(BytesIO(response.content)) + # Use coordinates of the bounding box to crop article + points = (box["left"], box["top"], box["right"], box["bottom"]) - # Use coordinates of the bounding box to crop article - points = (box["left"], box["top"], box["right"], box["bottom"]) + # Crop image to article box + cropped = img.crop(points) - # Crop image to article box - cropped = img.crop(points) + # Resize if necessary + if size: + cropped.thumbnail((size, size), Image.LANCZOS) - # Resize if necessary - if size: - cropped.thumbnail((size, size), Image.LANCZOS) + # Save cropped image + - # Save cropped image - if output_dir: - output_path = Path(output_dir) - output_path.mkdir(exist_ok=True, parents=True) - else: - output_path = "" - cropped_file = Path( - output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg' - ) - cropped.save(cropped_file) + cropped.save(cropped_file) images.append(cropped_file.name) # print(f'Downloaded: {images}') return images