Don't re-download existing images

wragge · Sep 27, 2022 · 88cb726 · 88cb726
1 parent 5b16e58
commit 88cb726
Show file tree

Hide file tree

Showing 5 changed files with 92 additions and 117 deletions.
diff --git a/00_articles.ipynb b/00_articles.ipynb
@@ -165,39 +165,42 @@
     "    * a list of image file names\n",
     "    \"\"\"\n",
     "    images = []\n",
+    "    if output_dir:\n",
+    "        output_path = Path(output_dir)\n",
+    "        output_path.mkdir(exist_ok=True, parents=True)\n",
+    "    else:\n",
+    "        output_path = \"\"\n",
+    "    \n",
     "    # Get position of article on the page(s)\n",
     "    boxes = get_article_boxes(article_id)\n",
     "    for box in boxes:\n",
+    "        cropped_file = Path(\n",
+    "            output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n",
+    "        )\n",
+    "        if not cropped_file.exists():\n",
+    "            # Construct the url we need to download the full page image\n",
+    "            page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n",
     "\n",
-    "        # Construct the url we need to download the full page image\n",
-    "        page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n",
+    "            # Download the page image\n",
+    "            response = requests.get(page_url)\n",
     "\n",
-    "        # Download the page image\n",
-    "        response = requests.get(page_url)\n",
+    "            # Open download as an image for editing\n",
+    "            img = Image.open(BytesIO(response.content))\n",
     "\n",
-    "        # Open download as an image for editing\n",
-    "        img = Image.open(BytesIO(response.content))\n",
+    "            # Use coordinates of the bounding box to crop article\n",
+    "            points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n",
     "\n",
-    "        # Use coordinates of the bounding box to crop article\n",
-    "        points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n",
+    "            # Crop image to article box\n",
+    "            cropped = img.crop(points)\n",
     "\n",
-    "        # Crop image to article box\n",
-    "        cropped = img.crop(points)\n",
+    "            # Resize if necessary\n",
+    "            if size:\n",
+    "                cropped.thumbnail((size, size), Image.LANCZOS)\n",
     "\n",
-    "        # Resize if necessary\n",
-    "        if size:\n",
-    "            cropped.thumbnail((size, size), Image.LANCZOS)\n",
+    "            # Save cropped image\n",
+    "            \n",
     "\n",
-    "        # Save cropped image\n",
-    "        if output_dir:\n",
-    "            output_path = Path(output_dir)\n",
-    "            output_path.mkdir(exist_ok=True, parents=True)\n",
-    "        else:\n",
-    "            output_path = \"\"\n",
-    "        cropped_file = Path(\n",
-    "            output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n",
-    "        )\n",
-    "        cropped.save(cropped_file)\n",
+    "            cropped.save(cropped_file)\n",
     "        images.append(cropped_file.name)\n",
     "    # print(f'Downloaded: {images}')\n",
     "    return images\n",
@@ -478,7 +481,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "assert Path(images[0]).exists()"
+    "assert Path(images[0]).exists()\n",
+    "\n",
+    "# Delete the image\n",
+    "Path(images[0]).unlink()"
    ]
   },
   {

diff --git a/nbdev.yml b/nbdev.yml
@@ -3,7 +3,7 @@ project:
 
 website:
   title: "trove_newspaper_images"
-  site-url: "https://wragge.github.io/trove_newspaper_images/"
+  site-url: "https://wragge.github.io/trove_newspaper_images"
   description: "Tool to download Trove newspaper articles as images."
   repo-branch: master
-  repo-url: "https://github.com/wragge/trove_newspaper_images/tree/master/"
+  repo-url: "https://github.com/wragge/trove_newspaper_images"
diff --git a/settings.ini b/settings.ini
@@ -1,76 +1,42 @@
 [DEFAULT]
-# All sections below are required unless otherwise specified
-host = github
-lib_name = trove_newspaper_images
-# For Enterprise Git add variable repo_name and company name
-# repo_name = analytics
-# company_name = nike
+# All sections below are required unless otherwise specified.
+# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples.
 
-user = wragge
-description = Tool to download Trove newspaper articles as images.
-keywords = Trove
-author = Tim Sherratt
-author_email = [email protected]
-copyright = Tim Sherratt
-branch = master
-version = 0.2.0
+### Python library ###
+repo = trove_newspaper_images
+lib_name = %(repo)s
+version = 0.2.1
 min_python = 3.8
-audience = Developers
-language = English
-# Set to True if you want to create a more fancy sidebar.json than the default
-custom_sidebar = False
-# Add licenses and see current list in `setup.py`
 license = mit
-# From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive
-status = 4
 
-# Optional. Same format as setuptools requirements
-requirements = requests beautifulsoup4 lxml pillow
-dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort
-# Optional. Same format as setuptools console_scripts
-console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main
-# Optional. Same format as setuptools dependency-links
-# dep_links = 
-
-###
-# You probably won't need to change anything under here,
-#   unless you have some special requirements
-###
-
-# Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root
-nbs_path = .
+### nbdev ###
 doc_path = _docs
-
-# Whether to look for library notebooks recursively in the `nbs_path` dir
+lib_path = trove_newspaper_images
+nbs_path = .
 recursive = False
+tst_flags = notest
+put_version_in_init = False
 
-# Anything shown as '%(...)s' is substituted with that setting automatically
-doc_host =  https://%(user)s.github.io
-#For Enterprise Git pages use:  
-#doc_host = https://pages.github.%(company_name)s.com.  
-
-
-doc_baseurl = /%(lib_name)s/
-# For Enterprise Github pages docs use:
-# doc_baseurl = /%(repo_name)s/%(lib_name)s/
-
-git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/
-# For Enterprise Github use:
-#git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/
-
-
-
-lib_path = %(lib_name)s
+### Docs ###
+branch = master
+custom_sidebar = False
+doc_host = https://%(user)s.github.io
+doc_baseurl = /%(repo)s
+git_url = https://github.com/%(user)s/%(repo)s
 title = %(lib_name)s
 
-#Optional advanced parameters
-#Monospace docstings: adds <pre> tags around the doc strings, preserving newlines/indentation.
-#monospace_docstrings = False
-#Test flags: introduce here the test flags you want to use separated by |
-#tst_flags = 
-#Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
-#custom_sidebar = 
-#Cell spacing: if you want cell blocks in code separated by more than one new line
-#cell_spacing = 
-#Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
-#jekyll_styles = note,warning,tip,important
+### PyPI ###
+audience = Developers
+author = Tim Sherratt
+author_email = [email protected]
+copyright = 2022 onwards, %(author)s
+description = Tool to download Trove newspaper articles as images.
+keywords = nbdev jupyter notebook python
+language = English
+status = 3
+user = wragge
+
+### Optional ###
+requirements = requests beautifulsoup4 lxml pillow
+dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort
+console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main
diff --git a/trove_newspaper_images/_modidx.py b/trove_newspaper_images/_modidx.py
@@ -1,9 +1,9 @@
 # Autogenerated by nbdev
 
 d = { 'settings': { 'branch': 'master',
-                'doc_baseurl': '/trove_newspaper_images/',
+                'doc_baseurl': '/trove_newspaper_images',
                 'doc_host': 'https://wragge.github.io',
-                'git_url': 'https://github.com/wragge/trove_newspaper_images/tree/master/',
+                'git_url': 'https://github.com/wragge/trove_newspaper_images',
                 'lib_path': 'trove_newspaper_images'},
   'syms': { 'trove_newspaper_images.articles': { 'trove_newspaper_images.articles.download_images': ( 'articles.html#download_images',
                                                                                                       'trove_newspaper_images/articles.py'),

diff --git a/trove_newspaper_images/articles.py b/trove_newspaper_images/articles.py
@@ -117,39 +117,42 @@ def download_images(article_id, output_dir="", size=None):
     * a list of image file names
     """
     images = []
+    if output_dir:
+        output_path = Path(output_dir)
+        output_path.mkdir(exist_ok=True, parents=True)
+    else:
+        output_path = ""
+
     # Get position of article on the page(s)
     boxes = get_article_boxes(article_id)
     for box in boxes:
+        cropped_file = Path(
+            output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg'
+        )
+        if not cropped_file.exists():
+            # Construct the url we need to download the full page image
+            page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'
 
-        # Construct the url we need to download the full page image
-        page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'
+            # Download the page image
+            response = requests.get(page_url)
 
-        # Download the page image
-        response = requests.get(page_url)
+            # Open download as an image for editing
+            img = Image.open(BytesIO(response.content))
 
-        # Open download as an image for editing
-        img = Image.open(BytesIO(response.content))
+            # Use coordinates of the bounding box to crop article
+            points = (box["left"], box["top"], box["right"], box["bottom"])
 
-        # Use coordinates of the bounding box to crop article
-        points = (box["left"], box["top"], box["right"], box["bottom"])
+            # Crop image to article box
+            cropped = img.crop(points)
 
-        # Crop image to article box
-        cropped = img.crop(points)
+            # Resize if necessary
+            if size:
+                cropped.thumbnail((size, size), Image.LANCZOS)
 
-        # Resize if necessary
-        if size:
-            cropped.thumbnail((size, size), Image.LANCZOS)
+            # Save cropped image
+
 
-        # Save cropped image
-        if output_dir:
-            output_path = Path(output_dir)
-            output_path.mkdir(exist_ok=True, parents=True)
-        else:
-            output_path = ""
-        cropped_file = Path(
-            output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg'
-        )
-        cropped.save(cropped_file)
+            cropped.save(cropped_file)
         images.append(cropped_file.name)
     # print(f'Downloaded: {images}')
     return images