diff --git a/00_articles.ipynb b/00_articles.ipynb index fd1cec8..ca38c0f 100644 --- a/00_articles.ipynb +++ b/00_articles.ipynb @@ -165,39 +165,42 @@ " * a list of image file names\n", " \"\"\"\n", " images = []\n", + " if output_dir:\n", + " output_path = Path(output_dir)\n", + " output_path.mkdir(exist_ok=True, parents=True)\n", + " else:\n", + " output_path = \"\"\n", + " \n", " # Get position of article on the page(s)\n", " boxes = get_article_boxes(article_id)\n", " for box in boxes:\n", + " cropped_file = Path(\n", + " output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n", + " )\n", + " if not cropped_file.exists():\n", + " # Construct the url we need to download the full page image\n", + " page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n", "\n", - " # Construct the url we need to download the full page image\n", - " page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box[\"page_id\"]}/level7'\n", + " # Download the page image\n", + " response = requests.get(page_url)\n", "\n", - " # Download the page image\n", - " response = requests.get(page_url)\n", + " # Open download as an image for editing\n", + " img = Image.open(BytesIO(response.content))\n", "\n", - " # Open download as an image for editing\n", - " img = Image.open(BytesIO(response.content))\n", + " # Use coordinates of the bounding box to crop article\n", + " points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n", "\n", - " # Use coordinates of the bounding box to crop article\n", - " points = (box[\"left\"], box[\"top\"], box[\"right\"], box[\"bottom\"])\n", + " # Crop image to article box\n", + " cropped = img.crop(points)\n", "\n", - " # Crop image to article box\n", - " cropped = img.crop(points)\n", + " # Resize if necessary\n", + " if size:\n", + " cropped.thumbnail((size, size), Image.LANCZOS)\n", "\n", - " # Resize if necessary\n", - " if size:\n", - " cropped.thumbnail((size, size), Image.LANCZOS)\n", + " # Save cropped image\n", + " \n", "\n", - " # Save cropped image\n", - " if output_dir:\n", - " output_path = Path(output_dir)\n", - " output_path.mkdir(exist_ok=True, parents=True)\n", - " else:\n", - " output_path = \"\"\n", - " cropped_file = Path(\n", - " output_path, f'nla.news-article{article_id}-{box[\"page_id\"]}.jpg'\n", - " )\n", - " cropped.save(cropped_file)\n", + " cropped.save(cropped_file)\n", " images.append(cropped_file.name)\n", " # print(f'Downloaded: {images}')\n", " return images\n", @@ -478,7 +481,10 @@ "metadata": {}, "outputs": [], "source": [ - "assert Path(images[0]).exists()" + "assert Path(images[0]).exists()\n", + "\n", + "# Delete the image\n", + "Path(images[0]).unlink()" ] }, { diff --git a/nbdev.yml b/nbdev.yml index 32aade2..abbb14b 100644 --- a/nbdev.yml +++ b/nbdev.yml @@ -3,7 +3,7 @@ project: website: title: "trove_newspaper_images" - site-url: "https://wragge.github.io/trove_newspaper_images/" + site-url: "https://wragge.github.io/trove_newspaper_images" description: "Tool to download Trove newspaper articles as images." repo-branch: master - repo-url: "https://github.com/wragge/trove_newspaper_images/tree/master/" + repo-url: "https://github.com/wragge/trove_newspaper_images" diff --git a/settings.ini b/settings.ini index c6f5253..8b0306b 100644 --- a/settings.ini +++ b/settings.ini @@ -1,76 +1,42 @@ [DEFAULT] -# All sections below are required unless otherwise specified -host = github -lib_name = trove_newspaper_images -# For Enterprise Git add variable repo_name and company name -# repo_name = analytics -# company_name = nike +# All sections below are required unless otherwise specified. +# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. -user = wragge -description = Tool to download Trove newspaper articles as images. -keywords = Trove -author = Tim Sherratt -author_email = tim@timsherratt.org -copyright = Tim Sherratt -branch = master -version = 0.2.0 +### Python library ### +repo = trove_newspaper_images +lib_name = %(repo)s +version = 0.2.1 min_python = 3.8 -audience = Developers -language = English -# Set to True if you want to create a more fancy sidebar.json than the default -custom_sidebar = False -# Add licenses and see current list in `setup.py` license = mit -# From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive -status = 4 -# Optional. Same format as setuptools requirements -requirements = requests beautifulsoup4 lxml pillow -dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort -# Optional. Same format as setuptools console_scripts -console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main -# Optional. Same format as setuptools dependency-links -# dep_links = - -### -# You probably won't need to change anything under here, -# unless you have some special requirements -### - -# Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root -nbs_path = . +### nbdev ### doc_path = _docs - -# Whether to look for library notebooks recursively in the `nbs_path` dir +lib_path = trove_newspaper_images +nbs_path = . recursive = False +tst_flags = notest +put_version_in_init = False -# Anything shown as '%(...)s' is substituted with that setting automatically -doc_host = https://%(user)s.github.io -#For Enterprise Git pages use: -#doc_host = https://pages.github.%(company_name)s.com. - - -doc_baseurl = /%(lib_name)s/ -# For Enterprise Github pages docs use: -# doc_baseurl = /%(repo_name)s/%(lib_name)s/ - -git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/ -# For Enterprise Github use: -#git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/ - - - -lib_path = %(lib_name)s +### Docs ### +branch = master +custom_sidebar = False +doc_host = https://%(user)s.github.io +doc_baseurl = /%(repo)s +git_url = https://github.com/%(user)s/%(repo)s title = %(lib_name)s -#Optional advanced parameters -#Monospace docstings: adds
 tags around the doc strings, preserving newlines/indentation.
-#monospace_docstrings = False
-#Test flags: introduce here the test flags you want to use separated by |
-#tst_flags = 
-#Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
-#custom_sidebar = 
-#Cell spacing: if you want cell blocks in code separated by more than one new line
-#cell_spacing = 
-#Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
-#jekyll_styles = note,warning,tip,important
+### PyPI ###
+audience = Developers
+author = Tim Sherratt
+author_email = tim@timsherratt.org
+copyright = 2022 onwards, %(author)s
+description = Tool to download Trove newspaper articles as images.
+keywords = nbdev jupyter notebook python
+language = English
+status = 3
+user = wragge
+
+### Optional ###
+requirements = requests beautifulsoup4 lxml pillow
+dev_requirements = jupyterlab nbdev jupyterlab-code-formatter black isort
+console_scripts = trove_newspaper_images.download=trove_newspaper_images.articles:main
\ No newline at end of file
diff --git a/trove_newspaper_images/_modidx.py b/trove_newspaper_images/_modidx.py
index 10b3144..6fe423f 100644
--- a/trove_newspaper_images/_modidx.py
+++ b/trove_newspaper_images/_modidx.py
@@ -1,9 +1,9 @@
 # Autogenerated by nbdev
 
 d = { 'settings': { 'branch': 'master',
-                'doc_baseurl': '/trove_newspaper_images/',
+                'doc_baseurl': '/trove_newspaper_images',
                 'doc_host': 'https://wragge.github.io',
-                'git_url': 'https://github.com/wragge/trove_newspaper_images/tree/master/',
+                'git_url': 'https://github.com/wragge/trove_newspaper_images',
                 'lib_path': 'trove_newspaper_images'},
   'syms': { 'trove_newspaper_images.articles': { 'trove_newspaper_images.articles.download_images': ( 'articles.html#download_images',
                                                                                                       'trove_newspaper_images/articles.py'),
diff --git a/trove_newspaper_images/articles.py b/trove_newspaper_images/articles.py
index 3c8ad8d..ca2f836 100644
--- a/trove_newspaper_images/articles.py
+++ b/trove_newspaper_images/articles.py
@@ -117,39 +117,42 @@ def download_images(article_id, output_dir="", size=None):
     * a list of image file names
     """
     images = []
+    if output_dir:
+        output_path = Path(output_dir)
+        output_path.mkdir(exist_ok=True, parents=True)
+    else:
+        output_path = ""
+    
     # Get position of article on the page(s)
     boxes = get_article_boxes(article_id)
     for box in boxes:
+        cropped_file = Path(
+            output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg'
+        )
+        if not cropped_file.exists():
+            # Construct the url we need to download the full page image
+            page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'
 
-        # Construct the url we need to download the full page image
-        page_url = f'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{box["page_id"]}/level7'
+            # Download the page image
+            response = requests.get(page_url)
 
-        # Download the page image
-        response = requests.get(page_url)
+            # Open download as an image for editing
+            img = Image.open(BytesIO(response.content))
 
-        # Open download as an image for editing
-        img = Image.open(BytesIO(response.content))
+            # Use coordinates of the bounding box to crop article
+            points = (box["left"], box["top"], box["right"], box["bottom"])
 
-        # Use coordinates of the bounding box to crop article
-        points = (box["left"], box["top"], box["right"], box["bottom"])
+            # Crop image to article box
+            cropped = img.crop(points)
 
-        # Crop image to article box
-        cropped = img.crop(points)
+            # Resize if necessary
+            if size:
+                cropped.thumbnail((size, size), Image.LANCZOS)
 
-        # Resize if necessary
-        if size:
-            cropped.thumbnail((size, size), Image.LANCZOS)
+            # Save cropped image
+            
 
-        # Save cropped image
-        if output_dir:
-            output_path = Path(output_dir)
-            output_path.mkdir(exist_ok=True, parents=True)
-        else:
-            output_path = ""
-        cropped_file = Path(
-            output_path, f'nla.news-article{article_id}-{box["page_id"]}.jpg'
-        )
-        cropped.save(cropped_file)
+            cropped.save(cropped_file)
         images.append(cropped_file.name)
     # print(f'Downloaded: {images}')
     return images