Skip to content

Commit

Permalink
robuster handling of invalid reverse input lines (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtmail authored Sep 11, 2024
1 parent b8399a9 commit 0be0b48
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 18 deletions.
44 changes: 35 additions & 9 deletions opencage/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import backoff
import certifi
import random
import re

from tqdm import tqdm
from urllib.parse import urlencode
from contextlib import suppress
from opencage.geocoder import OpenCageGeocode, OpenCageGeocodeError
from opencage.geocoder import OpenCageGeocode, OpenCageGeocodeError, _query_for_reverse_geocoding

class OpenCageBatchGeocoder():
def __init__(self, options):
Expand Down Expand Up @@ -38,9 +39,11 @@ async def geocode(self, input, output):

queue = asyncio.Queue(maxsize=self.options.limit)

await self.read_input(input, queue)
read_warnings = await self.read_input(input, queue)

if self.options.dry_run:
if not read_warnings:
print('All good.')
return

if self.options.headers:
Expand Down Expand Up @@ -78,19 +81,28 @@ async def test_request(self):
return { 'error': exc }

async def read_input(self, input, queue):
any_warnings = False
for index, row in enumerate(input):
line_number = index + 1

if len(row) == 0:
raise Exception(f"Empty line in input file at line number {line_number}, aborting")
self.log(f"Line {line_number} - Empty line")
any_warnings = True
row = ['']

item = await self.read_one_line(row, line_number)
if item['warnings'] is True:
any_warnings = True
await queue.put(item)

if queue.full():
break

return any_warnings

async def read_one_line(self, row, row_id):
warnings = False

if self.options.command == 'reverse':
input_columns = [1, 2]
elif self.options.input_columns:
Expand All @@ -105,14 +117,26 @@ async def read_one_line(self, row, row_id):
# input_columns option uses 1-based indexing
address.append(row[column - 1])
except IndexError:
self.log(f"Missing input column {column} in {row}")
self.log(f"Line {row_id} - Missing input column {column} in {row}")
warnings = True
else:
address = row

if self.options.command == 'reverse' and len(address) != 2:
self.log(f"Expected two comma-separated values for reverse geocoding, got {address}")
if self.options.command == 'reverse':

return { 'row_id': row_id, 'address': ','.join(address), 'original_columns': row }
if len(address) != 2:
self.log(f"Line {row_id} - Expected two comma-separated values for reverse geocoding, got {address}")
else:
# _query_for_reverse_geocoding attempts to convert into numbers. We rather have it fail
# now than during the actual geocoding
try:
_query_for_reverse_geocoding(address[0], address[1])
except:
self.log(f"Line {row_id} - Does not look like latitude and longitude: '{address[0]}' and '{address[1]}'")
warnings = True
address = []

return { 'row_id': row_id, 'address': ','.join(address), 'original_columns': row, 'warnings': warnings }

async def worker(self, output, queue, progress):
while True:
Expand Down Expand Up @@ -147,8 +171,9 @@ async def _geocode_one_address():

try:
if self.options.command == 'reverse':
lon, lat = address.split(',')
geocoding_results = await geocoder.reverse_geocode_async(lon, lat, **params)
if ',' in address:
lon, lat = address.split(',')
geocoding_results = await geocoder.reverse_geocode_async(lon, lat, **params)
else:
geocoding_results = await geocoder.geocode_async(address, **params)
except OpenCageGeocodeError as exc:
Expand Down Expand Up @@ -205,6 +230,7 @@ async def write_one_geocoding_result(self, output, row_id, address, geocoding_re
output.writerow(row)
self.write_counter = self.write_counter + 1


def log(self, message):
if not self.options.quiet:
sys.stderr.write(f"{message}\n")
Expand Down
4 changes: 4 additions & 0 deletions opencage/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ def parse_args(args):
print(f"Error: The output file '{options.output}' already exists. You can add --overwrite to your command.", file=sys.stderr)
sys.exit(1)

if 0 in options.input_columns:
print(f"Error: A column 0 in --input-columns does not exist. The lowest possible number is 1.", file=sys.stderr)
sys.exit(1)

return options


Expand Down
1 change: 1 addition & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
[pytest]
pythonpath = .
asyncio_default_fixture_loop_scope = session
21 changes: 18 additions & 3 deletions test/cli/test_cli_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def test_invalid_command(capfd):
capfd
)

def test_invalid_command(capfd):
def test_version_number(capfd):
with pytest.raises(SystemExit):
parse_args(['--version'])
out, err = capfd.readouterr()
out, _ = capfd.readouterr()

assert __version__ in out

Expand Down Expand Up @@ -79,6 +79,20 @@ def test_argument_range(capfd):
capfd
)

def test_zero_based_list(capfd):
assert_parse_args_error(
[
"forward",
"--api-key", "oc_gc_12345678901234567890123456789012",
"--input", "test/fixtures/input.txt",
"--output", "test/fixtures/output.csv",
"--input-columns", "0,1,2"
],
'The lowest possible number is 1',
capfd
)


def test_full_argument_list():
args = parse_args([
"reverse",
Expand Down Expand Up @@ -130,7 +144,8 @@ def test_defaults():
assert args.limit == 0
assert args.headers is False
assert args.input_columns == [1]
assert args.add_columns == ["lat", "lng", "_type", "_category", "country_code", "country", "state", "county", "_normalized_city", "postcode", "road", "house_number", "confidence", "formatted"]
assert args.add_columns == ["lat", "lng", "_type", "_category", "country_code", "country", "state",
"county", "_normalized_city", "postcode", "road", "house_number", "confidence", "formatted"]
assert args.workers == 1
assert args.timeout == 10
assert args.retries == 10
Expand Down
32 changes: 27 additions & 5 deletions test/cli/test_cli_run.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pathlib
import pytest
import os
import sys
import pytest

from opencage.command_line import main

Expand Down Expand Up @@ -85,14 +84,32 @@ def test_input_errors(capfd):
"--api-key", TEST_APIKEY_200,
"--input", "test/fixtures/cli/reverse_with_errors.csv",
"--output", "test/fixtures/cli/output.csv",
"--add-columns", "country_code,postcode",
"--no-progress"
])

_, err = capfd.readouterr()
assert 'Missing input column 2 in' in err
assert 'Expected two comma-separated values' in err
# assert err == ''
assert err.count("\n") == 6
assert "Line 1 - Missing input column 2 in ['50.101010']" in err
assert "Line 1 - Expected two comma-separated values for reverse geocoding, got ['50.101010']" in err
assert "Line 3 - Empty line" in err
assert "Line 3 - Missing input column 2 in ['']" in err
assert "Line 3 - Expected two comma-separated values for reverse geocoding, got ['']" in err
assert "Line 4 - Does not look like latitude and longitude: 'a' and 'b'" in err

def test_empty_result(capfd):
assert_output(
path="test/fixtures/cli/output.csv",
length=4,
lines=[
'50.101010,,',
'-100,60.1,de,48153',
',,',
'a,b,,'
]
)

def test_empty_result():
# 'NOWHERE-INTERESTING' is guaranteed to return no result
# https://opencagedata.com/api#testingkeys
main([
Expand Down Expand Up @@ -138,6 +155,11 @@ def test_dryrun(capfd):

assert not os.path.isfile("test/fixtures/cli/output.csv")

out, _ = capfd.readouterr()
assert out.count("\n") == 1
assert "All good." in out


def test_invalid_domain(capfd):
main([
"forward",
Expand Down
5 changes: 4 additions & 1 deletion test/fixtures/cli/reverse_with_errors.csv
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
50.101010
50.101010
-100,60.1

a,b

0 comments on commit 0be0b48

Please sign in to comment.