Skip to content

Commit

Permalink
issue #3: update all new scrapers to use 'name_to_id' explicitly + a…
Browse files Browse the repository at this point in the history
…dd id validation to jsonschema
  • Loading branch information
defgsus committed Dec 30, 2021
1 parent 4705ff5 commit 022fa80
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 34 deletions.
4 changes: 2 additions & 2 deletions web/scrapers/builtin/bahn.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def get_lot_data(self) -> List[LotData]:

lots.append(
LotData(
id="db-%s" % space["id"],
id=name_to_id("db", space["id"]),
timestamp=now,
lot_timestamp=lot_timestamp,
status=status,
Expand Down Expand Up @@ -103,7 +103,7 @@ def get_lot_infos(self) -> List[LotInfo]:

lots.append(
LotInfo(
id="db-%s" % space["id"],
id=name_to_id("db", space["id"]),
name=space["name"],
# either street or auto-mapping
type=LotInfo.Types.street if space["spaceType"] == "Straße" else space["spaceType"],
Expand Down
4 changes: 2 additions & 2 deletions web/scrapers/builtin/bielefeld.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def get_lot_data(self) -> List[LotData]:
lots.append(
LotData(
timestamp=now,
id=f"bielefeld-{lot_id}",
id=name_to_id("bielefeld", lot_id),
status=status,
num_free=num_free,
capacity=capacity,
Expand Down Expand Up @@ -94,7 +94,7 @@ def get_lot_infos(self) -> List[LotInfo]:

lots.append(
LotInfo(
id=f"bielefeld-{lot_id}",
id=name_to_id("bielefeld", lot_id),
name=name,
type=guess_lot_type(name) or LotInfo.Types.unknown,
public_url=self.POOL.public_url,
Expand Down
4 changes: 2 additions & 2 deletions web/scrapers/builtin/bochum.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_lot_data(self) -> List[LotData]:

lots = []
for lot_elem in soup.find_all("article", class_="lot"):
lot_id = "bochum-" + lot_elem["data-uid"]
lot_id = name_to_id("bochum", lot_elem["data-uid"])

num_free = None
status = LotData.Status.unknown
Expand Down Expand Up @@ -58,7 +58,7 @@ def get_lot_infos(self) -> List[LotInfo]:

lots = []
for lot_elem in soup.find_all("article", class_="lot"):
lot_id = "bochum-" + lot_elem["data-uid"]
lot_id = name_to_id("bochum", lot_elem["data-uid"])

lot_name = lot_elem.find("h3").text.strip()
lat, lng = lot_elem["data-lat"], lot_elem["data-lng"]
Expand Down
4 changes: 2 additions & 2 deletions web/scrapers/builtin/braunschweig.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def get_lot_data(self) -> List[LotData]:

lots.append(
LotData(
id=props["name"],
id=name_to_id("braunschweig", props["name"]),
timestamp=self.timestamp,
lot_timestamp=self.to_utc_datetime(props["timestamp"]) if props.get("timestamp") else None,
status=status,
Expand All @@ -59,7 +59,7 @@ def get_lot_infos(self) -> List[LotInfo]:

lots.append(
LotInfo(
id=props["name"],
id=name_to_id("braunschweig", props["name"]),
name=props["name"],
capacity=props.get("capacity"),
address=get_soup_text(address),
Expand Down
4 changes: 2 additions & 2 deletions web/scrapers/builtin/jena.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_lot_data(self) -> List[LotData]:
if row and row[3] != "nie":
lots.append(
LotData(
id=f"jena-{row[0]}",
id=name_to_id("jena", row[0]),
timestamp=now,
status=LotData.Status.open,
num_free=int_or_none(row[1]),
Expand All @@ -53,7 +53,7 @@ def get_lot_infos(self) -> List[LotInfo]:

lots.append(
LotInfo(
id=f"jena-{name}",
id=name_to_id("jena", name),
name=name,
type=LotInfo.Types.unknown,
public_url=urllib.parse.urljoin(self.POOL.public_url, content_rows[0][1].find("a")["href"]),
Expand Down
14 changes: 8 additions & 6 deletions web/scrapers/builtin/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
"properties": {
"id": {
"type": "string",
"maxLength": 64
"maxLength": 64,
"pattern": "^[a-z0-9\\-]+$"
},
"name": {
"type": "string",
Expand All @@ -40,12 +41,12 @@
"public_url": {
"type": "string",
"maxLength": 4096,
"pattern": "[a-z]+://.+"
"pattern": "^[a-z]+://.+"
},
"source_url": {
"type": ["string", "null"],
"maxLength": 4096,
"pattern": "[a-z]+://.+"
"pattern": "^[a-z]+://.+"
},
"timezone": {
"type": "string",
Expand Down Expand Up @@ -77,7 +78,8 @@
"properties": {
"id": {
"type": "string",
"maxLength": 64
"maxLength": 64,
"pattern": "^[a-z0-9\\-æø]+$"
},
"name": {
"type": "string",
Expand All @@ -91,12 +93,12 @@
"public_url": {
"type": ["string", "null"],
"maxLength": 4096,
"pattern": "[a-z]+://.+"
"pattern": "^[a-z]+://.+"
},
"source_url": {
"type": "string",
"maxLength": 4096,
"pattern": "[a-z]+://.+"
"pattern": "^[a-z]+://.+"
},
"address": {
"type": ["string", "null"],
Expand Down
61 changes: 43 additions & 18 deletions web/scrapers/builtin/util/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,32 +27,28 @@ def guess_lot_type(name: str) -> Optional[str]:

def name_to_legacy_id(city_name: str, lot_name: str) -> str:
    """
    Build the legacy lot ID from a city/pool name and a lot name.

    Both parts are concatenated without a separator, lowercased and
    passed through ``remove_special_chars``. The result is meant to be
    identical to the original ParkAPI ID.

    :param city_name: str, city or pool prefix
    :param lot_name: str, name of the lot
    :return: a normalized string
    """
    return remove_special_chars(f"{city_name}{lot_name}".lower())


def name_to_id(name: str) -> str:
"""
Converts any string to
- ascii alphanumeric or "-" characters
- no spaces
- lowercase
- maximal length of 64
def name_to_id(city_name: str, lot_name: str) -> str:
"""
id_name = str(name)
id_name = id_name.replace("ß", "ss")
id_name = unicodedata.normalize('NFKD', id_name).encode("ascii", "ignore").decode("ascii")
Converts city/pool name and lot name to a lot ID
with only ascii characters and "-"
id_name = "".join(
c if c.isalnum() or c in " \t" else "-"
for c in id_name
).replace(" ", "-")

id_name = RE_MULTI_MINUS.sub("-", id_name).strip("-")
return id_name.lower()[:64]
:param city_name: str, city or pool prefix
:param lot_name: str, name of the lot
:return: a normalized string, maximum length of 64 characters!
"""
name = f"{city_name}-{lot_name}".lower()
return remove_special_chars_v2(name)[:64]


def remove_special_chars(name: str) -> str:
Expand Down Expand Up @@ -80,6 +76,35 @@ def remove_special_chars(name: str) -> str:
return name


def remove_special_chars_v2(name: str) -> str:
    """
    Converts any string to
    - ascii alphanumeric or "-" characters
    - no whitespace (spaces, tabs and newlines become "-")
    - lowercase
    - maximal length of 64

    German umlauts and "ß" are transliterated ("ä" -> "ae", "ß" -> "ss").
    Other accented characters lose their diacritics, characters without
    an ascii decomposition are dropped, and every remaining
    non-alphanumeric character is turned into "-". Runs of "-" are
    collapsed and leading/trailing "-" are removed.

    :param name: any value, it is converted with str()
    :return: a normalized string, may be empty
    """
    replacements = {
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "ß": "ss",
    }
    # Lowercase first so that uppercase umlauts hit the table as well.
    id_name = str(name).lower()
    for old, new in replacements.items():
        id_name = id_name.replace(old, new)

    # Strip the remaining diacritics and drop whatever has no ascii form.
    id_name = unicodedata.normalize("NFKD", id_name).encode("ascii", "ignore").decode("ascii")

    # Every non-alphanumeric character, whitespace included, is a separator.
    id_name = "".join(c if c.isalnum() else "-" for c in id_name)

    # Splitting on "-" and dropping empty parts collapses runs of "-"
    # and removes them from both ends in one step.
    id_name = "-".join(part for part in id_name.split("-") if part)

    # NFKD can yield uppercase ascii for some compatibility characters,
    # so lowercase once more before truncating.
    return id_name.lower()[:64]


def int_or_none(x) -> Optional[int]:
try:
x = str(x)
Expand Down

0 comments on commit 022fa80

Please sign in to comment.