Skip to content

Commit

Permalink
chg: accept same input data shape for merge and create_many
Browse files Browse the repository at this point in the history
- chg: move insert data reshaping to create_many so that it's
  easier to use and accepts the same kind of data as merge.
- add: create_source/source_size parameters for both methods to
  allow specifying this info separately from the data.
- chg: docstring improvements
  • Loading branch information
lindsay-stevens committed Aug 29, 2024
1 parent d657b0f commit 72bb68e
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from pyodk import Client

project_id = 1
entity_list_name = f"previous_survey_{uuid4()}"
entity_label_field = "first_name"
entity_properties = ("age", "location")
csv_path = Path("./imported_answers.csv")
Expand All @@ -18,28 +17,30 @@
def create_one_at_a_time():
with Client(project_id=project_id) as client, open(csv_path) as csv_file:
# Create the entity list.
client.entity_lists.create(entity_list_name=entity_list_name)
entity_list = client.entity_lists.create(
entity_list_name=f"previous_survey_{uuid4()}"
)
for prop in entity_properties:
client.entity_lists.add_property(name=prop, entity_list_name=entity_list_name)
client.entity_lists.add_property(name=prop, entity_list_name=entity_list.name)

# Create the entities from the CSV data.
for row in DictReader(csv_file):
client.entities.create(
label=row[entity_label_field],
data={k: str(v) for k, v in row.items() if k in entity_properties},
entity_list_name=entity_list_name,
entity_list_name=entity_list.name,
)


def create_with_merge():
with Client(project_id=project_id) as client, open(csv_path) as csv_file:
client.entity_lists.default_entity_list_name = client.session.get_xform_uuid()
client.entity_lists.default_entity_list_name = f"previous_survey_{uuid4()}"
entity_list = client.entity_lists.create()
client.entities.merge(
source_data=list(DictReader(csv_file)),
data=DictReader(csv_file),
entity_list_name=entity_list.name,
source_label_key=entity_label_field,
source_keys=entity_properties,
source_keys=(entity_label_field, *entity_properties),
)


Expand Down
124 changes: 83 additions & 41 deletions pyodk/_endpoints/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any
from uuid import uuid4

from pyodk.__version__ import __version__
from pyodk._endpoints import bases
from pyodk._endpoints.entity_list_properties import EntityListPropertyService
from pyodk._utils import validators as pv
Expand Down Expand Up @@ -179,34 +180,65 @@ def create(

def create_many(
self,
data: dict,
data: Iterable[Mapping[str, Any]],
entity_list_name: str | None = None,
project_id: int | None = None,
) -> Entity:
create_source: str | None = None,
source_size: str | None = None,
) -> bool:
"""
Create one or more Entities in a single request.
Required keys in data: entities[].label, entities[].data, source.name.
Example of the required data format:
Example input for `data` would be a list of dictionaries from a CSV file:
data = [
{"label": "Sydney", "state": "NSW", "postcode": "2000"},
{"label": "Melbourne", "state": "VIC", "postcode": "3000"},
]
{
"entities": [
{"label": "Sydney", "data": {"state": "NSW", "postcode": "2000"}},
{"label": "Melbourne", "data": {"state": "VIC", "postcode": "3000"}},
],
"source": {"name": "pyodk", "size": 1},
}
Each Entity in `data` must include a "label" key. An Entity List property must be
created in advance for each key in `data` that is not "label". The `merge` method
can be used to automatically add properties (or a subset) and create Entities.
:param data: Data to store for the Entities.
:param entity_list_name: The name of the Entity List (Dataset) being referenced.
:param project_id: The id of the project this Entity belongs to.
:param create_source: Used to capture the source of the change in Central, for
example a file name. Defaults to the PyODK version.
:param source_size: Used to capture the size of the source data in Central, for
example a file size or row count. Excluded if None.
"""
if create_source is None:
create_source = f"pyodk v{__version__}"
if source_size is None:
size = {}
else:
size = {"size": source_size}

def reshape(d):
try:
new = [
{
"label": i["label"],
"data": {k: i.get(k) for k in i if k != "label"},
}
for i in d
]
except KeyError as kerr:
raise PyODKError("All data must include a 'label' key.") from kerr
else:
return new

try:
pid = pv.validate_project_id(project_id, self.default_project_id)
eln = pv.validate_entity_list_name(
entity_list_name, self.default_entity_list_name
)
data = pv.validate_is_instance(data, typ=Iterable, key="data")
final_data = {
"entities": reshape(data),
"source": {"name": create_source, **size},
}
except PyODKError as err:
log.error(err, exc_info=True)
raise
Expand All @@ -215,7 +247,7 @@ def create_many(
method="POST",
url=self.session.urlformat(self.urls.post, project_id=pid, el_name=eln),
logger=log,
json=data,
json=final_data,
)
data = response.json()
return data["success"]
Expand Down Expand Up @@ -463,7 +495,7 @@ def get_key(entity: Mapping[str, Any], keys: list) -> tuple:

def merge(
self,
source_data: Iterable[Mapping[str, Any]],
data: Iterable[Mapping[str, Any]],
entity_list_name: str | None = None,
project_id: int | None = None,
match_keys: Iterable[str] | None = None,
Expand All @@ -472,46 +504,56 @@ def merge(
delete_not_matched: bool = False,
source_label_key: str = "label",
source_keys: Iterable[str] | None = None,
create_source: str = "pyodk",
create_source: str | None = None,
source_size: str | None = None,
) -> MergeActions:
"""
Update Entities in Central based on the provided source data.
Update Entities in Central based on the provided data:
1. Create Entities in the source data that don't exist in Central.
2. Update Entities in Central that match the source data.
3. Optionally, delete any Entities in Central that aren't in the source data.
1. Create Entities from `data` that don't exist in Central.
2. Update Entities from `data` that exist in Central.
3. Optionally, delete any Entities in Central that don't exist in `data`.
Creation is performed using the bulk creation endpoint. This method may be slow
for large quantities of updates or deletes, since for these operations each
change is a request in a loop. If this is a concern, set the parameters
`update_matched` and `delete_not_matched` to False and use the return value to
perform threaded or async requests for these data.
Example input for `source_data` would be a list of dictionaries from a CSV file:
:param source_data: Data to use for updating Entities in Central.
data = [
{"label": "Sydney", "state": "NSW", "postcode": "2000"},
{"label": "Melbourne", "state": "VIC", "postcode": "3000"},
]
Entity creation is performed in one request using `create_many`. The merge
operation may be slow if large quantities of updates or deletes are required,
since for these operations each change is a request in a loop. If this is a
concern, set the parameters `update_matched` and `delete_not_matched` to False and
use the return value to perform threaded or async requests for these data.
:param data: Data to use for updating Entities in Central.
:param entity_list_name: The name of the Entity List (Dataset) being referenced.
:param project_id: The id of the project this Entity belongs to.
:param match_keys: Dictionary keys common to source and target used to match rows.
Defaults to ("label",). If a custom source_label_key is provided, specify that
key as "label", because it is translated to "label" for matching.
:param add_new_properties: If True, add any Entity List properties from the
source data that aren't in Central.
:param update_matched: If True, update any Entities in Central that match the
source data but have different properties.
:param add_new_properties: If True, add any Entity List properties from `data`
that aren't in Central.
:param update_matched: If True, update any Entities in Central that match `data`
but have different properties.
:param delete_not_matched: If True, delete any Entities in Central that aren't
in the source data.
:param source_label_key: The key in the source data to use as the label. The
target label key is always "label" because this key is required by Central.
:param source_keys: If provided, process only these keys in the source data.
:param create_source: When creating Entities in bulk, this value is used to
capture the source of the change in Central.
in `data`.
:param source_label_key: The key in `data` to use as the label. The target label
key is always "label" because this key is required by Central.
:param source_keys: If provided, process only these keys in `data`.
:param create_source: If Entities are created, this is used to capture the source
of the change in Central, for example a file name. Defaults to the PyODK version.
:param source_size: If Entities are created, this is used to capture the size of
`data` in Central, for example a file size. Excluded if None.
"""
pid = pv.validate_project_id(project_id, self.default_project_id)
eln = pv.validate_entity_list_name(
entity_list_name, self.default_entity_list_name
)
target_data = self.get_table(entity_list_name=entity_list_name)["value"]
merge_actions = self._prep_data_for_merge(
source_data=source_data,
source_data=data,
target_data=target_data,
match_keys=match_keys,
source_label_key=source_label_key,
Expand All @@ -529,16 +571,16 @@ def merge(
else:
merge_actions.final_keys = merge_actions.keys_intersect
if len(merge_actions.to_insert) > 0:
insert_reshape = [
{
"label": i["label"],
"data": {k: i.get(k) for k in i if k in merge_actions.final_keys},
}
relevant_keys = {"label", *merge_actions.final_keys}
insert_filter = [
{k: i.get(k) for k in i if k in relevant_keys}
for i in merge_actions.to_insert.values()
]
self.create_many(
data={"entities": insert_reshape, "source": {"name": create_source}},
data=insert_filter,
entity_list_name=eln,
create_source=create_source,
source_size=source_size,
)
if update_matched:
for u in merge_actions.to_update.values():
Expand Down
3 changes: 2 additions & 1 deletion tests/endpoints/test_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from unittest import TestCase
from unittest.mock import MagicMock, patch

from pyodk._endpoints.entities import Entity
from pyodk._endpoints.entities import Entity, MergeActions
from pyodk._endpoints.entities import EntityService as es
from pyodk._utils.session import Session
from pyodk.client import Client
Expand Down Expand Up @@ -351,6 +351,7 @@ def test_merge__all_ops(self):
{"label": "Melbourne", "state": "VIC"},
]
observed = es._prep_data_for_merge(source_data=source, target_data=target)
self.assertIsInstance(observed, MergeActions)
self.assertEqual(1, len(observed.to_insert))
self.assertEqual(
source[1]["label"],
Expand Down
28 changes: 11 additions & 17 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def test_entity__merge__new(self):
)
entity_list = self.client.entity_lists.create()
self.client.entities.merge(
source_data=[
data=[
{"label": "Sydney", "state": "NSW"},
{"label": "Melbourne", "state": "VIC"},
],
Expand All @@ -321,18 +321,15 @@ def test_entity__merge__existing__add_props__delete_unmatched(self):
name="state", entity_list_name=entity_list.name
)
self.client.entities.create_many(
data={
"entities": [
{"label": "Sydney", "data": {"state": "VIC"}},
{"label": "Darwin", "data": {"state": "NT"}},
],
"source": {"name": "pyodk"},
},
data=[
{"label": "Sydney", "state": "VIC"},
{"label": "Darwin", "state": "NT"},
],
entity_list_name=entity_list.name,
)
# Add postcode property, Add Brisbane, update Sydney, delete Darwin.
self.client.entities.merge(
source_data=[
data=[
{"label": "Sydney", "state": "NSW", "postcode": "2001"},
{"label": "Brisbane", "state": "QLD", "postcode": "4000"},
],
Expand Down Expand Up @@ -366,18 +363,15 @@ def test_entity__merge__existing__ignore_props__keep_unmatched(self):
name="state", entity_list_name=entity_list.name
)
self.client.entities.create_many(
data={
"entities": [
{"label": "Sydney", "data": {"state": "VIC"}},
{"label": "Darwin", "data": {"state": "NT"}},
],
"source": {"name": "pyodk"},
},
data=[
{"label": "Sydney", "state": "VIC"},
{"label": "Darwin", "state": "NT"},
],
entity_list_name=entity_list.name,
)
# Skip postcode property, add Brisbane, update Sydney, keep Darwin.
self.client.entities.merge(
source_data=[
data=[
{"label": "Sydney", "state": "NSW", "postcode": "2000"}, # update
{"label": "Brisbane", "state": "QLD", "postcode": "4000"}, # insert
],
Expand Down

0 comments on commit 72bb68e

Please sign in to comment.