Skip to content

Commit

Permalink
Merge branch 'ecmwf:develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
OpheliaMiralles authored Oct 29, 2024
2 parents 1c87c51 + e5ec079 commit 0138022
Show file tree
Hide file tree
Showing 19 changed files with 336 additions and 55 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
Please add your functional changes to the appropriate section in the PR.
Keep it human-readable, your future self will thank you!

## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...HEAD)
## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.8...HEAD)

## [0.5.8](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...0.5.8) - 2024-10-26

### Changed

- Bugfix in `auto_adjust`
- Fixed precommit CI errors
- Improve tests
- More verbosity

### Added

- Add anemoi-transform link to documentation
- Various bug fixes
- Control compatibility check in xy/zip
- Add `merge` feature
- Add support for storing `supporting_arrays` in checkpoint files
- Allow naming of datasets components
- Contributors file (#105)

### Changed

- Remove upstream dependencies from downstream-ci workflow (temporary) (#83)
- ci: pin python versions to 3.9 ... 3.12 for checks (#93)
- Fix `__version__` import in init

## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09

Expand Down
13 changes: 13 additions & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## How to Contribute

Please see the [read the docs](https://anemoi-training.readthedocs.io/en/latest/dev/contributing.html).


## Contributors

Thank you to all the wonderful people who have contributed to Anemoi. Contributions can come in many forms, including code, documentation, bug reports, feature suggestions, design, and more. A list of code-based contributors can be found [here](https://github.com/ecmwf/anemoi-datasets/graphs/contributors).


## Contributing Organisations

Significant contributions have been made by the following organisations: [DWD](https://www.dwd.de/), [MET Norway](https://www.met.no/), [MeteoSwiss](https://www.meteoswiss.admin.ch/), [RMI](https://www.meteo.be/) & [ECMWF](https://www.ecmwf.int/)
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@

project = "Anemoi"

author = "ECMWF"
author = "Anemoi contributors"

year = datetime.datetime.now().year
if year == 2024:
years = "2024"
else:
years = "2024-%s" % (year,)

copyright = "%s, ECMWF" % (years,)
copyright = "%s, Anemoi contributors" % (years,)


try:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ dynamic = [
]
dependencies = [
"anemoi-transform>=0.0.5",
"anemoi-utils[provenance]>=0.4.1",
"anemoi-utils[provenance]>=0.4.2",
"cfunits",
"numpy",
"pyyaml",
Expand Down
10 changes: 8 additions & 2 deletions src/anemoi/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,21 @@
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

from ._version import __version__
from .data import MissingDateError
from .data import add_dataset_path
from .data import add_named_dataset
from .data import list_dataset_names
from .data import open_dataset

try:
# NOTE: the `_version.py` file must not be present in the git repository
# as it is generated by setuptools at install time
from ._version import __version__ # type: ignore
except ImportError: # pragma: no cover
# Local copy or not installed with setuptools
__version__ = "999"

__all__ = [
"__version__",
"add_dataset_path",
"add_named_dataset",
"list_dataset_names",
Expand Down
4 changes: 1 addition & 3 deletions src/anemoi/datasets/commands/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@


def task(what, options, *args, **kwargs):
"""
Make sure `import Creator` is done in the sub-processes, and not in the main one.
"""
"""Make sure `import Creator` is done in the sub-processes, and not in the main one."""

now = datetime.datetime.now()
LOG.info(f"🎬 Task {what}({args},{kwargs}) starting")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,8 @@ def __repr__(self):
)

def __getitem__(self, i):
"""
Get a 2D field from the variable
"""
"""Get a 2D field from the variable"""

if i >= self.length:
raise IndexError(i)

Expand Down
23 changes: 23 additions & 0 deletions src/anemoi/datasets/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,30 @@ class MissingDateError(Exception):
pass


def _convert(x):

if isinstance(x, list):
return [_convert(a) for a in x]

if isinstance(x, tuple):
return tuple(_convert(a) for a in x)

if isinstance(x, dict):
return {k: _convert(v) for k, v in x.items()}

if x.__class__.__name__ in ("DictConfig", "ListConfig"):
from omegaconf import OmegaConf

return OmegaConf.to_container(x, resolve=True)

return x


def open_dataset(*args, **kwargs):

# That will get rid of OmegaConf objects
args, kwargs = _convert(args), _convert(kwargs)

ds = _open_dataset(*args, **kwargs)
ds = ds.mutate()
ds.arguments = {"args": args, "kwargs": kwargs}
Expand Down
143 changes: 117 additions & 26 deletions src/anemoi/datasets/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,40 @@
LOG = logging.getLogger(__name__)


def _tidy(v):
if isinstance(v, (list, tuple, set)):
return [_tidy(i) for i in v]
if isinstance(v, dict):
return {k: _tidy(v) for k, v in v.items()}
if isinstance(v, str) and v.startswith("/"):
return os.path.basename(v)
if isinstance(v, datetime.datetime):
return v.isoformat()
if isinstance(v, datetime.date):
return v.isoformat()
if isinstance(v, datetime.timedelta):
return frequency_to_string(v)

if isinstance(v, Dataset):
# That can happen in the `arguments`
# if a dataset is passed as an argument
return repr(v)

if isinstance(v, slice):
return (v.start, v.stop, v.step)

return v


class Dataset:
arguments = {}
_name = None

def mutate(self) -> "Dataset":
"""
Give an opportunity to a subclass to return a new Dataset
"""Give an opportunity to a subclass to return a new Dataset
object of a different class, if needed.
"""

return self

def swap_with_parent(self, parent):
Expand All @@ -41,6 +67,21 @@ def _len(self):
return len(self)

def _subset(self, **kwargs):

if not kwargs:
return self.mutate()

name = kwargs.pop("name", None)
result = self.__subset(**kwargs)
result._name = name

return result

@property
def name(self):
return self._name

def __subset(self, **kwargs):
if not kwargs:
return self.mutate()

Expand Down Expand Up @@ -254,41 +295,32 @@ def typed_variables(self):

return result

def _input_sources(self):
sources = []
self.collect_input_sources(sources)
return sources

def metadata(self):
import anemoi

def tidy(v):
if isinstance(v, (list, tuple, set)):
return [tidy(i) for i in v]
if isinstance(v, dict):
return {k: tidy(v) for k, v in v.items()}
if isinstance(v, str) and v.startswith("/"):
return os.path.basename(v)
if isinstance(v, datetime.datetime):
return v.isoformat()
if isinstance(v, datetime.date):
return v.isoformat()
if isinstance(v, datetime.timedelta):
return frequency_to_string(v)

if isinstance(v, Dataset):
# That can happen in the `arguments`
# if a dataset is passed as an argument
return repr(v)

if isinstance(v, slice):
return (v.start, v.stop, v.step)

return v
_, source_to_arrays = self._supporting_arrays_and_sources()

sources = []
for i, source in enumerate(self._input_sources()):
source_metadata = source.dataset_metadata().copy()
source_metadata["supporting_arrays"] = source_to_arrays[id(source)]
sources.append(source_metadata)

md = dict(
version=anemoi.datasets.__version__,
arguments=self.arguments,
**self.dataset_metadata(),
sources=sources,
supporting_arrays=source_to_arrays[id(self)],
)

try:
return json.loads(json.dumps(tidy(md)))
return json.loads(json.dumps(_tidy(md)))
except Exception:
LOG.exception("Failed to serialize metadata")
pprint.pprint(md)
Expand All @@ -313,8 +345,67 @@ def dataset_metadata(self):
dtype=str(self.dtype),
start_date=self.start_date.astype(str),
end_date=self.end_date.astype(str),
name=self.name,
)

def _supporting_arrays(self, *path):

import numpy as np

def _path(path, name):
return "/".join(str(_) for _ in [*path, name])

result = {
_path(path, "latitudes"): self.latitudes,
_path(path, "longitudes"): self.longitudes,
}
collected = []

self.collect_supporting_arrays(collected, *path)

for path, name, array in collected:
assert isinstance(path, tuple) and isinstance(name, str)
assert isinstance(array, np.ndarray)

name = _path(path, name)

if name in result:
raise ValueError(f"Duplicate key {name}")

result[name] = array

return result

def supporting_arrays(self):
"""Arrays to be saved in the checkpoints"""
arrays, _ = self._supporting_arrays_and_sources()
return arrays

def _supporting_arrays_and_sources(self):

source_to_arrays = {}

# Top levels arrays
result = self._supporting_arrays()
source_to_arrays[id(self)] = sorted(result.keys())

# Arrays from the input sources
for i, source in enumerate(self._input_sources()):
name = source.name if source.name is not None else i
src_arrays = source._supporting_arrays(name)
source_to_arrays[id(source)] = sorted(src_arrays.keys())

for k in src_arrays:
assert k not in result

result.update(src_arrays)

return result, source_to_arrays

def collect_supporting_arrays(self, collected, *path):
# Override this method to add more arrays
pass

def metadata_specific(self, **kwargs):
action = self.__class__.__name__.lower()
# assert isinstance(self.frequency, datetime.timedelta), (self.frequency, self, action)
Expand Down
19 changes: 19 additions & 0 deletions src/anemoi/datasets/data/forwards.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@


import logging
import warnings
from functools import cached_property

import numpy as np
Expand All @@ -34,6 +35,12 @@ def __len__(self):
def __getitem__(self, n):
return self.forward[n]

@property
def name(self):
if self._name is not None:
return self._name
return self.forward.name

@property
def dates(self):
return self.forward.dates
Expand Down Expand Up @@ -102,6 +109,12 @@ def metadata_specific(self, **kwargs):
**kwargs,
)

def collect_supporting_arrays(self, collected, *path):
self.forward.collect_supporting_arrays(collected, *path)

def collect_input_sources(self, collected):
self.forward.collect_input_sources(collected)

def source(self, index):
return self.forward.source(index)

Expand Down Expand Up @@ -197,6 +210,12 @@ def metadata_specific(self, **kwargs):
**kwargs,
)

def collect_supporting_arrays(self, collected, *path):
warnings.warn(f"The behaviour of {self.__class__.__name__}.collect_supporting_arrays() is not well defined")
for i, d in enumerate(self.datasets):
name = d.name if d.name is not None else i
d.collect_supporting_arrays(collected, *path, name)

@property
def missing(self):
raise NotImplementedError("missing() not implemented for Combined")
Expand Down
Loading

0 comments on commit 0138022

Please sign in to comment.