Merge branch 'ecmwf:develop' into develop

MeteoSwiss · Oct 29, 2024 · 0138022 · 0138022
2 parents 1c87c51 + e5ec079
commit 0138022
Show file tree

Hide file tree

Showing 19 changed files with 336 additions and 55 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,23 +8,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 Please add your functional changes to the appropriate section in the PR.
 Keep it human-readable, your future self will thank you!
 
-## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...HEAD)
+## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.8...HEAD)
+
+## [0.5.8](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...0.5.8) - 2024-10-26
 
 ### Changed
 
 - Bugfix in `auto_adjust`
+- Fixed precommit CI errors
+- Improve tests
+- More verbosity
 
 ### Added
 
 - Add anemoi-transform link to documentation
 - Various bug fixes
 - Control compatibility check in xy/zip
 - Add `merge` feature
+- Add support for storing `supporting_arrays` in checkpoint files
+- Allow naming of datasets components
+- Contributors file (#105)
 
 ### Changed
 
 - Remove upstream dependencies from downstream-ci workflow (temporary) (#83)
 - ci: pin python versions to 3.9 ... 3.12 for checks (#93)
+- Fix `__version__` import in init
 
 ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09
 

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -0,0 +1,13 @@
+## How to Contribute
+
+Please see the [read the docs](https://anemoi-training.readthedocs.io/en/latest/dev/contributing.html).
+
+
+## Contributors
+
+Thank you to all the wonderful people who have contributed to Anemoi. Contributions can come in many forms, including code, documentation, bug reports, feature suggestions, design, and more. A list of code-based contributors can be found [here](https://github.com/ecmwf/anemoi-datasets/graphs/contributors).
+
+
+## Contributing Organisations
+
+Significant contributions have been made by the following organisations: [DWD](https://www.dwd.de/), [MET Norway](https://www.met.no/), [MeteoSwiss](https://www.meteoswiss.admin.ch/), [RMI](https://www.meteo.be/) & [ECMWF](https://www.ecmwf.int/)
diff --git a/docs/conf.py b/docs/conf.py
@@ -34,15 +34,15 @@
 
 project = "Anemoi"
 
-author = "ECMWF"
+author = "Anemoi contributors"
 
 year = datetime.datetime.now().year
 if year == 2024:
     years = "2024"
 else:
     years = "2024-%s" % (year,)
 
-copyright = "%s, ECMWF" % (years,)
+copyright = "%s, Anemoi contributors" % (years,)
 
 
 try:

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,7 +51,7 @@ dynamic = [
 ]
 dependencies = [
   "anemoi-transform>=0.0.5",
-  "anemoi-utils[provenance]>=0.4.1",
+  "anemoi-utils[provenance]>=0.4.2",
   "cfunits",
   "numpy",
   "pyyaml",

diff --git a/src/anemoi/datasets/__init__.py b/src/anemoi/datasets/__init__.py
@@ -5,15 +5,21 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
-from ._version import __version__
 from .data import MissingDateError
 from .data import add_dataset_path
 from .data import add_named_dataset
 from .data import list_dataset_names
 from .data import open_dataset
 
+try:
+    # NOTE: the `_version.py` file must not be present in the git repository
+    #   as it is generated by setuptools at install time
+    from ._version import __version__  # type: ignore
+except ImportError:  # pragma: no cover
+    # Local copy or not installed with setuptools
+    __version__ = "999"
+
 __all__ = [
-    "__version__",
     "add_dataset_path",
     "add_named_dataset",
     "list_dataset_names",

diff --git a/src/anemoi/datasets/commands/create.py b/src/anemoi/datasets/commands/create.py
@@ -23,9 +23,7 @@
 
 
 def task(what, options, *args, **kwargs):
-    """
-    Make sure `import Creator` is done in the sub-processes, and not in the main one.
-    """
+    """Make sure `import Creator` is done in the sub-processes, and not in the main one."""
 
     now = datetime.datetime.now()
     LOG.info(f"🎬 Task {what}({args},{kwargs}) starting")

diff --git a/src/anemoi/datasets/create/functions/sources/xarray/variable.py b/src/anemoi/datasets/create/functions/sources/xarray/variable.py
@@ -83,9 +83,8 @@ def __repr__(self):
         )
 
     def __getitem__(self, i):
-        """
-        Get a 2D field from the variable
-        """
+        """Get a 2D field from the variable"""
+
         if i >= self.length:
             raise IndexError(i)
 

diff --git a/src/anemoi/datasets/data/__init__.py b/src/anemoi/datasets/data/__init__.py
@@ -25,7 +25,30 @@ class MissingDateError(Exception):
     pass
 
 
+def _convert(x):
+
+    if isinstance(x, list):
+        return [_convert(a) for a in x]
+
+    if isinstance(x, tuple):
+        return tuple(_convert(a) for a in x)
+
+    if isinstance(x, dict):
+        return {k: _convert(v) for k, v in x.items()}
+
+    if x.__class__.__name__ in ("DictConfig", "ListConfig"):
+        from omegaconf import OmegaConf
+
+        return OmegaConf.to_container(x, resolve=True)
+
+    return x
+
+
 def open_dataset(*args, **kwargs):
+
+    # That will get rid of OmegaConf objects
+    args, kwargs = _convert(args), _convert(kwargs)
+
     ds = _open_dataset(*args, **kwargs)
     ds = ds.mutate()
     ds.arguments = {"args": args, "kwargs": kwargs}

diff --git a/src/anemoi/datasets/data/dataset.py b/src/anemoi/datasets/data/dataset.py
@@ -23,14 +23,40 @@
 LOG = logging.getLogger(__name__)
 
 
+def _tidy(v):
+    if isinstance(v, (list, tuple, set)):
+        return [_tidy(i) for i in v]
+    if isinstance(v, dict):
+        return {k: _tidy(v) for k, v in v.items()}
+    if isinstance(v, str) and v.startswith("/"):
+        return os.path.basename(v)
+    if isinstance(v, datetime.datetime):
+        return v.isoformat()
+    if isinstance(v, datetime.date):
+        return v.isoformat()
+    if isinstance(v, datetime.timedelta):
+        return frequency_to_string(v)
+
+    if isinstance(v, Dataset):
+        # That can happen in the `arguments`
+        # if a dataset is passed as an argument
+        return repr(v)
+
+    if isinstance(v, slice):
+        return (v.start, v.stop, v.step)
+
+    return v
+
+
 class Dataset:
     arguments = {}
+    _name = None
 
     def mutate(self) -> "Dataset":
-        """
-        Give an opportunity to a subclass to return a new Dataset
+        """Give an opportunity to a subclass to return a new Dataset
         object of a different class, if needed.
         """
+
         return self
 
     def swap_with_parent(self, parent):
@@ -41,6 +67,21 @@ def _len(self):
         return len(self)
 
     def _subset(self, **kwargs):
+
+        if not kwargs:
+            return self.mutate()
+
+        name = kwargs.pop("name", None)
+        result = self.__subset(**kwargs)
+        result._name = name
+
+        return result
+
+    @property
+    def name(self):
+        return self._name
+
+    def __subset(self, **kwargs):
         if not kwargs:
             return self.mutate()
 
@@ -254,41 +295,32 @@ def typed_variables(self):
 
         return result
 
+    def _input_sources(self):
+        sources = []
+        self.collect_input_sources(sources)
+        return sources
+
     def metadata(self):
         import anemoi
 
-        def tidy(v):
-            if isinstance(v, (list, tuple, set)):
-                return [tidy(i) for i in v]
-            if isinstance(v, dict):
-                return {k: tidy(v) for k, v in v.items()}
-            if isinstance(v, str) and v.startswith("/"):
-                return os.path.basename(v)
-            if isinstance(v, datetime.datetime):
-                return v.isoformat()
-            if isinstance(v, datetime.date):
-                return v.isoformat()
-            if isinstance(v, datetime.timedelta):
-                return frequency_to_string(v)
-
-            if isinstance(v, Dataset):
-                # That can happen in the `arguments`
-                # if a dataset is passed as an argument
-                return repr(v)
-
-            if isinstance(v, slice):
-                return (v.start, v.stop, v.step)
-
-            return v
+        _, source_to_arrays = self._supporting_arrays_and_sources()
+
+        sources = []
+        for i, source in enumerate(self._input_sources()):
+            source_metadata = source.dataset_metadata().copy()
+            source_metadata["supporting_arrays"] = source_to_arrays[id(source)]
+            sources.append(source_metadata)
 
         md = dict(
             version=anemoi.datasets.__version__,
             arguments=self.arguments,
             **self.dataset_metadata(),
+            sources=sources,
+            supporting_arrays=source_to_arrays[id(self)],
         )
 
         try:
-            return json.loads(json.dumps(tidy(md)))
+            return json.loads(json.dumps(_tidy(md)))
         except Exception:
             LOG.exception("Failed to serialize metadata")
             pprint.pprint(md)
@@ -313,8 +345,67 @@ def dataset_metadata(self):
             dtype=str(self.dtype),
             start_date=self.start_date.astype(str),
             end_date=self.end_date.astype(str),
+            name=self.name,
         )
 
+    def _supporting_arrays(self, *path):
+
+        import numpy as np
+
+        def _path(path, name):
+            return "/".join(str(_) for _ in [*path, name])
+
+        result = {
+            _path(path, "latitudes"): self.latitudes,
+            _path(path, "longitudes"): self.longitudes,
+        }
+        collected = []
+
+        self.collect_supporting_arrays(collected, *path)
+
+        for path, name, array in collected:
+            assert isinstance(path, tuple) and isinstance(name, str)
+            assert isinstance(array, np.ndarray)
+
+            name = _path(path, name)
+
+            if name in result:
+                raise ValueError(f"Duplicate key {name}")
+
+            result[name] = array
+
+        return result
+
+    def supporting_arrays(self):
+        """Arrays to be saved in the checkpoints"""
+        arrays, _ = self._supporting_arrays_and_sources()
+        return arrays
+
+    def _supporting_arrays_and_sources(self):
+
+        source_to_arrays = {}
+
+        # Top levels arrays
+        result = self._supporting_arrays()
+        source_to_arrays[id(self)] = sorted(result.keys())
+
+        # Arrays from the input sources
+        for i, source in enumerate(self._input_sources()):
+            name = source.name if source.name is not None else i
+            src_arrays = source._supporting_arrays(name)
+            source_to_arrays[id(source)] = sorted(src_arrays.keys())
+
+            for k in src_arrays:
+                assert k not in result
+
+            result.update(src_arrays)
+
+        return result, source_to_arrays
+
+    def collect_supporting_arrays(self, collected, *path):
+        # Override this method to add more arrays
+        pass
+
     def metadata_specific(self, **kwargs):
         action = self.__class__.__name__.lower()
         # assert isinstance(self.frequency, datetime.timedelta), (self.frequency, self, action)

diff --git a/src/anemoi/datasets/data/forwards.py b/src/anemoi/datasets/data/forwards.py
@@ -9,6 +9,7 @@
 
 
 import logging
+import warnings
 from functools import cached_property
 
 import numpy as np
@@ -34,6 +35,12 @@ def __len__(self):
     def __getitem__(self, n):
         return self.forward[n]
 
+    @property
+    def name(self):
+        if self._name is not None:
+            return self._name
+        return self.forward.name
+
     @property
     def dates(self):
         return self.forward.dates
@@ -102,6 +109,12 @@ def metadata_specific(self, **kwargs):
             **kwargs,
         )
 
+    def collect_supporting_arrays(self, collected, *path):
+        self.forward.collect_supporting_arrays(collected, *path)
+
+    def collect_input_sources(self, collected):
+        self.forward.collect_input_sources(collected)
+
     def source(self, index):
         return self.forward.source(index)
 
@@ -197,6 +210,12 @@ def metadata_specific(self, **kwargs):
             **kwargs,
         )
 
+    def collect_supporting_arrays(self, collected, *path):
+        warnings.warn(f"The behaviour of {self.__class__.__name__}.collect_supporting_arrays() is not well defined")
+        for i, d in enumerate(self.datasets):
+            name = d.name if d.name is not None else i
+            d.collect_supporting_arrays(collected, *path, name)
+
     @property
     def missing(self):
         raise NotImplementedError("missing() not implemented for Combined")