Commit a9eb155

Merge branch 'main' into update-tensorflow

dnerini committed Nov 1, 2023
2 parents d4abd9d + 3399e66
Showing 9 changed files with 192 additions and 40 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/publish-package.yml
@@ -0,0 +1,30 @@
+# This workflow will upload a Python package when a release is created
+
+name: Build and publish library
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"  # this job defines no matrix, so a pinned version is used
+      - name: Setup poetry
+        uses: abatilo/actions-poetry@v2
+      - name: Install the project dependencies
+        run: |
+          poetry install
+      - name: Package library
+        run: poetry build
+      - name: Publish to PyPI
+        env:
+          PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+        run: |
+          poetry config pypi-token.pypi "$PYPI_TOKEN"
+          poetry publish
29 changes: 29 additions & 0 deletions .github/workflows/run-tests.yml
@@ -0,0 +1,29 @@
+# This workflow will install the Python package and run the tests
+
+name: Test mlpp-lib
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Setup poetry
+        uses: abatilo/actions-poetry@v2
+      - name: Install the project dependencies
+        run: poetry install
+      - name: Run the tests
+        run: poetry run pytest tests/
3 changes: 3 additions & 0 deletions README.md
@@ -1,5 +1,8 @@
 # mlpp-lib
 
+[![.github/workflows/run-tests.yml](https://github.com/MeteoSwiss/mlpp-lib/actions/workflows/run-tests.yml/badge.svg)](https://github.com/MeteoSwiss/mlpp-lib/actions/workflows/run-tests.yml)
+[![pypi](https://img.shields.io/pypi/v/mlpp-lib.svg?colorB=brightgreen)](https://pypi.python.org/pypi/mlpp-lib/)
+
 Collection of methods for ML-based postprocessing of weather forecasts.
 
 :warning: **The code in this repository is currently work-in-progress and not recommended for production use.** :warning:
64 changes: 40 additions & 24 deletions mlpp_lib/model_selection.py
@@ -1,8 +1,9 @@
 import json
 import logging
 import random
+from copy import deepcopy
 from itertools import combinations
-from typing import Any, Hashable, Mapping, Optional, Sequence, Type
+from typing import Any, Mapping, Optional, Sequence, Type
 
 import numpy as np
 import pandas as pd
@@ -113,29 +114,6 @@ def __init__(
         self.seed = seed
         self.time_dim_name = time_dim_name
 
-    @classmethod
-    def from_json(cls, file: str) -> Self:
-        """Instantiate the DataSplitter from a json file with previously computed splits."""
-
-        with open(file, "r") as f:
-            splits = json.load(f)
-
-        time_split = {k: v["forecast_reference_time"] for k, v in splits.items()}
-        station_split = {k: v["station"] for k, v in splits.items()}
-        splitter = cls(time_split, station_split)
-        splitter._time_defined = True
-        splitter._station_defined = True
-        splitter._time_partitioning()
-        splitter._station_partitioning()
-        return splitter
-
-    def to_json(self, file: str):
-        if not hasattr(self, "partitions"):
-            self._time_partitioning()
-            self._station_partitioning()
-        with open(file, "w") as f:
-            json.dump(self.partitions, f, indent=4)
-
     def get_partition(
         self, *args: xr.Dataset, partition=None, thinning: Optional[Mapping] = None
     ) -> tuple[xr.Dataset, ...]:
@@ -215,6 +193,7 @@ def _time_partitioning(self) -> None:
         # assign indexers
         for partition in self.partition_names:
             idx = self._time_indexers[partition]
+            idx = pd.to_datetime(idx)  # always convert to pandas datetime indices
             idx = slice(*idx) if len(idx) == 2 else idx
             indexer = {self.time_dim_name: idx}
             if not hasattr(self, "partitions"):
@@ -299,6 +278,43 @@ def _check_station(self, station_split: dict, station_split_method: str):
         self.station_split = station_split
         self.station_split_method = station_split_method
 
+    @classmethod
+    def from_dict(cls, splits: dict) -> Self:
+        time_split = {k: v["forecast_reference_time"] for k, v in splits.items()}
+        station_split = {k: v["station"] for k, v in splits.items()}
+        splitter = cls(time_split, station_split)
+        splitter._time_defined = True
+        splitter._station_defined = True
+        splitter._time_partitioning()
+        splitter._station_partitioning()
+        return splitter
+
+    def to_dict(self):
+        if not hasattr(self, "time_index") or not hasattr(self, "station_index"):
+            raise ValueError("DataSplitter has not been applied to any data yet")
+        if not hasattr(self, "partitions"):
+            self._time_partitioning()
+            self._station_partitioning()
+        partitions = deepcopy(self.partitions)
+        for split_key, split_dict in partitions.items():
+            for dim, value in split_dict.items():
+                if isinstance(value, slice):
+                    partitions[split_key][dim] = [str(value.start), str(value.stop)]
+                elif hasattr(value, "tolist"):
+                    partitions[split_key][dim] = value.astype(str).tolist()
+        return partitions
+
+    @classmethod
+    def from_json(cls, in_fn: str) -> Self:
+        with open(in_fn, "r") as f:
+            in_dict = json.load(f)
+        return cls.from_dict(in_dict)
+
+    def save_json(self, out_fn: str) -> None:
+        out_dict = self.to_dict()
+        with open(out_fn, "w") as outfile:
+            json.dump(out_dict, outfile, indent=4)
+

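For reference, a round trip with these new helpers might look as follows. This sketch is not part of the commit: the constructor keywords and split-method names are assumed from the tests further below, and the data and paths are invented for illustration.

    # Illustrative sketch only: persist computed splits, then rebuild the splitter.
    import numpy as np
    import pandas as pd
    import xarray as xr
    from mlpp_lib.model_selection import DataSplitter

    times = pd.date_range("2020-01-01", periods=100)
    stations = [f"ST{i:02d}" for i in range(10)]
    data = xr.Dataset(
        {"var": (("forecast_reference_time", "station"), np.random.randn(100, 10))},
        coords={"forecast_reference_time": times, "station": stations},
    )
    splitter = DataSplitter(
        time_split={"train": 0.6, "val": 0.2, "test": 0.2},
        station_split={"train": 0.6, "val": 0.2, "test": 0.2},
        time_split_method="sequential",  # assumed method name
        station_split_method="random",  # assumed method name
    )
    splitter.get_partition(data, partition="train")  # computes and stores the splits
    splitter.save_json("splits.json")
    restored = DataSplitter.from_json("splits.json")  # same partitions, no recompute
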
 def sequential_split(
     index: xindex,
52 changes: 47 additions & 5 deletions mlpp_lib/models.py
@@ -24,14 +24,28 @@
     TCN_IMPORTED = True
 
 
+class MonteCarloDropout(Dropout):
+    def call(self, inputs):
+        return super().call(inputs, training=True)  # dropout stays active at inference
+
+
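What the new layer enables, sketched outside the commit (argument names are assumed from this file's API; shapes and values are illustrative): with mc_dropout=True and a nonzero dropout rate, the dropout mask is resampled on every forward pass, so repeated calls on the same input give different predictions whose spread can serve as a rough uncertainty estimate.

    # Illustrative sketch only: a small Monte Carlo dropout ensemble.
    import numpy as np
    import tensorflow as tf
    from mlpp_lib import models

    model = models.fully_connected_network(
        input_shape=(4,),
        output_size=1,
        hidden_layers=[8, 8],
        dropout=0.5,
        mc_dropout=True,
    )
    x = tf.random.normal((32, 4))
    preds = np.stack([model(x).numpy() for _ in range(20)])  # 20 stochastic passes
    mean, spread = preds.mean(axis=0), preds.std(axis=0)  # ensemble mean and spread
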
 def _build_fcn_block(
-    inputs, hidden_layers, activations, dropout, skip_connection, idx=0
+    inputs,
+    hidden_layers,
+    activations,
+    dropout,
+    mc_dropout,
+    skip_connection,
+    idx=0,
 ):
     x = inputs
     for i, units in enumerate(hidden_layers):
         x = Dense(units, activation=activations[i], name=f"dense_{idx}_{i}")(x)
         if i < len(dropout) and 0.0 < dropout[i] < 1.0:
-            x = Dropout(dropout[i], name=f"dropout_{idx}_{i}")(x)
+            if mc_dropout:
+                x = MonteCarloDropout(dropout[i], name=f"mc_dropout_{idx}_{i}")(x)
+            else:
+                x = Dropout(dropout[i], name=f"dropout_{idx}_{i}")(x)
 
     if skip_connection:
         x = Dense(inputs.shape[1], name=f"skip_dense_{idx}")(x)
@@ -70,6 +84,7 @@ def fully_connected_network(
     hidden_layers: list,
     activations: Optional[Union[str, list[str]]] = "relu",
     dropout: Optional[Union[float, list[float]]] = None,
+    mc_dropout: bool = False,
     out_bias_init: Optional[Union[str, np.ndarray[Any, float]]] = "zeros",
     probabilistic_layer: Optional[str] = None,
     skip_connection: bool = False,
@@ -93,6 +108,9 @@ def fully_connected_network(
         (Optional) Dropout rate for the optional dropout layers. If a `float` is passed,
         dropout layers with the given rate are created after each Dense layer, except before the output layer.
         Default is None.
+    mc_dropout: bool
+        (Optional) Enable Monte Carlo dropout at inference time. It has no effect during
+        training, and no effect if `dropout=None`. Default is False.
     out_bias_init: str or np.ndarray
         (Optional) Specifies the initialization of the output layer bias. If a string is passed,
         it must be a valid Keras built-in initializer (see https://keras.io/api/layers/initializers/).
@@ -129,7 +147,14 @@
     )
 
     inputs = tf.keras.Input(shape=input_shape)
-    x = _build_fcn_block(inputs, hidden_layers, activations, dropout, skip_connection)
+    x = _build_fcn_block(
+        inputs,
+        hidden_layers,
+        activations,
+        dropout,
+        mc_dropout,
+        skip_connection,
+    )
     outputs = _build_fcn_output(x, output_size, probabilistic_layer, out_bias_init)
     model = Model(inputs=inputs, outputs=outputs)

@@ -142,6 +167,7 @@ def fully_connected_multibranch_network(
     hidden_layers: list,
     activations: Optional[Union[str, list[str]]] = "relu",
     dropout: Optional[Union[float, list[float]]] = None,
+    mc_dropout: bool = False,
     out_bias_init: Optional[Union[str, np.ndarray[Any, float]]] = "zeros",
     probabilistic_layer: Optional[str] = None,
     skip_connection: bool = False,
@@ -165,6 +191,9 @@ def fully_connected_multibranch_network(
         (Optional) Dropout rate for the optional dropout layers. If a `float` is passed,
         dropout layers with the given rate are created after each Dense layer, except before the output layer.
         Default is None.
+    mc_dropout: bool
+        (Optional) Enable Monte Carlo dropout at inference time. It has no effect during
+        training, and no effect if `dropout=None`. Default is False.
     out_bias_init: str or np.ndarray
         (Optional) Specifies the initialization of the output layer bias. If a string is passed,
         it must be a valid Keras built-in initializer (see https://keras.io/api/layers/initializers/).
@@ -211,7 +240,13 @@ def fully_connected_multibranch_network(

     for idx in range(n_branches):
         x = _build_fcn_block(
-            inputs, hidden_layers, activations, dropout, skip_connection, idx
+            inputs,
+            hidden_layers,
+            activations,
+            dropout,
+            mc_dropout,
+            skip_connection,
+            idx,
         )
         all_branch_outputs.append(x)

@@ -230,6 +265,7 @@ def deep_cross_network(
     hidden_layers: list,
     activations: Optional[Union[str, list[str]]] = "relu",
     dropout: Optional[Union[float, list[float]]] = None,
+    mc_dropout: bool = False,
     out_bias_init: Optional[Union[str, np.ndarray[Any, float]]] = "zeros",
     probabilistic_layer: Optional[str] = None,
     skip_connection: bool = False,
@@ -253,6 +289,9 @@ def deep_cross_network(
         (Optional) Dropout rate for the optional dropout layers. If a `float` is passed,
         dropout layers with the given rate are created after each Dense layer, except before the output layer.
         Default is None.
+    mc_dropout: bool
+        (Optional) Enable Monte Carlo dropout at inference time. It has no effect during
+        training, and no effect if `dropout=None`. Default is False.
     out_bias_init: str or np.ndarray
         (Optional) Specifies the initialization of the output layer bias. If a string is passed,
         it must be a valid Keras built-in initializer (see https://keras.io/api/layers/initializers/).
@@ -304,7 +343,10 @@ def deep_cross_network(
         deep = BatchNormalization()(deep)
         deep = Activation(activations[i])(deep)
         if i < len(dropout) and 0.0 < dropout[i] < 1.0:
-            deep = Dropout(dropout[i])(deep)
+            if mc_dropout:
+                deep = MonteCarloDropout(dropout[i])(deep)
+            else:
+                deep = Dropout(dropout[i])(deep)
     # deep = tf.keras.Model(inputs=inputs, outputs=deep, name="deepblock")
 
     # merge
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "mlpp-lib"
-version = "0.2.4"
+version = "0.4.2"
 description = "Collection of methods for ML-based postprocessing of weather forecasts."
 authors = ["Daniele Nerini <[email protected]>"]
-include = ["LICENSE", "README.rst"]
+include = ["LICENSE", "README.md"]
 
 [tool.poetry.dependencies]
 python = ">=3.9, <3.11"
19 changes: 19 additions & 0 deletions tests/test_model_selection.py
@@ -91,6 +91,25 @@ def test_get_partition(self, options, features_dataset, targets_dataset):
             features_dataset, targets_dataset, partition="test"
         )
 
+    @pytest.mark.parametrize(
+        "options", scenarios, ids=ValidDataSplitterOptions.pytest_id
+    )
+    def test_serialization(self, options, features_dataset, targets_dataset, tmp_path):
+        fn = f"{tmp_path}/splitter.json"
+        splitter = ms.DataSplitter(
+            options.time_split,
+            options.station_split,
+            options.time_split_method,
+            options.station_split_method,
+        )
+        splitter.get_partition(features_dataset, targets_dataset, partition="train")
+        splitter.save_json(fn)
+        new_splitter = ms.DataSplitter.from_json(fn)
+        for split_key, split_dict in splitter.partitions.items():
+            for dim, value in split_dict.items():
+                new_value = new_splitter.partitions[split_key][dim]
+                np.testing.assert_array_equal(value, new_value)
+
     # test invalid arguments
     def test_time_split_method_required(self):
         time_split = {"train": 0.6, "val": 0.2, "test": 0.2}
23 changes: 20 additions & 3 deletions tests/test_models.py
@@ -4,6 +4,7 @@
 import pytest
 import tensorflow as tf
 from keras.engine.functional import Functional
+from numpy.testing import assert_array_equal

from mlpp_lib import models

@@ -14,6 +15,7 @@
     hidden_layers=[[8, 8]],
     activations=["relu", ["relu", "elu"]],
     dropout=[None, 0.1, [0.1, 0.0]],
+    mc_dropout=[True, False],
     out_bias_init=["zeros", np.array([0.2]), np.array([0.2, 2.1])],
     probabilistic_layer=[None, "IndependentNormal", "MultivariateNormalTriL"],
     skip_connection=[False, True],
@@ -32,6 +34,21 @@
 ]
 
 
+def _test_prediction(model, scenario_kwargs, dummy_input, output_size):
+    pred = model(dummy_input)
+    assert pred.shape == (32, output_size)
+    pred2 = model(dummy_input)
+    if scenario_kwargs["probabilistic_layer"] is not None:
+        pred = pred.mean()
+        pred2 = pred2.mean()
+
+    if scenario_kwargs["dropout"] is not None and scenario_kwargs["mc_dropout"]:
+        with pytest.raises(AssertionError):
+            assert_array_equal(pred, pred2)
+    else:
+        assert_array_equal(pred, pred2)
+

@pytest.mark.parametrize("scenario_kwargs", FCN_SCENARIOS)
def test_fully_connected_network(scenario_kwargs):

@@ -61,7 +78,7 @@ def test_fully_connected_network(
     )
     assert isinstance(model, Functional)
 
-    assert model(dummy_input).shape == (32, output_size)
+    _test_prediction(model, scenario_kwargs, dummy_input, output_size)


@pytest.mark.parametrize("scenario_kwargs", FCN_SCENARIOS)
@@ -93,7 +110,7 @@ def test_fully_connected_multibranch_network(scenario_kwargs):
     )
     assert isinstance(model, Functional)
 
-    assert model(dummy_input).shape == (32, output_size)
+    _test_prediction(model, scenario_kwargs, dummy_input, output_size)


@pytest.mark.parametrize("scenario_kwargs", DCN_SCENARIOS)
@@ -118,4 +135,4 @@ def test_deep_cross_network(scenario_kwargs):
     model = models.deep_cross_network(input_shape, output_size, **scenario_kwargs)
     assert isinstance(model, Functional)
 
-    assert model(dummy_input).shape == (32, output_size)
+    _test_prediction(model, scenario_kwargs, dummy_input, output_size)