Commit a9eb155

Merge branch 'main' into update-tensorflow

dnerini committed Nov 1, 2023
2 parents d4abd9d + 3399e66
Showing 9 changed files with 192 additions and 40 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/publish-package.yml
@@ -0,0 +1,30 @@
+# This workflow will upload a Python package when a release is created
+
+name: Build and publish library
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"  # this job defines no matrix, so a pinned version is used
+      - name: Setup poetry
+        uses: abatilo/actions-poetry@v2
+      - name: Install the project dependencies
+        run: |
+          poetry install
+      - name: Package library
+        run: poetry build
+      - name: Publish to PyPI
+        env:
+          PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+        run: |
+          poetry config pypi-token.pypi "$PYPI_TOKEN"
+          poetry publish
29 changes: 29 additions & 0 deletions .github/workflows/run-tests.yml
@@ -0,0 +1,29 @@
+# This workflow will install the Python package and run the tests
+
+name: Test mlpp-lib
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Setup poetry
+        uses: abatilo/actions-poetry@v2
+      - name: Install the project dependencies
+        run: poetry install
+      - name: Run the tests
+        run: poetry run pytest tests/
3 changes: 3 additions & 0 deletions README.md
@@ -1,5 +1,8 @@
 # mlpp-lib
 
+[![.github/workflows/run-tests.yml](https://github.com/MeteoSwiss/mlpp-lib/actions/workflows/run-tests.yml/badge.svg)](https://github.com/MeteoSwiss/mlpp-lib/actions/workflows/run-tests.yml)
+[![pypi](https://img.shields.io/pypi/v/mlpp-lib.svg?colorB=brightgreen)](https://pypi.python.org/pypi/mlpp-lib/)
+
 Collection of methods for ML-based postprocessing of weather forecasts.
 
 :warning: **The code in this repository is currently work-in-progress and not recommended for production use.** :warning:
64 changes: 40 additions & 24 deletions mlpp_lib/model_selection.py
@@ -1,8 +1,9 @@
 import json
 import logging
 import random
+from copy import deepcopy
 from itertools import combinations
-from typing import Any, Hashable, Mapping, Optional, Sequence, Type
+from typing import Any, Mapping, Optional, Sequence, Type
 
 import numpy as np
 import pandas as pd
@@ -113,29 +114,6 @@ def __init__(
         self.seed = seed
         self.time_dim_name = time_dim_name
 
-    @classmethod
-    def from_json(cls, file: str) -> Self:
-        """Instantiate the DataSplitter from a json file with previously computed splits."""
-
-        with open(file, "r") as f:
-            splits = json.load(f)
-
-        time_split = {k: v["forecast_reference_time"] for k, v in splits.items()}
-        station_split = {k: v["station"] for k, v in splits.items()}
-        splitter = cls(time_split, station_split)
-        splitter._time_defined = True
-        splitter._station_defined = True
-        splitter._time_partitioning()
-        splitter._station_partitioning()
-        return splitter
-
-    def to_json(self, file: str):
-        if not hasattr(self, "partitions"):
-            self._time_partitioning()
-            self._station_partitioning()
-        with open(file, "w") as f:
-            json.dump(self.partitions, f, indent=4)
-
     def get_partition(
         self, *args: xr.Dataset, partition=None, thinning: Optional[Mapping] = None
     ) -> tuple[xr.Dataset, ...]:
@@ -215,6 +193,7 @@ def _time_partitioning(self) -> None:
         # assign indexers
         for partition in self.partition_names:
             idx = self._time_indexers[partition]
+            idx = pd.to_datetime(idx)  # always convert to pandas datetime indices
             idx = slice(*idx) if len(idx) == 2 else idx
             indexer = {self.time_dim_name: idx}
             if not hasattr(self, "partitions"):
@@ -299,6 +278,43 @@ def _check_station(self, station_split: dict, station_split_method: str):
         self.station_split = station_split
         self.station_split_method = station_split_method
 
+    @classmethod
+    def from_dict(cls, splits: dict) -> Self:
+        time_split = {k: v["forecast_reference_time"] for k, v in splits.items()}
+        station_split = {k: v["station"] for k, v in splits.items()}
+        splitter = cls(time_split, station_split)
+        splitter._time_defined = True
+        splitter._station_defined = True
+        splitter._time_partitioning()
+        splitter._station_partitioning()
+        return splitter
+
+    def to_dict(self):
+        if not hasattr(self, "time_index") or not hasattr(self, "station_index"):
+            raise ValueError("DataSplitter has not been applied to any data yet")
+        if not hasattr(self, "partitions"):
+            self._time_partitioning()
+            self._station_partitioning()
+        partitions = deepcopy(self.partitions)
+        for split_key, split_dict in partitions.items():
+            for dim, value in split_dict.items():
+                if isinstance(value, slice):
+                    partitions[split_key][dim] = [str(value.start), str(value.stop)]
+                elif hasattr(value, "tolist"):
+                    partitions[split_key][dim] = value.astype(str).tolist()
+        return partitions
+
+    @classmethod
+    def from_json(cls, in_fn: str) -> Self:
+        with open(in_fn, "r") as f:
+            in_dict = json.load(f)
+        return cls.from_dict(in_dict)
+
+    def save_json(self, out_fn: str) -> None:
+        out_dict = self.to_dict()
+        with open(out_fn, "w") as outfile:
+            json.dump(out_dict, outfile, indent=4)
+

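For reference, a round trip with these new helpers might look as follows. This sketch is not part of the commit: the constructor keywords and split-method names are assumed from the tests further below, and the data and paths are invented for illustration.

    # Illustrative sketch only: persist computed splits, then rebuild the splitter.
    import numpy as np
    import pandas as pd
    import xarray as xr
    from mlpp_lib.model_selection import DataSplitter

    times = pd.date_range("2020-01-01", periods=100)
    stations = [f"ST{i:02d}" for i in range(10)]
    data = xr.Dataset(
        {"var": (("forecast_reference_time", "station"), np.random.randn(100, 10))},
        coords={"forecast_reference_time": times, "station": stations},
    )
    splitter = DataSplitter(
        time_split={"train": 0.6, "val": 0.2, "test": 0.2},
        station_split={"train": 0.6, "val": 0.2, "test": 0.2},
        time_split_method="sequential",  # assumed method name
        station_split_method="random",  # assumed method name
    )
    splitter.get_partition(data, partition="train")  # computes and stores the splits
    splitter.save_json("splits.json")
    restored = DataSplitter.from_json("splits.json")  # same partitions, no recompute
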
 def sequential_split(
     index: xindex,
52 changes: 47 additions & 5 deletions mlpp_lib/models.py
@@ -24,14 +24,28 @@
     TCN_IMPORTED = True
 
 
+class MonteCarloDropout(Dropout):
+    def call(self, inputs):
+        return super().call(inputs, training=True)  # dropout stays active at inference
+
+
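What the new layer enables, sketched outside the commit (argument names are assumed from this file's API; shapes and values are illustrative): with mc_dropout=True and a nonzero dropout rate, the dropout mask is resampled on every forward pass, so repeated calls on the same input give different predictions whose spread can serve as a rough uncertainty estimate.

    # Illustrative sketch only: a small Monte Carlo dropout ensemble.
    import numpy as np
    import tensorflow as tf
    from mlpp_lib import models

    model = models.fully_connected_network(
        input_shape=(4,),
        output_size=1,
        hidden_layers=[8, 8],
        dropout=0.5,
        mc_dropout=True,
    )
    x = tf.random.normal((32, 4))
    preds = np.stack([model(x).numpy() for _ in range(20)])  # 20 stochastic passes
    mean, spread = preds.mean(axis=0), preds.std(axis=0)  # ensemble mean and spread
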
 def _build_fcn_block(
-    inputs, hidden_layers, activations, dropout, skip_connection, idx=0
+    inputs,
+    hidden_layers,
+    activations,
+    dropout,
+    mc_dropout,
+    skip_connection,
+    idx=0,
 ):
     x = inputs
     for i, units in enumerate(hidden_layers):
         x = Dense(units, activation=activations[i], name=f"dense_{idx}_{i}")(x)
         if i < len(dropout) and 0.0 < dropout[i] < 1.0:
-            x = Dropout(dropout[i], name=f"dropout_{idx}_{i}")(x)
+            if mc_dropout:
+                x = MonteCarloDropout(dropout[i], name=f"mc_dropout_{idx}_{i}")(x)
+            else:
+                x = Dropout(dropout[i], name=f"dropout_{idx}_{i}")(x)
 
     if skip_connection:
         x = Dense(inputs.shape[1], name=f"skip_dense_{idx}")(x)
@@ -70,6 +84,7 @@ def fully_connected_network(
     hidden_layers: list,
     activations: Optional[Union[str, list[str]]] = "relu",
     dropout: Optional[Union[float, list[float]]] = None,
+    mc_dropout: bool = False,
     out_bias_init: Optional[Union[str, np.ndarray[Any, float]]] = "zeros",
     probabilistic_layer: Optional[str] = None,
     skip_connection: bool = False,
@@ -93,6 +108,9 @@ def fully_connected_network(
         (Optional) Dropout rate for the optional dropout layers. If a `float` is passed,
         dropout layers with the given rate are created after each Dense layer, except before the output layer.
         Default is None.
+    mc_dropout: bool
+        (Optional) Enable Monte Carlo dropout at inference time. It has no effect during
+        training, and no effect if `dropout=None`. Default is False.
     out_bias_init: str or np.ndarray
         (Optional) Specifies the initialization of the output layer bias. If a string is passed,
         it must be a valid Keras built-in initializer (see https://keras.io/api/layers/initializers/).
@@ -129,7 +147,14 @@
     )
 
     inputs = tf.keras.Input(shape=input_shape)
-    x = _build_fcn_block(inputs, hidden_layers, activations, dropout, skip_connection)
+    x = _build_fcn_block(
+        inputs,
+        hidden_layers,
+        activations,
+        dropout,
+        mc_dropout,
+        skip_connection,
+    )
     outputs = _build_fcn_output(x, output_size, probabilistic_layer, out_bias_init)
     model = Model(inputs=inputs, outputs=outputs)

@@ -142,6 +167,7 @@ def fully_connected_multibranch_network(
     hidden_layers: list,
     activations: Optional[Union[str, list[str]]] = "relu",
     dropout: Optional[Union[float, list[float]]] = None,
+    mc_dropout: bool = False,
     out_bias_init: Optional[Union[str, np.ndarray[Any, float]]] = "zeros",
     probabilistic_layer: Optional[str] = None,
     skip_connection: bool = False,
@@ -165,6 +191,9 @@ def fully_connected_multibranch_network(
         (Optional) Dropout rate for the optional dropout layers. If a `float` is passed,
         dropout layers with the given rate are created after each Dense layer, except before the output layer.
         Default is None.
+    mc_dropout: bool
+        (Optional) Enable Monte Carlo dropout at inference time. It has no effect during
+        training, and no effect if `dropout=None`. Default is False.
     out_bias_init: str or np.ndarray
         (Optional) Specifies the initialization of the output layer bias. If a string is passed,
         it must be a valid Keras built-in initializer (see https://keras.io/api/layers/initializers/).
@@ -211,7 +240,13 @@ def fully_connected_multibranch_network(

     for idx in range(n_branches):
         x = _build_fcn_block(
-            inputs, hidden_layers, activations, dropout, skip_connection, idx
+            inputs,
+            hidden_layers,
+            activations,
+            dropout,
+            mc_dropout,
+            skip_connection,
+            idx,
         )
         all_branch_outputs.append(x)

@@ -230,6 +265,7 @@ def deep_cross_network(
     hidden_layers: list,
     activations: Optional[Union[str, list[str]]] = "relu",
     dropout: Optional[Union[float, list[float]]] = None,
+    mc_dropout: bool = False,
     out_bias_init: Optional[Union[str, np.ndarray[Any, float]]] = "zeros",
     probabilistic_layer: Optional[str] = None,
     skip_connection: bool = False,
@@ -253,6 +289,9 @@ def deep_cross_network(
         (Optional) Dropout rate for the optional dropout layers. If a `float` is passed,
         dropout layers with the given rate are created after each Dense layer, except before the output layer.
         Default is None.
+    mc_dropout: bool
+        (Optional) Enable Monte Carlo dropout at inference time. It has no effect during
+        training, and no effect if `dropout=None`. Default is False.
     out_bias_init: str or np.ndarray
         (Optional) Specifies the initialization of the output layer bias. If a string is passed,
         it must be a valid Keras built-in initializer (see https://keras.io/api/layers/initializers/).
@@ -304,7 +343,10 @@ def deep_cross_network(
         deep = BatchNormalization()(deep)
         deep = Activation(activations[i])(deep)
         if i < len(dropout) and 0.0 < dropout[i] < 1.0:
-            deep = Dropout(dropout[i])(deep)
+            if mc_dropout:
+                deep = MonteCarloDropout(dropout[i])(deep)
+            else:
+                deep = Dropout(dropout[i])(deep)
     # deep = tf.keras.Model(inputs=inputs, outputs=deep, name="deepblock")
 
     # merge
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "mlpp-lib"
-version = "0.2.4"
+version = "0.4.2"
 description = "Collection of methods for ML-based postprocessing of weather forecasts."
 authors = ["Daniele Nerini <[email protected]>"]
-include = ["LICENSE", "README.rst"]
+include = ["LICENSE", "README.md"]
 
 [tool.poetry.dependencies]
 python = ">=3.9, <3.11"
19 changes: 19 additions & 0 deletions tests/test_model_selection.py
@@ -91,6 +91,25 @@ def test_get_partition(self, options, features_dataset, targets_dataset):
             features_dataset, targets_dataset, partition="test"
         )
 
+    @pytest.mark.parametrize(
+        "options", scenarios, ids=ValidDataSplitterOptions.pytest_id
+    )
+    def test_serialization(self, options, features_dataset, targets_dataset, tmp_path):
+        fn = f"{tmp_path}/splitter.json"
+        splitter = ms.DataSplitter(
+            options.time_split,
+            options.station_split,
+            options.time_split_method,
+            options.station_split_method,
+        )
+        splitter.get_partition(features_dataset, targets_dataset, partition="train")
+        splitter.save_json(fn)
+        new_splitter = ms.DataSplitter.from_json(fn)
+        for split_key, split_dict in splitter.partitions.items():
+            for dim, value in split_dict.items():
+                new_value = new_splitter.partitions[split_key][dim]
+                np.testing.assert_array_equal(value, new_value)
+
     # test invalid arguments
     def test_time_split_method_required(self):
         time_split = {"train": 0.6, "val": 0.2, "test": 0.2}
23 changes: 20 additions & 3 deletions tests/test_models.py
@@ -4,6 +4,7 @@
 import pytest
 import tensorflow as tf
 from keras.engine.functional import Functional
+from numpy.testing import assert_array_equal

from mlpp_lib import models

@@ -14,6 +15,7 @@
     hidden_layers=[[8, 8]],
     activations=["relu", ["relu", "elu"]],
     dropout=[None, 0.1, [0.1, 0.0]],
+    mc_dropout=[True, False],
     out_bias_init=["zeros", np.array([0.2]), np.array([0.2, 2.1])],
     probabilistic_layer=[None, "IndependentNormal", "MultivariateNormalTriL"],
     skip_connection=[False, True],
@@ -32,6 +34,21 @@
 ]
 
 
+def _test_prediction(model, scenario_kwargs, dummy_input, output_size):
+    pred = model(dummy_input)
+    assert pred.shape == (32, output_size)
+    pred2 = model(dummy_input)
+    if scenario_kwargs["probabilistic_layer"] is not None:
+        pred = pred.mean()
+        pred2 = pred2.mean()
+
+    if scenario_kwargs["dropout"] is not None and scenario_kwargs["mc_dropout"]:
+        with pytest.raises(AssertionError):
+            assert_array_equal(pred, pred2)
+    else:
+        assert_array_equal(pred, pred2)
+

@pytest.mark.parametrize("scenario_kwargs", FCN_SCENARIOS)
def test_fully_connected_network(scenario_kwargs):

@@ -61,7 +78,7 @@ def test_fully_connected_network(
     )
     assert isinstance(model, Functional)
 
-    assert model(dummy_input).shape == (32, output_size)
+    _test_prediction(model, scenario_kwargs, dummy_input, output_size)


@pytest.mark.parametrize("scenario_kwargs", FCN_SCENARIOS)
@@ -93,7 +110,7 @@ def test_fully_connected_multibranch_network(scenario_kwargs):
     )
     assert isinstance(model, Functional)
 
-    assert model(dummy_input).shape == (32, output_size)
+    _test_prediction(model, scenario_kwargs, dummy_input, output_size)


@pytest.mark.parametrize("scenario_kwargs", DCN_SCENARIOS)
@@ -118,4 +135,4 @@ def test_deep_cross_network(scenario_kwargs):
     model = models.deep_cross_network(input_shape, output_size, **scenario_kwargs)
     assert isinstance(model, Functional)
 
-    assert model(dummy_input).shape == (32, output_size)
+    _test_prediction(model, scenario_kwargs, dummy_input, output_size)