
Commit 2626f03

jemrobinson and IFenton authored

Linting fixes (#66)
* 🚨 Add list of disabled ruff checks
* 🚨 Fix builtin-variable-shadowing
* 🚨 Fix unnecessary-generator-set
* 🚨 Fix unnecessary-comprehension
* 🚨 Fix commented-out-code
* 🚨 Fix manual-list-comprehension
* 🚨 Fix manual-from-import
* 🚨 Fix missing-type-function-argument
* 🚨 Fix missing-return-type-undocumented-public-function
* 🚨 Fix missing-return-type-special-method
* 🚨 Fix unused-method-argument
* 🚨 Fix call-datetime-without-tzinfo
* 🚨 Fix call-datetime-now-without-tzinfo
* 🚨 Fix call-datetime-strptime-without-zone
* 🚨 Fix f-string-in-exception
* 🚨 Fix boolean-type-hint-positional-argument
* 🚨 Fix logging-f-string
* 🚨 Fix implicit-namespace-package
* 🚨 Fix magic-value-comparison
* 🚨 Fix pytest-raises-too-broad
* 🚨 Fix os-getcwd
* 🚨 Fix unnecessary-assign
* 🚨 Fix mutable-class-default
* 🚨 Fix unsorted-dunder-all
* 🚨 Fix in-dict-keys
* 🚨 Fix typing-only-first-party-import
* 🚨 Fix typing-only-third-party-import
* 🚨 Fix non-pep585-annotation
* 🚨 Fix missing-terminal-punctuation
* 🚨 Fix multi-line-summary-first-line
* 🚨 Fix too-many-arguments
* 🚨 Fix new-line-after-last-paragraph
* 🚨 Fix missing-blank-line-after-last-section
* 🚨 Fix undocumented-public-function
* 🚨 Fix undocumented-magic-method
* 🚨 Fix undocumented-public-init
* 👽 Allow unlocalised timezones in tests to avoid pydata/xarray#8653
* 🏷️ Accept generic callable in hydra_adaptor
* 🏷️ Accept Sequence in CLI output check tests
* 🎨 Tidying the common models __init__.py

---------

Co-authored-by: Isabel Fenton <[email protected]>
1 parent a9d632d commit 2626f03

36 files changed: +295 -174 lines

ice_station_zebra/callbacks/metric_summary_callback.py
Lines changed: 12 additions & 7 deletions

@@ -15,24 +15,25 @@
 class MetricSummaryCallback(Callback):
     """A callback to summarise metrics during evaluation."""
 
-    def __init__(self, average_loss: bool = True) -> None:
+    def __init__(self, *, average_loss: bool = True) -> None:
         """Summarise metrics during evaluation.
 
         Args:
             average_loss: Whether to log average loss
+
         """
         self.metrics: dict[str, list[float]] = {}
         if average_loss:
             self.metrics["average_loss"] = []
 
     def on_test_batch_end(
         self,
-        trainer: Trainer,
-        module: LightningModule,
+        _trainer: Trainer,
+        _module: LightningModule,
         outputs: Tensor | Mapping[str, Any] | None,
-        batch: Any,
-        batch_idx: int,
-        dataloader_idx: int = 0,
+        _batch: Any,  # noqa: ANN401
+        _batch_idx: int,
+        _dataloader_idx: int = 0,
     ) -> None:
         """Called when the test batch ends."""
         if not isinstance(outputs, ModelTestOutput):
@@ -43,7 +44,11 @@ def on_test_batch_end(
         if "average_loss" in self.metrics:
             self.metrics["average_loss"].append(outputs.loss.item())
 
-    def on_test_epoch_end(self, trainer: Trainer, module: LightningModule) -> None:
+    def on_test_epoch_end(
+        self,
+        trainer: Trainer,
+        _module: LightningModule,
+    ) -> None:
         """Called at the end of the test epoch."""
         # Post-process accumulated metrics into a single value
         metrics_: dict[str, float] = {}
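
Editor's note: two ruff patterns recur in this file. FBT002 makes boolean arguments keyword-only so call sites stay readable, and ARG002 marks arguments that exist only to satisfy a hook signature with a leading underscore. A minimal sketch of both (class and method names are hypothetical, not from this repository):

# Hedged sketch of the FBT002 / ARG002 fixes above.
class Summary:
    def __init__(self, *, average_loss: bool = True) -> None:
        # The bare `*` forces callers to write Summary(average_loss=False);
        # the opaque positional Summary(False) becomes a TypeError.
        self.metrics: dict[str, list[float]] = {}
        if average_loss:
            self.metrics["average_loss"] = []

    def on_batch_end(self, loss: float, _batch_idx: int) -> None:
        # _batch_idx is required by the caller's hook signature but unused;
        # the underscore marks that as deliberate for readers and the linter.
        if "average_loss" in self.metrics:
            self.metrics["average_loss"].append(loss)

summary = Summary(average_loss=True)
summary.on_batch_end(0.25, 0)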

ice_station_zebra/callbacks/plotting_callback.py
Lines changed: 10 additions & 6 deletions

@@ -1,30 +1,33 @@
 import logging
 from collections.abc import Mapping, Sequence
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from lightning import LightningModule, Trainer
 from lightning.pytorch import Callback
 from torch import Tensor
-from torch.utils.data import DataLoader
 
 from ice_station_zebra.data_loaders import CombinedDataset
 from ice_station_zebra.types import ModelTestOutput
 from ice_station_zebra.visualisations import plot_sic_comparison
 
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
 logger = logging.getLogger(__name__)
 
 
 class PlottingCallback(Callback):
     """A callback to create plots during evaluation."""
 
     def __init__(
-        self, frequency: int = 10, plot_sea_ice_concentration: bool = True
+        self, *, frequency: int = 10, plot_sea_ice_concentration: bool = True
     ) -> None:
         """Create plots during evaluation.
 
         Args:
             frequency: Create a new plot every `frequency` batches.
             plot_sea_ice_concentration: Whether to plot sea ice concentration.
+
         """
         super().__init__()
         self.frequency = frequency
@@ -35,9 +38,9 @@ def __init__(
     def on_test_batch_end(
         self,
         trainer: Trainer,
-        module: LightningModule,
+        _module: LightningModule,
         outputs: Tensor | Mapping[str, Any] | None,
-        batch: Any,
+        _batch: Any,  # noqa: ANN401
         batch_idx: int,
         dataloader_idx: int = 0,
     ) -> None:
@@ -82,5 +85,6 @@ def on_test_batch_end(
                 lightning_logger.log_image(key=key, images=image_list)
             else:
                 logger.debug(
-                    f"Logger {lightning_logger.name} does not support logging images."
+                    "Logger %s does not support logging images.",
+                    lightning_logger.name,
                )
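
The import move above is ruff's typing-only-third-party-import fix (TC002): DataLoader is referenced only in annotations, so it can be imported solely for the type checker. A standalone sketch of the pattern, assuming `from __future__ import annotations` to defer annotation evaluation (the repository may rely on a different mechanism):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Resolved by mypy/pyright only; torch is never imported at runtime.
    from torch.utils.data import DataLoader

def describe(loader: DataLoader) -> str:
    # With deferred evaluation the annotation stays a string at runtime,
    # so defining this function does not require torch to be installed.
    return f"DataLoader with batch_size={loader.batch_size}"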

ice_station_zebra/callbacks/unconditional_checkpoint.py
Lines changed: 2 additions & 1 deletion

@@ -7,11 +7,12 @@
 class UnconditionalCheckpoint(Callback):
     """A callback to summarise metrics during evaluation."""
 
-    def __init__(self, on_train_end: bool = False) -> None:
+    def __init__(self, *, on_train_end: bool = False) -> None:
         """Save a checkpoint unconditionally.
 
         Args:
             on_train_end: Whether to save a checkpoint at the end of training
+
         """
         super().__init__()
         self.impl = ModelCheckpoint()

ice_station_zebra/cli/hydra.py
Lines changed: 3 additions & 2 deletions

@@ -11,14 +11,15 @@
 RetType = TypeVar("RetType")
 
 
-def hydra_adaptor(function) -> Callable[Param, RetType]:
-    """Replace a function that takes a Hydra config with one that takes string arguments
+def hydra_adaptor(function: Callable) -> Callable[Param, RetType]:
+    """Replace a function that takes a Hydra config with one that takes string arguments.
 
     Args:
         function: Callable(*args, config: DictConfig, **kwargs)
 
     Returns:
         Callable(*args, config_name: str, **kwargs, overrides: list[str])
+
     """
 
     def wrapper(
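
hydra_adaptor now annotates its argument as a generic Callable, and the file already defines Param and RetType, which points at the fully generic decorator shape. A self-contained sketch of that shape using ParamSpec (the `logged` decorator is hypothetical, not the repository's wrapper):

import functools
from collections.abc import Callable
from typing import ParamSpec, TypeVar

Param = ParamSpec("Param")
RetType = TypeVar("RetType")

def logged(function: Callable[Param, RetType]) -> Callable[Param, RetType]:
    """Wrap a function while preserving its exact signature for type checkers."""

    @functools.wraps(function)
    def wrapper(*args: Param.args, **kwargs: Param.kwargs) -> RetType:
        print(f"calling {function.__name__}")
        return function(*args, **kwargs)

    return wrapper

@logged
def add(x: int, y: int) -> int:
    return x + y

assert add(2, 3) == 5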

ice_station_zebra/data_loaders/combined_dataset.py
Lines changed: 16 additions & 10 deletions

@@ -1,5 +1,5 @@
 from collections.abc import Sequence
-from datetime import datetime
+from datetime import UTC, datetime
 
 import numpy as np
 from torch.utils.data import Dataset
@@ -18,7 +18,12 @@ def __init__(
         n_forecast_steps: int = 1,
         n_history_steps: int = 1,
     ) -> None:
-        """Constructor"""
+        """Initialise a combined dataset from a sequence of ZebraDatasets.
+
+        One of the datasets must be the target and all must have the same frequency. The
+        number of forecast and history steps can be set, which will determine the shape
+        of the NTCHW tensors returned by __getitem__.
+        """
         super().__init__()
 
         # Store the number of forecast and history steps
@@ -27,10 +32,10 @@ def __init__(
 
         # Define target and input datasets
         self.target = next(ds for ds in datasets if ds.name == target)
-        self.inputs = [ds for ds in datasets]
+        self.inputs = list(datasets)
 
         # Require that all datasets have the same frequency
-        frequencies = sorted(set(ds.dataset.frequency for ds in datasets))
+        frequencies = sorted({ds.dataset.frequency for ds in datasets})
         if len(frequencies) != 1:
             msg = f"Cannot combine datasets with different frequencies: {frequencies}."
             raise ValueError(msg)
@@ -57,17 +62,18 @@ def __init__(
         ]
 
     def __len__(self) -> int:
-        """Return the total length of the dataset"""
+        """Return the total length of the dataset."""
         return len(self.available_dates)
 
     def __getitem__(self, idx: int) -> dict[str, ArrayTCHW]:
-        """Return the data for a single timestep as a dictionary
+        """Return the data for a single timestep as a dictionary.
 
         Returns:
             A dictionary with dataset names as keys and a numpy array as the value.
             The shape of each array is:
             - input datasets: [n_history_steps, C_input_k, H_input_k, W_input_k]
             - target dataset: [n_forecast_steps, C_target, H_target, W_target]
+
         """
         return {
             ds.name: ds.get_tchw(self.get_history_steps(self.available_dates[idx]))
@@ -79,9 +85,9 @@ def __getitem__(self, idx: int) -> dict[str, ArrayTCHW]:
         }
 
     def date_from_index(self, idx: int) -> datetime:
-        """Return the date of the timestep"""
+        """Return the date of the timestep."""
         np_datetime = self.available_dates[idx]
-        return datetime.strptime(str(np_datetime), r"%Y-%m-%dT%H:%M:%S")
+        return datetime.strptime(str(np_datetime), r"%Y-%m-%dT%H:%M:%S").astimezone(UTC)
 
     def get_forecast_steps(self, start_date: np.datetime64) -> list[np.datetime64]:
         """Return list of consecutive forecast dates for a given start date."""
@@ -99,7 +105,7 @@ def get_history_steps(self, start_date: np.datetime64) -> list[np.datetime64]:
     @property
     def end_date(self) -> np.datetime64:
         """Return the end date of the dataset."""
-        end_date = set(dataset.end_date for dataset in self.inputs)
+        end_date = {dataset.end_date for dataset in self.inputs}
         if len(end_date) != 1:
             msg = f"Datasets have {len(end_date)} different end dates"
             raise ValueError(msg)
@@ -108,7 +114,7 @@ def end_date(self) -> np.datetime64:
     @property
     def start_date(self) -> np.datetime64:
         """Return the start date of the dataset."""
-        start_date = set(dataset.start_date for dataset in self.inputs)
+        start_date = {dataset.start_date for dataset in self.inputs}
         if len(start_date) != 1:
             msg = f"Datasets have {len(start_date)} different start dates"
             raise ValueError(msg)
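
A hedged caveat on the strptime change above (standalone sketch, not repository code): .astimezone(UTC) treats a naive datetime as *local* time and converts it, whereas .replace(tzinfo=UTC) stamps the parsed wall-clock time as UTC. Both satisfy ruff's DTZ007, but they give different instants on machines whose local zone is not UTC:

from datetime import UTC, datetime

naive = datetime.strptime("2020-01-02T03:04:05", "%Y-%m-%dT%H:%M:%S")  # noqa: DTZ007

labelled = naive.replace(tzinfo=UTC)   # wall-clock time declared to be UTC
converted = naive.astimezone(UTC)      # assumed local time, then shifted to UTC

print(labelled.isoformat())   # always 2020-01-02T03:04:05+00:00
print(converted.isoformat())  # differs unless the machine's zone is UTC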

ice_station_zebra/data_loaders/zebra_data_module.py
Lines changed: 18 additions & 12 deletions

@@ -17,6 +17,11 @@
 
 class ZebraDataModule(LightningDataModule):
     def __init__(self, config: DictConfig) -> None:
+        """Initialise a ZebraDataModule from a config.
+
+        The config specifies all datasets used and how to group them. Data splits are
+        also determined from the config, and the appropriate data loaders are created.
+        """
         super().__init__()
 
         # Load paths
@@ -30,14 +35,15 @@ def __init__(self, config: DictConfig) -> None:
                     self.base_path / "data" / "anemoi" / f"{dataset['name']}.zarr"
                 ).resolve()
             )
-        logger.info(f"Found {len(self.dataset_groups)} dataset_groups")
-        for dataset_group in self.dataset_groups.keys():
-            logger.debug(f"... {dataset_group}")
+        logger.info("Found %d dataset_groups.", len(self.dataset_groups))
+        for dataset_group in self.dataset_groups:
+            logger.debug("... %s.", dataset_group)
 
         # Check prediction target
         self.predict_target = config["predict"]["dataset_group"]
         if self.predict_target not in self.dataset_groups:
-            raise ValueError(f"Could not find prediction target {self.predict_target}")
+            msg = f"Could not find prediction target {self.predict_target}."
+            raise ValueError(msg)
 
         # Set periods for train, validation, and test
         self.batch_size = int(config["split"]["batch_size"])
@@ -67,7 +73,7 @@ def __init__(self, config: DictConfig) -> None:
 
     @cached_property
     def input_spaces(self) -> list[DataSpace]:
-        """Return the data space for each input"""
+        """Return the data space for each input."""
         return [
             ZebraDataset(name, paths).space
             for name, paths in self.dataset_groups.items()
@@ -76,7 +82,7 @@ def input_spaces(self) -> list[DataSpace]:
 
     @cached_property
     def output_space(self) -> DataSpace:
-        """Return the data space of the desired output"""
+        """Return the data space of the desired output."""
         return next(
             ZebraDataset(name, paths).space
             for name, paths in self.dataset_groups.items()
@@ -86,7 +92,7 @@ def output_space(self) -> DataSpace:
     def train_dataloader(
         self,
     ) -> DataLoader[dict[str, ArrayTCHW]]:
-        """Construct train dataloader"""
+        """Construct train dataloader."""
         dataset = CombinedDataset(
             [
                 ZebraDataset(
@@ -102,7 +108,7 @@ def train_dataloader(
             target=self.predict_target,
         )
         logger.info(
-            "Loaded training dataset with %d samples between %s and %s",
+            "Loaded training dataset with %d samples between %s and %s.",
             len(dataset),
             dataset.start_date,
             dataset.end_date,
@@ -112,7 +118,7 @@
     def val_dataloader(
         self,
     ) -> DataLoader[dict[str, ArrayTCHW]]:
-        """Construct validation dataloader"""
+        """Construct validation dataloader."""
        dataset = CombinedDataset(
             [
                 ZebraDataset(
@@ -128,7 +134,7 @@ def val_dataloader(
             target=self.predict_target,
         )
         logger.info(
-            "Loaded validation dataset with %d samples between %s and %s",
+            "Loaded validation dataset with %d samples between %s and %s.",
             len(dataset),
             dataset.start_date,
             dataset.end_date,
@@ -138,7 +144,7 @@
     def test_dataloader(
         self,
     ) -> DataLoader[dict[str, ArrayTCHW]]:
-        """Construct test dataloader"""
+        """Construct test dataloader."""
         dataset = CombinedDataset(
             [
                 ZebraDataset(
@@ -154,7 +160,7 @@ def test_dataloader(
             target=self.predict_target,
         )
         logger.info(
-            "Loaded test dataset with %d samples between %s and %s",
+            "Loaded test dataset with %d samples between %s and %s.",
             len(dataset),
             dataset.start_date,
             dataset.end_date,
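
The msg-then-raise shape introduced above is ruff's f-string-in-exception fix (EM102): keeping the f-string out of the raise statement means the traceback line reads `raise ValueError(msg)` rather than echoing the whole expression. A minimal sketch (function name hypothetical):

def require_target(target: str, groups: dict[str, object]) -> None:
    # EM102: build the message first, then raise with the variable.
    if target not in groups:
        msg = f"Could not find prediction target {target}."
        raise ValueError(msg)

require_target("osisaf", {"osisaf": object()})  # no error raised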

ice_station_zebra/data_loaders/zebra_dataset.py
Lines changed: 6 additions & 6 deletions

@@ -19,10 +19,10 @@ def __init__(
         start: str | None = None,
         end: str | None = None,
     ) -> None:
-        """A dataset for use by Zebra
+        """A dataset for use by Zebra.
 
-        Dataset shape is: time; variables; ensembles; position
-        We reshape each time point to: variables; pos_x; pos_y
+        The underlying Anemoi dataset has shape [T; C; ensembles; position].
+        We reshape this to CHW before returning.
         """
         super().__init__()
         self._cache: LRUCache = LRUCache(maxsize=128)
@@ -67,15 +67,15 @@ def start_date(self) -> np.datetime64:
         return self.dataset.start_date
 
     def __len__(self) -> int:
-        """Return the total length of the dataset"""
+        """Return the total length of the dataset."""
         return len(self.dataset)
 
     def __getitem__(self, idx: int) -> ArrayCHW:
-        """Return the data for a single timestep in [C, H, W] format"""
+        """Return the data for a single timestep in [C, H, W] format."""
         return self.dataset[idx].reshape(self.space.chw)
 
     def get_tchw(self, dates: Sequence[np.datetime64]) -> ArrayTCHW:
-        """Return the data for a series of timesteps in [T, C, H, W] format"""
+        """Return the data for a series of timesteps in [T, C, H, W] format."""
         return np.stack(
             [self[self.index_from_date(target_date)] for target_date in dates], axis=0
         )
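
For orientation, get_tchw stacks one CHW array per requested date along a new leading time axis. A minimal numpy sketch with made-up shapes:

import numpy as np

chw_frames = [np.zeros((3, 4, 5)) for _ in range(7)]  # 7 dates, each [C, H, W]
tchw = np.stack(chw_frames, axis=0)                   # -> [T, C, H, W]
assert tchw.shape == (7, 3, 4, 5)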

ice_station_zebra/data_processors/cli.py
Lines changed: 5 additions & 5 deletions

@@ -10,26 +10,26 @@
 # Create the typer app
 datasets_cli = typer.Typer(help="Manage datasets")
 
-log = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
 
 
 @datasets_cli.command("create")
 @hydra_adaptor
 def create(config: DictConfig) -> None:
-    """Create all datasets"""
+    """Create all datasets."""
     factory = ZebraDataProcessorFactory(config)
     for dataset in factory.datasets:
-        log.info(f"Working on {dataset.name}")
+        logger.info("Working on %s.", dataset.name)
         dataset.create()
 
 
 @datasets_cli.command("inspect")
 @hydra_adaptor
 def inspect(config: DictConfig) -> None:
-    """Inspect all datasets"""
+    """Inspect all datasets."""
     factory = ZebraDataProcessorFactory(config)
     for dataset in factory.datasets:
-        log.info(f"Working on {dataset.name}")
+        logger.info("Working on %s.", dataset.name)
         dataset.inspect()
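
The logging change here (and throughout the files above) is ruff's logging-f-string fix (G004): pass format arguments to the logger rather than pre-building an f-string, so interpolation only happens when the record is actually emitted. Standalone sketch:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

name = "era5"  # hypothetical dataset name
logger.info("Working on %s.", name)   # formatted only if INFO is enabled
# logger.info(f"Working on {name}.")  # G004: always builds the string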

ice_station_zebra/data_processors/preprocessors/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 from .icenet_sic import IceNetSICPreprocessor
 
 __all__ = [
-    "IceNetSICPreprocessor",
     "IPreprocessor",
+    "IceNetSICPreprocessor",
     "NullPreprocessor",
 ]
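
The reordering above is the unsorted-dunder-all fix; the sort is case-sensitive, so the ASCII uppercase "P" in "IPreprocessor" sorts before the lowercase "c" in "IceNetSICPreprocessor". A one-line check:

names = ["IceNetSICPreprocessor", "IPreprocessor", "NullPreprocessor"]
print(sorted(names))  # ['IPreprocessor', 'IceNetSICPreprocessor', 'NullPreprocessor']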
