Merge pull request #433 from Pale-Blue-Dot-97/426-nongeo-dev

Pale-Blue-Dot-97 · Feb 6, 2024 · 0a68baa · 0a68baa
2 parents 4f7c73b + 3d2d33e
commit 0a68baa
Show file tree

Hide file tree

Showing 49 changed files with 939 additions and 136 deletions.
diff --git a/README.md b/README.md
@@ -126,8 +126,9 @@ University of Southampton. Funded by the Ordnance Survey Ltd.
 Contributions also provided by:
 
 - [Jo Walsh](https://github.com/metazool)
-- [Navid Rahimi](https://github.com/NavidCOMSC)
+- [Jonathon Hare](https://github.com/jonhare)
 - [Isabel Sargent](https://github.com/PenguinJunk)
+- [Navid Rahimi](https://github.com/NavidCOMSC)
 - [Steve Coupland](https://github.com/scoupland-os)
 - [Joe Guyatt](https://github.com/joeguyatt97)
 - [Ben Dickens](https://github.com/BenDickens)

diff --git a/docs/docs_readme.md b/docs/docs_readme.md
@@ -126,8 +126,9 @@ University of Southampton. Funded by the Ordnance Survey Ltd.
 Contributions also provided by:
 
 - [Jo Walsh](https://github.com/metazool)
-- [Navid Rahimi](https://github.com/NavidCOMSC)
+- [Jonathon Hare](https://github.com/jonhare)
 - [Isabel Sargent](https://github.com/PenguinJunk)
+- [Navid Rahimi](https://github.com/NavidCOMSC)
 - [Steve Coupland](https://github.com/scoupland-os)
 - [Joe Guyatt](https://github.com/joeguyatt97)
 - [Ben Dickens](https://github.com/BenDickens)

diff --git a/minerva/datasets/__init__.py b/minerva/datasets/__init__.py
@@ -28,17 +28,23 @@
 # =====================================================================================================================
 #                                                    METADATA
 # =====================================================================================================================
-__author__ = "Harry Baker"
+__author__ = ["Harry Baker", "Jonathon Hare"]
 __contact__ = "[email protected]"
 __license__ = "MIT License"
 __copyright__ = "Copyright (C) 2024 Harry Baker"
 __all__ = [
-    "PairedDataset",
+    "MinervaNonGeoDataset",
+    "MinervaConcatDataset",
+    "PairedGeoDataset",
+    "PairedNonGeoDataset",
     "PairedUnionDataset",
-    "SSL4EOS12Sentinel2",
+    "PairedConcatDataset",
+    "GeoSSL4EOS12Sentinel2",
+    "NonGeoSSL4EOS12Sentinel2",
     "NAIPChesapeakeCVPR",
     "DFC2020",
     "SEN12MS",
+    "MultiSpectralDataset",
     "construct_dataloader",
     "get_collator",
     "get_manifest",
@@ -64,10 +70,18 @@
     make_loaders,
     make_manifest,
 )
+from .multispectral import MultiSpectralDataset
 from .naip import NAIPChesapeakeCVPR
-from .paired import PairedDataset, PairedUnionDataset
-from .ssl4eos12 import SSL4EOS12Sentinel2
+from .paired import (
+    PairedConcatDataset,
+    PairedGeoDataset,
+    PairedNonGeoDataset,
+    PairedUnionDataset,
+)
+from .ssl4eos12 import GeoSSL4EOS12Sentinel2, NonGeoSSL4EOS12Sentinel2
 from .utils import (
+    MinervaConcatDataset,
+    MinervaNonGeoDataset,
     get_random_sample,
     intersect_datasets,
     load_all_samples,

diff --git a/minerva/datasets/factory.py b/minerva/datasets/factory.py
@@ -52,6 +52,7 @@
 import platform
 import re
 from copy import deepcopy
+from inspect import signature
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union
 
@@ -62,17 +63,19 @@
 from catalyst.data.sampler import DistributedSamplerWrapper
 from pandas import DataFrame
 from rasterio.crs import CRS
-from torch.utils.data import DataLoader
-from torchgeo.datasets import GeoDataset, RasterDataset
+from torch.utils.data import DataLoader, Sampler
+from torchgeo.datasets import GeoDataset, NonGeoDataset, RasterDataset
 from torchgeo.samplers import BatchGeoSampler, GeoSampler
 
 from minerva.transforms import init_auto_norm, make_transformations
 from minerva.utils import AUX_CONFIGS, CONFIG, universal_path, utils
 
 from .collators import get_collator, stack_sample_pairs
-from .paired import PairedDataset
+from .paired import PairedGeoDataset, PairedNonGeoDataset
 from .utils import (
+    MinervaConcatDataset,
     cache_dataset,
+    concatenate_datasets,
     intersect_datasets,
     load_all_samples,
     load_dataset_from_cache,
@@ -93,12 +96,12 @@
 #                                                     METHODS
 # =====================================================================================================================
 def create_subdataset(
-    dataset_class: Callable[..., GeoDataset],
+    dataset_class: Union[Callable[..., GeoDataset], Callable[..., NonGeoDataset]],
     paths: Union[str, Iterable[str]],
     subdataset_params: Dict[Literal["params"], Dict[str, Any]],
     transformations: Optional[Any],
     sample_pairs: bool = False,
-) -> GeoDataset:
+) -> Union[GeoDataset, NonGeoDataset]:
     """Creates a sub-dataset based on the parameters supplied.
 
     Args:
@@ -117,12 +120,25 @@ def create_subdataset(
         copy_params["params"]["crs"] = CRS.from_epsg(copy_params["params"]["crs"])
 
     if sample_pairs:
-        return PairedDataset(
-            dataset_class,
-            paths=paths,
-            transforms=transformations,
-            **copy_params["params"],
-        )
+        if "paths" in signature(dataset_class).parameters:
+            return PairedGeoDataset(
+                dataset_class,  # type: ignore[arg-type]
+                paths=paths,
+                transforms=transformations,
+                **copy_params["params"],
+            )
+        elif "root" in signature(dataset_class).parameters:
+            if isinstance(paths, list):
+                paths = paths[0]
+            assert isinstance(paths, str)
+            return PairedNonGeoDataset(
+                dataset_class,  # type: ignore[arg-type]
+                root=paths,
+                transforms=transformations,
+                **copy_params["params"],
+            )
+        else:
+            raise TypeError
     else:
         return dataset_class(
             paths=paths,
@@ -138,7 +154,7 @@ def get_subdataset(
     transformations: Optional[Any],
     sample_pairs: bool = False,
     cache: bool = True,
-) -> GeoDataset:
+) -> Union[GeoDataset, NonGeoDataset]:
     """Get a subdataset based on the parameters specified.
 
     If ``cache==True``, this will attempt to load a cached version of the dataset instance.
@@ -154,7 +170,7 @@ def get_subdataset(
         cache (bool): Cache the dataset or load from cache if pre-existing. Defaults to True.
 
     Returns:
-        ~torchgeo.datasets.GeoDataset: Subdataset requested.
+        ~torchgeo.datasets.GeoDataset | ~torchgeo.datasets.NonGeoDataset: Subdataset requested.
     """
     # Get the params for this sub-dataset.
     sub_dataset_params = dataset_params[key]
@@ -169,7 +185,7 @@ def get_subdataset(
         universal_path(data_directory), sub_dataset_params["paths"]
     )
 
-    sub_dataset: GeoDataset
+    sub_dataset: Union[GeoDataset, NonGeoDataset]
 
     if cache or sub_dataset_params.get("cache_dataset"):
         this_hash = utils.make_hash(sub_dataset_params)
@@ -223,15 +239,17 @@ def make_dataset(
     """
     # --+ MAKE SUB-DATASETS +=========================================================================================+
     # List to hold all the sub-datasets defined by dataset_params to be intersected together into a single dataset.
-    sub_datasets: List[GeoDataset] = []
+    sub_datasets: Union[
+        List[GeoDataset], List[Union[NonGeoDataset, MinervaConcatDataset]]
+    ] = []
 
     # Iterate through all the sub-datasets defined in `dataset_params`.
     for type_key in dataset_params.keys():
         if type_key == "sampler":
             continue
         type_dataset_params = dataset_params[type_key]
 
-        type_subdatasets = []
+        type_subdatasets: Union[List[GeoDataset], List[NonGeoDataset]] = []
 
         multi_datasets_exist = False
 
@@ -289,11 +307,14 @@ def make_dataset(
                     # Reset back to None.
                     auto_norm = None
 
-                type_subdatasets.append(sub_dataset)
+                type_subdatasets.append(sub_dataset)  # type: ignore[arg-type]
 
         # Unionise all the sub-datasets of this modality together.
         if multi_datasets_exist:
-            sub_datasets.append(unionise_datasets(type_subdatasets, master_transforms))
+            if isinstance(type_subdatasets[0], GeoDataset):
+                sub_datasets.append(unionise_datasets(type_subdatasets, master_transforms))  # type: ignore[arg-type]
+            else:
+                sub_datasets.append(concatenate_datasets(type_subdatasets, master_transforms))  # type: ignore[arg-type]
 
         # Add the subdataset of this modality to the list.
         else:
@@ -319,13 +340,13 @@ def make_dataset(
                         f"AutoNorm only supports normalisation of data from RasterDatasets, not {type(sub_dataset)}!"
                     )
 
-            sub_datasets.append(sub_dataset)
+            sub_datasets.append(sub_dataset)  # type: ignore[arg-type]
 
     # Intersect sub-datasets of differing modalities together to form single dataset
     # if more than one sub-dataset exists. Else, just set that to dataset.
     dataset = sub_datasets[0]
-    if len(sub_datasets) > 1:
-        dataset = intersect_datasets(sub_datasets)
+    if len(sub_datasets) > 1 and all(isinstance(x, GeoDataset) for x in sub_datasets):
+        dataset = intersect_datasets(sub_datasets)  # type: ignore[arg-type]
 
     return dataset, sub_datasets
 
@@ -390,11 +411,15 @@ def construct_dataloader(
                 "batch_size"
             ] = per_device_batch_size  # pragma: no cover
 
-    sampler: Union[BatchGeoSampler, GeoSampler, DistributedSamplerWrapper] = _sampler(
-        dataset=subdatasets[0],
-        roi=make_bounding_box(sampler_params["roi"]),
-        **sampler_params["params"],
-    )
+    sampler: Sampler[Any]
+    if "roi" in signature(_sampler).parameters:
+        sampler = _sampler(
+            subdatasets[0],
+            roi=make_bounding_box(sampler_params["roi"]),
+            **sampler_params["params"],
+        )
+    else:
+        sampler = _sampler(subdatasets[0], **sampler_params["params"])
 
     # --+ MAKE DATALOADERS +==========================================================================================+
     collator = get_collator(collator_params)
@@ -542,9 +567,6 @@ def make_loaders(
 
         sampler_params: Dict[str, Any] = dataset_params["sampler"]
 
-        # Calculates number of batches.
-        n_batches = int(sampler_params["params"]["length"] / batch_size)
-
         # --+ MAKE DATASETS +=========================================================================================+
         print(f"CREATING {task_name} DATASET")
         loaders = construct_dataloader(
@@ -561,6 +583,16 @@ def make_loaders(
         )
         print("DONE")
 
+        # Calculates number of batches.
+        assert hasattr(loaders.dataset, "__len__")
+        n_batches = int(
+            sampler_params["params"].get(
+                "length",
+                sampler_params["params"].get("num_samples", len(loaders.dataset)),
+            )
+            / batch_size
+        )
+
     else:
         # Inits dicts to hold the variables and lists for train, validation and test.
         n_batches = {}

diff --git a/minerva/datasets/multispectral.py b/minerva/datasets/multispectral.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# MIT License
+
+# Copyright (c) 2024 Harry Baker
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# @org: University of Southampton
+# Created under a project funded by the Ordnance Survey Ltd.
+r"""Adaptation of :class:`~torchvision.datasets.VisionDataset` for use with :class:`~torchgeo.datasets.NonGeoDataset`.
+"""
+# =====================================================================================================================
+#                                                    METADATA
+# =====================================================================================================================
+__author__ = ["Jonathon Hare", "Harry Baker"]
+__contact__ = "[email protected]"
+__license__ = "MIT License"
+__copyright__ = "Copyright (C) 2024 Harry Baker"
+__all__ = ["MultiSpectralDataset"]
+
+# =====================================================================================================================
+#                                                     IMPORTS
+# =====================================================================================================================
+import os
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional
+
+import numpy as np
+import tifffile
+import torch
+from torchvision.datasets import VisionDataset
+from torchvision.transforms.functional import resize
+
+from .utils import MinervaNonGeoDataset
+
+
+# =====================================================================================================================
+#                                                     CLASSES
+# =====================================================================================================================
+class MultiSpectralDataset(VisionDataset, MinervaNonGeoDataset):
+    """Generic dataset class for multi-spectral images that works within :mod:`torchgeo`.
+
+    A sample is a directory somewhere under ``root`` that holds one ``<band>.tif`` file per band.
+    The class-level band lists are empty by default, so either a sub-class overrides
+    :attr:`all_bands` or ``bands`` has to be supplied explicitly.
+
+    Attributes:
+        all_bands (list[str]): Names of the bands used when ``bands`` is not given.
+        rgb_bands (list[str]): Names of the RGB bands. Not used by this class itself.
+        loader (~typing.Callable): :func:`tifffile.imread` with ``key=0`` bound.
+        bands (list[str]): Names of the bands loaded for each sample, in channel order.
+        samples (list[str]): Sorted paths of the sample directories found under ``root``.
+
+    Args:
+        root (str): Root directory that is searched recursively for samples.
+        transforms (~typing.Callable): Optional; applied to the stacked band tensor of each sample.
+        bands (list[str]): Optional; bands to load. Defaults to :attr:`all_bands`.
+
+    Raises:
+        ValueError: If no bands are available, i.e. ``bands`` (or :attr:`all_bands`) is empty.
+    """
+
+    all_bands: List[str] = []
+    rgb_bands: List[str] = []
+
+    def __init__(
+        self,
+        root: str,
+        transforms: Optional[Callable[..., Any]] = None,
+        bands: Optional[List[str]] = None,
+    ) -> None:
+        # ``transforms`` is passed on as ``transform`` because ``__getitem__`` reads ``self.transform``.
+        super().__init__(root, transform=transforms, target_transform=None)
+
+        self.loader = partial(tifffile.imread, key=0)
+
+        # Copy so that the instance never aliases (and so cannot mutate) the class-level ``all_bands``.
+        self.bands = list(self.all_bands if bands is None else bands)
+        self.samples = self.make_dataset()
+
+    def make_dataset(self) -> List[str]:
+        """Finds every sample directory under :attr:`root`.
+
+        A directory counts as a sample if it directly contains the ``.tif`` file of the *first*
+        band in :attr:`bands`. The presence of the remaining bands is not checked here.
+
+        Returns:
+            list[str]: Sorted paths of the sample directories.
+
+        Raises:
+            ValueError: If :attr:`bands` is empty, as there is then no file name to search for.
+        """
+        if not self.bands:
+            raise ValueError(
+                "No bands specified: pass ``bands`` or define ``all_bands`` on the dataset class"
+            )
+
+        marker = f"{self.bands[0]}.tif"
+        directory = os.path.expanduser(self.root)
+
+        # The final sort fixes the sample order, so the walk itself does not need to be ordered.
+        return sorted(
+            {
+                dirpath
+                for dirpath, _, fnames in os.walk(directory, followlinks=True)
+                if marker in fnames
+            }
+        )
+
+    def __getitem__(self, index: int) -> Dict[str, Any]:
+        """Loads every band of the sample at ``index`` and stacks them into a single tensor.
+
+        All bands are resized to the largest height and width found amongst the bands of the
+        sample so that they can be concatenated along the first (channel) dimension.
+
+        Args:
+            index (int): Index into :attr:`samples`.
+
+        Returns:
+            dict[str, ~typing.Any]: The stacked (and, if set, transformed) bands under the ``"image"`` key.
+        """
+        path = self.samples[index]
+
+        # Each band is read as ``float32`` and given a leading channel dimension.
+        # NOTE(review): assumes every band file decodes to a 2D (height, width) array -- confirm.
+        images = [
+            torch.from_numpy(
+                self.loader(os.path.join(path, f"{b}.tif")).astype(np.float32)
+            ).unsqueeze(0)
+            for b in self.bands
+        ]
+
+        # Target size is the largest height/width of any band (dims 1 and 2 after ``unsqueeze``).
+        h = max((img.shape[1] for img in images), default=0)
+        w = max((img.shape[2] for img in images), default=0)
+
+        bands = torch.cat(
+            [resize(img, [h, w], antialias=True) for img in images], dim=0
+        )
+
+        if self.transform is not None:
+            bands = self.transform(bands)
+
+        return {"image": bands}
+
+    def __len__(self) -> int:
+        """Returns the number of sample directories found by :meth:`make_dataset`."""
+        return len(self.samples)