Skip to content

Commit

Permalink
Merge pull request #433 from Pale-Blue-Dot-97/426-nongeo-dev
Browse files Browse the repository at this point in the history
  • Loading branch information
Pale-Blue-Dot-97 committed Feb 6, 2024
2 parents 4f7c73b + 3d2d33e commit 0a68baa
Show file tree
Hide file tree
Showing 49 changed files with 939 additions and 136 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ University of Southampton. Funded by the Ordnance Survey Ltd.
Contributions also provided by:

- [Jo Walsh](https://github.com/metazool)
- [Navid Rahimi](https://github.com/NavidCOMSC)
- [Jonathon Hare](https://github.com/jonhare)
- [Isabel Sargent](https://github.com/PenguinJunk)
- [Navid Rahimi](https://github.com/NavidCOMSC)
- [Steve Coupland](https://github.com/scoupland-os)
- [Joe Guyatt](https://github.com/joeguyatt97)
- [Ben Dickens](https://github.com/BenDickens)
Expand Down
3 changes: 2 additions & 1 deletion docs/docs_readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ University of Southampton. Funded by the Ordnance Survey Ltd.
Contributions also provided by:

- [Jo Walsh](https://github.com/metazool)
- [Navid Rahimi](https://github.com/NavidCOMSC)
- [Jonathon Hare](https://github.com/jonhare)
- [Isabel Sargent](https://github.com/PenguinJunk)
- [Navid Rahimi](https://github.com/NavidCOMSC)
- [Steve Coupland](https://github.com/scoupland-os)
- [Joe Guyatt](https://github.com/joeguyatt97)
- [Ben Dickens](https://github.com/BenDickens)
Expand Down
24 changes: 19 additions & 5 deletions minerva/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,23 @@
# =====================================================================================================================
# METADATA
# =====================================================================================================================
__author__ = "Harry Baker"
__author__ = ["Harry Baker", "Jonathon Hare"]
__contact__ = "[email protected]"
__license__ = "MIT License"
__copyright__ = "Copyright (C) 2024 Harry Baker"
__all__ = [
"PairedDataset",
"MinervaNonGeoDataset",
"MinervaConcatDataset",
"PairedGeoDataset",
"PairedNonGeoDataset",
"PairedUnionDataset",
"SSL4EOS12Sentinel2",
"PairedConcatDataset",
"GeoSSL4EOS12Sentinel2",
"NonGeoSSL4EOS12Sentinel2",
"NAIPChesapeakeCVPR",
"DFC2020",
"SEN12MS",
"MultiSpectralDataset",
"construct_dataloader",
"get_collator",
"get_manifest",
Expand All @@ -64,10 +70,18 @@
make_loaders,
make_manifest,
)
from .multispectral import MultiSpectralDataset
from .naip import NAIPChesapeakeCVPR
from .paired import PairedDataset, PairedUnionDataset
from .ssl4eos12 import SSL4EOS12Sentinel2
from .paired import (
PairedConcatDataset,
PairedGeoDataset,
PairedNonGeoDataset,
PairedUnionDataset,
)
from .ssl4eos12 import GeoSSL4EOS12Sentinel2, NonGeoSSL4EOS12Sentinel2
from .utils import (
MinervaConcatDataset,
MinervaNonGeoDataset,
get_random_sample,
intersect_datasets,
load_all_samples,
Expand Down
90 changes: 61 additions & 29 deletions minerva/datasets/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import platform
import re
from copy import deepcopy
from inspect import signature
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union

Expand All @@ -62,17 +63,19 @@
from catalyst.data.sampler import DistributedSamplerWrapper
from pandas import DataFrame
from rasterio.crs import CRS
from torch.utils.data import DataLoader
from torchgeo.datasets import GeoDataset, RasterDataset
from torch.utils.data import DataLoader, Sampler
from torchgeo.datasets import GeoDataset, NonGeoDataset, RasterDataset
from torchgeo.samplers import BatchGeoSampler, GeoSampler

from minerva.transforms import init_auto_norm, make_transformations
from minerva.utils import AUX_CONFIGS, CONFIG, universal_path, utils

from .collators import get_collator, stack_sample_pairs
from .paired import PairedDataset
from .paired import PairedGeoDataset, PairedNonGeoDataset
from .utils import (
MinervaConcatDataset,
cache_dataset,
concatenate_datasets,
intersect_datasets,
load_all_samples,
load_dataset_from_cache,
Expand All @@ -93,12 +96,12 @@
# METHODS
# =====================================================================================================================
def create_subdataset(
dataset_class: Callable[..., GeoDataset],
dataset_class: Union[Callable[..., GeoDataset], Callable[..., NonGeoDataset]],
paths: Union[str, Iterable[str]],
subdataset_params: Dict[Literal["params"], Dict[str, Any]],
transformations: Optional[Any],
sample_pairs: bool = False,
) -> GeoDataset:
) -> Union[GeoDataset, NonGeoDataset]:
"""Creates a sub-dataset based on the parameters supplied.
Args:
Expand All @@ -117,12 +120,25 @@ def create_subdataset(
copy_params["params"]["crs"] = CRS.from_epsg(copy_params["params"]["crs"])

if sample_pairs:
return PairedDataset(
dataset_class,
paths=paths,
transforms=transformations,
**copy_params["params"],
)
if "paths" in signature(dataset_class).parameters:
return PairedGeoDataset(
dataset_class, # type: ignore[arg-type]
paths=paths,
transforms=transformations,
**copy_params["params"],
)
elif "root" in signature(dataset_class).parameters:
if isinstance(paths, list):
paths = paths[0]
assert isinstance(paths, str)
return PairedNonGeoDataset(
dataset_class, # type: ignore[arg-type]
root=paths,
transforms=transformations,
**copy_params["params"],
)
else:
raise TypeError
else:
return dataset_class(
paths=paths,
Expand All @@ -138,7 +154,7 @@ def get_subdataset(
transformations: Optional[Any],
sample_pairs: bool = False,
cache: bool = True,
) -> GeoDataset:
) -> Union[GeoDataset, NonGeoDataset]:
"""Get a subdataset based on the parameters specified.
If ``cache==True``, this will attempt to load a cached version of the dataset instance.
Expand All @@ -154,7 +170,7 @@ def get_subdataset(
cache (bool): Cache the dataset or load from cache if pre-existing. Defaults to True.
Returns:
~torchgeo.datasets.GeoDataset: Subdataset requested.
~torchgeo.datasets.GeoDataset | ~torchgeo.datasets.NonGeoDataset: Subdataset requested.
"""
# Get the params for this sub-dataset.
sub_dataset_params = dataset_params[key]
Expand All @@ -169,7 +185,7 @@ def get_subdataset(
universal_path(data_directory), sub_dataset_params["paths"]
)

sub_dataset: GeoDataset
sub_dataset: Union[GeoDataset, NonGeoDataset]

if cache or sub_dataset_params.get("cache_dataset"):
this_hash = utils.make_hash(sub_dataset_params)
Expand Down Expand Up @@ -223,15 +239,17 @@ def make_dataset(
"""
# --+ MAKE SUB-DATASETS +=========================================================================================+
# List to hold all the sub-datasets defined by dataset_params to be intersected together into a single dataset.
sub_datasets: List[GeoDataset] = []
sub_datasets: Union[
List[GeoDataset], List[Union[NonGeoDataset, MinervaConcatDataset]]
] = []

# Iterate through all the sub-datasets defined in `dataset_params`.
for type_key in dataset_params.keys():
if type_key == "sampler":
continue
type_dataset_params = dataset_params[type_key]

type_subdatasets = []
type_subdatasets: Union[List[GeoDataset], List[NonGeoDataset]] = []

multi_datasets_exist = False

Expand Down Expand Up @@ -289,11 +307,14 @@ def make_dataset(
# Reset back to None.
auto_norm = None

type_subdatasets.append(sub_dataset)
type_subdatasets.append(sub_dataset) # type: ignore[arg-type]

# Unionise (geo-spatial) or concatenate (non-geo) all the sub-datasets of this modality together.
if multi_datasets_exist:
sub_datasets.append(unionise_datasets(type_subdatasets, master_transforms))
if isinstance(type_subdatasets[0], GeoDataset):
sub_datasets.append(unionise_datasets(type_subdatasets, master_transforms)) # type: ignore[arg-type]
else:
sub_datasets.append(concatenate_datasets(type_subdatasets, master_transforms)) # type: ignore[arg-type]

# Add the subdataset of this modality to the list.
else:
Expand All @@ -319,13 +340,13 @@ def make_dataset(
f"AutoNorm only supports normalisation of data from RasterDatasets, not {type(sub_dataset)}!"
)

sub_datasets.append(sub_dataset)
sub_datasets.append(sub_dataset) # type: ignore[arg-type]

# Intersect sub-datasets of differing modalities together to form single dataset
# if more than one sub-dataset exists. Else, just set that to dataset.
dataset = sub_datasets[0]
if len(sub_datasets) > 1:
dataset = intersect_datasets(sub_datasets)
if len(sub_datasets) > 1 and all(isinstance(x, GeoDataset) for x in sub_datasets):
dataset = intersect_datasets(sub_datasets) # type: ignore[arg-type]

return dataset, sub_datasets

Expand Down Expand Up @@ -390,11 +411,15 @@ def construct_dataloader(
"batch_size"
] = per_device_batch_size # pragma: no cover

sampler: Union[BatchGeoSampler, GeoSampler, DistributedSamplerWrapper] = _sampler(
dataset=subdatasets[0],
roi=make_bounding_box(sampler_params["roi"]),
**sampler_params["params"],
)
sampler: Sampler[Any]
if "roi" in signature(_sampler).parameters:
sampler = _sampler(
subdatasets[0],
roi=make_bounding_box(sampler_params["roi"]),
**sampler_params["params"],
)
else:
sampler = _sampler(subdatasets[0], **sampler_params["params"])

# --+ MAKE DATALOADERS +==========================================================================================+
collator = get_collator(collator_params)
Expand Down Expand Up @@ -542,9 +567,6 @@ def make_loaders(

sampler_params: Dict[str, Any] = dataset_params["sampler"]

# Calculates number of batches.
n_batches = int(sampler_params["params"]["length"] / batch_size)

# --+ MAKE DATASETS +=========================================================================================+
print(f"CREATING {task_name} DATASET")
loaders = construct_dataloader(
Expand All @@ -561,6 +583,16 @@ def make_loaders(
)
print("DONE")

# Calculates number of batches.
assert hasattr(loaders.dataset, "__len__")
n_batches = int(
sampler_params["params"].get(
"length",
sampler_params["params"].get("num_samples", len(loaders.dataset)),
)
/ batch_size
)

else:
# Inits dicts to hold the variables and lists for train, validation and test.
n_batches = {}
Expand Down
111 changes: 111 additions & 0 deletions minerva/datasets/multispectral.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
# MIT License

# Copyright (c) 2024 Harry Baker

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# @org: University of Southampton
# Created under a project funded by the Ordnance Survey Ltd.
r"""Adaption of :class:`~torchvision.datasets.VisionDataset` for use with :class:`~torchgeo.datasets.NonGeoDataset`.
"""
# =====================================================================================================================
# METADATA
# =====================================================================================================================
__author__ = ["Jonathon Hare", "Harry Baker"]
__contact__ = "[email protected]"
__license__ = "MIT License"
__copyright__ = "Copyright (C) 2024 Harry Baker"
__all__ = ["MultiSpectralDataset"]

# =====================================================================================================================
# IMPORTS
# =====================================================================================================================
import os
from functools import partial
from typing import Any, Callable, Dict, List, Optional

import numpy as np
import tifffile
import torch
from torchvision.datasets import VisionDataset
from torchvision.transforms.functional import resize

from .utils import MinervaNonGeoDataset


# =====================================================================================================================
# CLASSES
# =====================================================================================================================
class MultiSpectralDataset(VisionDataset, MinervaNonGeoDataset):
    """Generic dataset class for multi-spectral images that works within :mod:`torchgeo`.

    A sample is a directory somewhere below ``root`` holding one ``<band>.tif`` file per band.
    A directory is registered as a sample if it contains the file of the *first* requested band;
    the files of the remaining bands are only opened when the sample is actually loaded.

    Attributes:
        all_bands (list[str]): Band names used when ``bands`` is not supplied. Empty on this generic class,
            so it is presumably meant to be overridden by subclasses.
        rgb_bands (list[str]): Names of the RGB bands. Not read by this class itself.
        loader (~typing.Callable): :func:`tifffile.imread` pre-bound with ``key=0``.
        bands (list[str]): Names of the bands loaded for each sample, in stacking order.
        samples (list[str]): Sorted paths of the sample directories found by :meth:`make_dataset`.

    Args:
        root (str): Root directory searched (recursively) for samples.
        transforms (~typing.Callable[..., ~typing.Any]): Optional callable applied to the stacked
            band tensor of each sample.
        bands (list[str]): Optional names of the bands to load. Defaults to :attr:`all_bands`.

    Raises:
        ValueError: If no bands are specified (``bands`` is empty, or ``None`` with an empty :attr:`all_bands`).
    """

    all_bands: List[str] = []
    rgb_bands: List[str] = []

    def __init__(
        self,
        root: str,
        transforms: Optional[Callable[..., Any]] = None,
        bands: Optional[List[str]] = None,
    ) -> None:
        # ``transforms`` is deliberately forwarded as VisionDataset's image-only ``transform``.
        super().__init__(root, transform=transforms, target_transform=None)

        self.loader = partial(tifffile.imread, key=0)
        self.bands = self.all_bands if bands is None else bands
        self.samples = self.make_dataset()

    def make_dataset(self) -> List[str]:
        """Finds every sample directory below the dataset root.

        Only the presence of the first band's file is checked; the other bands are not verified here.

        Returns:
            list[str]: Sorted, de-duplicated paths of the directories containing ``<first band>.tif``.

        Raises:
            ValueError: If :attr:`bands` is empty, as there is then no file name to search for.
        """
        # Fail early and clearly rather than with an IndexError part-way through the directory walk.
        if not self.bands:
            raise ValueError(
                f"{type(self).__name__} requires at least one band but `bands` is empty"
            )

        directory = os.path.expanduser(self.root)
        marker = f"{self.bands[0]}.tif"

        # The final sort makes the order deterministic, so the walk itself need not be sorted.
        return sorted(
            {
                dirpath
                for dirpath, _, fnames in os.walk(directory, followlinks=True)
                if marker in fnames
            }
        )

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Loads all bands of one sample and stacks them into a single tensor.

        Bands may differ in size, so every band is resized to the largest height and
        width found amongst the bands of this sample before stacking.

        Args:
            index (int): Index into :attr:`samples`.

        Returns:
            dict[str, ~typing.Any]: ``{"image": tensor}`` where ``tensor`` is the bands concatenated along
            the first dimension (after the optional transform has been applied).
        """
        path = self.samples[index]

        planes = []
        height, width = 0, 0
        for band in self.bands:
            # NOTE(review): assumes each band file decodes to a 2D (rows, cols) array -- confirm.
            array = self.loader(os.path.join(path, f"{band}.tif"))
            img = torch.from_numpy(array.astype(np.float32))
            height = max(img.shape[0], height)
            width = max(img.shape[1], width)
            # Add a leading dimension so the bands can be concatenated along it.
            planes.append(img.unsqueeze(0))

        bands = torch.cat(
            [resize(plane, [height, width], antialias=True) for plane in planes],
            dim=0,
        )

        if self.transform is not None:
            bands = self.transform(bands)

        return {"image": bands}

    def __len__(self) -> int:
        """Returns the number of sample directories found under the root."""
        return len(self.samples)
Loading

0 comments on commit 0a68baa

Please sign in to comment.