FAIR-Chem · mshuaibii · Apr 1, 2024 · Jan 17, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/ocpmodels/datasets/_utils.py b/ocpmodels/datasets/_utils.py
@@ -0,0 +1,33 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates.
+
+This source code is licensed under the MIT license found in the
+LICENSE file in the root directory of this source tree.
+"""
+
+from __future__ import annotations
+
+import typing
+
+if typing.TYPE_CHECKING:
+    from torch_geometric.data import Data
+
+
+def rename_data_object_keys(
+    data_object: Data, key_mapping: dict[str, str]
+) -> Data:
+    """Rename keys of ``data_object`` in place and return the same object.
+
+    Args:
+        data_object: data object whose keys are renamed
+        key_mapping: keys to rename and their new names {prev_key: new_key}
+    """
+    for old_key, new_key in key_mapping.items():
+        # Skip keys the object does not have (e.g. test data without labels),
+        # and never overwrite a key that already exists under the new name.
+        if old_key not in data_object or new_key in data_object:
+            continue
+        data_object[new_key] = data_object[old_key]
+        del data_object[old_key]
+
+    return data_object
diff --git a/ocpmodels/datasets/ase_datasets.py b/ocpmodels/datasets/ase_datasets.py
@@ -1,13 +1,15 @@
+from __future__ import annotations
+
 import bisect
 import copy
-import functools
-import glob
 import logging
 import os
 import warnings
 from abc import ABC, abstractmethod
+from functools import cache, reduce
+from glob import glob
 from pathlib import Path
-from typing import List
+from typing import Any, Callable, Optional
 
 import ase
 import numpy as np
@@ -16,8 +18,10 @@
 from tqdm import tqdm
 
 from ocpmodels.common.registry import registry
+from ocpmodels.datasets._utils import rename_data_object_keys
 from ocpmodels.datasets.lmdb_database import LMDBDatabase
 from ocpmodels.datasets.target_metadata_guesser import guess_property_metadata
+from ocpmodels.modules.transforms import DataTransforms
 from ocpmodels.preprocessing import AtomsToGraphs
 
 
@@ -65,33 +69,41 @@ class AseAtomsDataset(Dataset, ABC):
     """
 
     def __init__(
-        self, config, transform=None, atoms_transform=apply_one_tags
+        self,
+        config: dict,
+        # Callable[[X, ...], R] is not a valid typing form; `...` must replace the
+        # whole parameter list. The callable is expected to take the ase.Atoms first.
+        atoms_transform: Callable[..., ase.Atoms] = apply_one_tags,
     ) -> None:
         self.config = config
 
         a2g_args = config.get("a2g_args", {})
         if a2g_args is None:
             a2g_args = {}
+        # a2g_args may be given explicitly as None, hence the guard above. Default r_edges to False if not set by user, assuming otf_graph will be used
+        a2g_args.setdefault("r_edges", False)
 
         # Make sure we always include PBC info in the resulting atoms objects
         a2g_args["r_pbc"] = True
         self.a2g = AtomsToGraphs(**a2g_args)
 
-        self.transform = transform
+        self.key_mapping = self.config.get("key_mapping", None)
+        self.transforms = DataTransforms(self.config.get("transforms", {}))
+
         self.atoms_transform = atoms_transform
 
         if self.config.get("keep_in_memory", False):
-            self.__getitem__ = functools.cache(self.__getitem__)
+            self.__getitem__ = cache(self.__getitem__)  # NOTE(review): `self[i]` looks up __getitem__ on the class, not the instance, so this cache is likely bypassed - confirm
 
-        self.ids = self.load_dataset_get_ids(config)
+        self.ids = self._load_dataset_get_ids(config)
 
     def __len__(self) -> int:
         return len(self.ids)
 
     def __getitem__(self, idx):
         # Handle slicing
         if isinstance(idx, slice):
-            return [self[i] for i in range(*idx.indices(len(self.ids)))]
+            return [self[i] for i in range(*idx.indices(len(self)))]
 
         # Get atoms object via derived class method
         atoms = self.get_atoms_object(self.ids[idx])
@@ -103,27 +115,21 @@ def __getitem__(self, idx):
             )
 
         sid = atoms.info.get("sid", self.ids[idx])
-        try:
-            sid = tensor([sid])
-            warnings.warn(
-                "Supplied sid is not numeric (or missing). Using dataset indices instead."
-            )
-        except:
-            sid = tensor([idx])
-
         fid = atoms.info.get("fid", tensor([0]))
 
         # Convert to data object
         data_object = self.a2g.convert(atoms, sid)
         data_object.fid = fid
         data_object.natoms = len(atoms)
 
-        # Transform data object
-        if self.transform is not None:
-            data_object = self.transform(
-                data_object, **self.config.get("transform_args", {})
+        if self.key_mapping is not None:
+            data_object = rename_data_object_keys(
+                data_object, self.key_mapping
             )
 
+        # Transform data object
+        data_object = self.transforms(data_object)
+
         if self.config.get("include_relaxed_energy", False):
             data_object.y_relaxed = self.get_relaxed_energy(self.ids[idx])
 
@@ -137,7 +143,7 @@ def get_atoms_object(self, identifier):
         )
 
     @abstractmethod
-    def load_dataset_get_ids(self, config):
+    def _load_dataset_get_ids(self, config):
         # This function should return a list of ids that can be used to index into the database
         raise NotImplementedError(
             "Every ASE dataset needs to declare a function to load the dataset and return a list of ids."
@@ -147,7 +153,7 @@ def close_db(self) -> None:
         # This method is sometimes called by a trainer
         pass
 
-    def guess_target_metadata(self, num_samples: int = 100):
+    def get_metadata(self, num_samples: int = 100):
         metadata = {}
 
         if num_samples < len(self):
@@ -169,9 +175,6 @@ def guess_target_metadata(self, num_samples: int = 100):
 
         return metadata
 
-    def get_metadata(self):
-        return self.guess_target_metadata()
-
 
 @registry.register_dataset("ase_read")
 class AseReadDataset(AseAtomsDataset):
@@ -196,7 +199,7 @@ class AseReadDataset(AseAtomsDataset):
                     default options will work for most users
 
                     If you are using this for a training dataset, set
-                    "r_energy":True and/or "r_forces":True as appropriate
+                    "r_energy":True, "r_forces":True, and/or "r_stress":True as appropriate
                     In that case, energy/forces must be in the files you read (ex. OUTCAR)
 
             ase_read_args (dict): Keyword arguments for ase.io.read()
@@ -213,14 +216,15 @@ class AseReadDataset(AseAtomsDataset):
 
             transform_args (dict): Additional keyword arguments for the transform callable
 
+            key_mapping (dict[str, str]): Dictionary mapping the name of a property as it is named in the
+                dataset to the name used for it in the model, i.e. {dataset_name: model_name}. Only needed
+                if the names differ.
+
         atoms_transform (callable, optional): Additional preprocessing function applied to the Atoms
                     object. Useful for applying tags, for example.
-
-        transform (callable, optional): Additional preprocessing function for the Data object
-
     """
 
-    def load_dataset_get_ids(self, config) -> List[Path]:
+    def _load_dataset_get_ids(self, config) -> list[Path]:
         self.ase_read_args = config.get("ase_read_args", {})
 
         if ":" in self.ase_read_args.get("index", ""):
@@ -286,7 +290,7 @@ class AseReadMultiStructureDataset(AseAtomsDataset):
                     default options will work for most users
 
                     If you are using this for a training dataset, set
-                    "r_energy":True and/or "r_forces":True as appropriate
+                    "r_energy":True, "r_forces":True, and/or "r_stress":True as appropriate
                     In that case, energy/forces must be in the files you read (ex. OUTCAR)
 
             ase_read_args (dict): Keyword arguments for ase.io.read()
@@ -305,13 +309,17 @@ class AseReadMultiStructureDataset(AseAtomsDataset):
 
             transform_args (dict): Additional keyword arguments for the transform callable
 
+            key_mapping (dict[str, str]): Dictionary mapping the name of a property as it is named in the
+                dataset to the name used for it in the model, i.e. {dataset_name: model_name}. Only needed
+                if the names differ.
+
         atoms_transform (callable, optional): Additional preprocessing function applied to the Atoms
             object. Useful for applying tags, for example.
 
         transform (callable, optional): Additional preprocessing function for the Data object
     """
 
-    def load_dataset_get_ids(self, config):
+    def _load_dataset_get_ids(self, config):
         self.ase_read_args = config.get("ase_read_args", {})
         if not hasattr(self.ase_read_args, "index"):
             self.ase_read_args["index"] = ":"
@@ -374,32 +382,6 @@ def get_relaxed_energy(self, identifier):
         return relaxed_atoms.get_potential_energy(apply_constraint=False)
 
 
-class dummy_list(list):
-    def __init__(self, max) -> None:
-        self.max = max
-        return
-
-    def __len__(self):
-        return self.max
-
-    def __getitem__(self, idx):
-        # Handle slicing
-        if isinstance(idx, slice):
-            return [self[i] for i in range(*idx.indices(self.max))]
-
-        # Cast idx as int since it could be a tensor index
-        idx = int(idx)
-
-        # Handle negative indices (referenced from end)
-        if idx < 0:
-            idx += self.max
-
-        if 0 <= idx < self.max:
-            return idx
-        else:
-            raise IndexError
-
-
 @registry.register_dataset("ase_db")
 class AseDBDataset(AseAtomsDataset):
     """
@@ -435,7 +417,7 @@ class AseDBDataset(AseAtomsDataset):
                     default options will work for most users
 
                     If you are using this for a training dataset, set
-                    "r_energy":True and/or "r_forces":True as appropriate
+                    "r_energy":True, "r_forces":True, and/or "r_stress":True as appropriate
                     In that case, energy/forces must be in the database
 
             keep_in_memory (bool): Store data in memory. This helps avoid random reads if you need
@@ -444,23 +426,34 @@ class AseDBDataset(AseAtomsDataset):
 
             atoms_transform_args (dict): Additional keyword arguments for the atoms_transform callable
 
-            transform_args (dict): Additional keyword arguments for the transform callable
+            transforms (dict[str, dict]): Dictionary specifying data transforms as {transform_function: config}
+                    where config is a dictionary specifying arguments to the transform_function
+
+            key_mapping (dict[str, str]): Dictionary mapping the name of a property as it is named in the
+                dataset to the name used for it in the model, i.e. {dataset_name: model_name}. Only needed
+                if the names differ.
 
         atoms_transform (callable, optional): Additional preprocessing function applied to the Atoms
                     object. Useful for applying tags, for example.
 
-        transform (callable, optional): Additional preprocessing function for the Data object
+        transform (callable, optional): no longer accepted by AseAtomsDataset.__init__; use the "transforms" config entry instead
     """
 
-    def load_dataset_get_ids(self, config) -> dummy_list:
+    def _load_dataset_get_ids(self, config: dict) -> list[int]:
         if isinstance(config["src"], list):
-            filepaths = config["src"]
+            if os.path.isdir(config["src"][0]):
+                filepaths = reduce(
+                    lambda x, y: x + y,
+                    (glob(f"{path}/*") for path in config["src"]),
+                )
+            else:
+                filepaths = config["src"]
         elif os.path.isfile(config["src"]):
             filepaths = [config["src"]]
         elif os.path.isdir(config["src"]):
-            filepaths = glob.glob(f'{config["src"]}/*')
+            filepaths = glob(f'{config["src"]}/*')
         else:
-            filepaths = glob.glob(config["src"])
+            filepaths = glob(config["src"])
 
         self.dbs = []
 
@@ -470,7 +463,7 @@ def load_dataset_get_ids(self, config) -> dummy_list:
                     self.connect_db(path, config.get("connect_args", {}))
                 )
             except ValueError:
-                logging.warning(
+                logging.debug(
                     f"Tried to connect to {path} but it's not an ASE database!"
                 )
 
@@ -488,16 +481,24 @@ def load_dataset_get_ids(self, config) -> dummy_list:
             if hasattr(db, "ids") and self.select_args == {}:
                 self.db_ids.append(db.ids)
             else:
+                # this is the slow alternative
                 self.db_ids.append(
                     [row.id for row in db.select(**self.select_args)]
                 )
 
         idlens = [len(ids) for ids in self.db_ids]
         self._idlen_cumulative = np.cumsum(idlens).tolist()
 
-        return dummy_list(sum(idlens))
+        return list(range(sum(idlens)))
 
-    def get_atoms_object(self, idx):
+    def get_atoms_object(self, idx: int) -> ase.Atoms:
+        """Get atoms object corresponding to datapoint idx. Useful to read other properties not in data object.
+        Args:
+            idx (int): index in dataset
+
+        Returns:
+            atoms: ASE atoms corresponding to datapoint idx
+        """
         # Figure out which db this should be indexed from.
         db_idx = bisect.bisect(self._idlen_cumulative, idx)
 
@@ -510,17 +511,22 @@ def get_atoms_object(self, idx):
         atoms_row = self.dbs[db_idx]._get_row(self.db_ids[db_idx][el_idx])
         atoms = atoms_row.toatoms()
 
+        # put data back into atoms info
         if isinstance(atoms_row.data, dict):
             atoms.info.update(atoms_row.data)
 
         return atoms
 
-    def connect_db(self, address, connect_args={}):
+    @staticmethod
+    def connect_db(
+        address: str | Path, connect_args: Optional[dict] = None
+    ) -> ase.db.core.Database:
         if connect_args is None:
             connect_args = {}
         db_type = connect_args.get("type", "extract_from_name")
-        if db_type == "lmdb" or (
-            db_type == "extract_from_name" and address.split(".")[-1] == "lmdb"
+        if db_type in ("lmdb", "aselmdb") or (
+            db_type == "extract_from_name"
+            and str(address).split(".")[-1] in ("lmdb", "aselmdb")
         ):
             return LMDBDatabase(address, readonly=True, **connect_args)
         else:
@@ -531,12 +537,12 @@ def close_db(self) -> None:
             if hasattr(db, "close"):
                 db.close()
 
-    def get_metadata(self):
+    def get_metadata(self, num_samples: int = 100) -> dict:
         logging.warning(
             "You specific a folder of ASE dbs, so it's impossible to know which metadata to use. Using the first!"
         )
         if self.dbs[0].metadata == {}:
-            return self.guess_target_metadata()
+            return super().get_metadata(num_samples)
         else:
             return copy.deepcopy(self.dbs[0].metadata)