
Commit

Merge branch 'main' into dependabot/github_actions/codecov/codecov-action-5
lbluque authored Dec 10, 2024
2 parents f32a70a + 6ab6ad7 commit 3b8a7c4
Showing 34 changed files with 2,588 additions and 281 deletions.
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_153M.yml
@@ -43,7 +43,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 20
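To make the effect of this change concrete: the training objective is a weighted sum of the per-task losses, so dropping the energy coefficient from 20 to 2.5 down-weights the per-atom energy MAE relative to the force term. A minimal sketch of that weighting, with illustrative tensor and function names rather than fairchem's internal loss API:

```python
import torch

def weighted_loss(pred_energy, target_energy, natoms, pred_forces, target_forces,
                  energy_coeff=2.5, force_coeff=20.0):
    """Sketch: per-atom energy MAE plus an L2 force term, combined with the YAML coefficients."""
    # per_atom_mae: absolute energy error normalized by the number of atoms in each structure
    energy_loss = ((pred_energy - target_energy).abs() / natoms).mean()
    # l2mae: mean L2 norm of the per-atom force error
    force_loss = (pred_forces - target_forces).norm(dim=-1).mean()
    return energy_coeff * energy_loss + force_coeff * force_loss
```

The same weighting applies to the finetune configs below, which keep a force coefficient of 10 instead of 20.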
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_31M.yml
@@ -44,7 +44,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_86M.yml
@@ -43,7 +43,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_153M_ft_salexmptrj.yml
@@ -45,7 +45,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_31M_ft_salexmptrj.yml
@@ -45,7 +45,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_86M_ft_salexmptrj.yml
@@ -43,7 +43,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_153M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
  - forces:
       fn: l2mae
       coefficient: 20
11 changes: 5 additions & 6 deletions configs/omat24/mptrj/eqV2_31M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 5
   - forces:
       fn: l2mae
       coefficient: 20
@@ -148,17 +148,16 @@ model:
 
     use_force_encoding: True
     use_noise_schedule_sigma_encoding: False
-    use_denoising_energy: True
-    use_denoising_stress: False
-
 
   heads:
     energy:
-      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSEnergyHead
+      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSScalarHead
+      use_denoising: True
     forces:
-      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSForceHead
+      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSVectorHead
     stress:
       module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSRank2Head
       output_name: stress
       use_source_target_embedding: True
       decompose: True
+      use_denoising: False
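The `module:` values above are dotted import paths that fairchem resolves when building the model's output heads; this change points the energy and force heads at the renamed scalar/vector DeNS head classes and moves the denoising flags onto the heads themselves. A hedged sketch of what resolving such a path amounts to (fairchem uses its own registry/hydra machinery; this is only illustrative):

```python
import importlib

def resolve_head(module_path: str):
    """Resolve a dotted `module:` path from the YAML config to a class object."""
    module_name, class_name = module_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), class_name)

# e.g. the renamed scalar head used for the energy output in this config
# (requires fairchem to be installed):
# head_cls = resolve_head(
#     "fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSScalarHead"
# )
```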
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_31M_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 5
   - forces:
       fn: l2mae
       coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_86M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
 loss_functions:
   - energy:
       fn: per_atom_mae
-      coefficient: 20
+      coefficient: 2.5
   - forces:
       fn: l2mae
       coefficient: 20
4 changes: 2 additions & 2 deletions docs/core/model_checkpoints.md
@@ -149,15 +149,15 @@ OC22 dataset or pretrained models, as well as the original paper for each model:
 | GemNet-OC-S2EF-ODAC | GemNet-OC | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Gemnet-OC.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/gemnet-oc.yml) |
 | eSCN-S2EF-ODAC | eSCN | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/eSCN.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eSCN.yml) |
 | EquiformerV2-S2EF-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/eqv2_31M.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_31M.yml) |
-| EquiformerV2-Large-S2EF-ODAC | EquiformerV2 (Large) | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Equiformer_V2_Large.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_153M.yml) |
+| EquiformerV2-Large-S2EF-ODAC | EquiformerV2 (Large) | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/Equiformer_V2_Large.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_153M.yml) |
 
 ## IS2RE Direct models
 
 | Model Name | Model |Checkpoint | Config |
 |-------------------------|--------------|--- | --- |
 | Gemnet-OC-IS2RE-ODAC | Gemnet-OC | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Gemnet-OC_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/gemnet-oc.yml) |
 | eSCN-IS2RE-ODAC | eSCN | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/eSCN_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eSCN.yml) |
-| EquiformerV2-IS2RE-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Equiformer_V2_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eqv2_31M.yml) |
+| EquiformerV2-IS2RE-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/Equiformer_V2_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eqv2_31M.yml) |

The models in the table above were trained to predict relaxed energy directly. Relaxed energies can also be predicted by running structural relaxations using the S2EF models from the previous section.
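As a rough sketch of that second route, an S2EF checkpoint can drive an ASE relaxation; the calculator import path and constructor arguments below are assumptions to be checked against the current fairchem docs:

```python
from ase.io import read
from ase.optimize import BFGS

# assumed location of fairchem's ASE calculator wrapper
from fairchem.core.common.relaxation.ase_utils import OCPCalculator

atoms = read("initial_structure.traj")  # unrelaxed input structure (hypothetical file)
atoms.calc = OCPCalculator(checkpoint_path="eqv2_31M.pt", cpu=True)  # an S2EF checkpoint

opt = BFGS(atoms, trajectory="relaxation.traj")
opt.run(fmax=0.05, steps=200)  # relax until the max force falls below 0.05 eV/Å

print("Relaxed energy:", atoms.get_potential_energy())
```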

2 changes: 1 addition & 1 deletion packages/env.cpu.yml
@@ -4,7 +4,7 @@ channels:
 - defaults
 dependencies:
 - cpuonly
-- pytorch>=2.4
+- pytorch==2.4.0
 - ase
 - e3nn>=0.5
 - numpy >=1.26.0,<2.0.0
2 changes: 1 addition & 1 deletion packages/env.gpu.yml
@@ -5,7 +5,7 @@ channels:
 - defaults
 dependencies:
 - pytorch-cuda=12.1
-- pytorch>=2.4
+- pytorch==2.4.0
 - ase
 - e3nn>=0.5
 - numpy >=1.26.0,<2.0.0
4 changes: 2 additions & 2 deletions packages/fairchem-core/pyproject.toml
@@ -9,7 +9,7 @@ license = {text = "MIT License"}
 dynamic = ["version", "readme"]
 requires-python = ">=3.9, <3.13"
 dependencies = [
-    "torch>=2.4",
+    "torch==2.4",
     "numpy >=1.26.0, <2.0.0",
     "lmdb",
     "ase",
@@ -28,7 +28,7 @@ dependencies = [
 
 [project.optional-dependencies] # add optional dependencies to be installed as pip install fairchem.core[dev]
 dev = ["pre-commit", "pytest", "pytest-cov", "coverage", "syrupy", "ruff==0.5.1"]
-docs = ["jupyter-book", "jupytext", "sphinx","sphinx-autoapi", "umap-learn", "vdict"]
+docs = ["jupyter-book", "jupytext", "sphinx","sphinx-autoapi==3.3.3", "umap-learn", "vdict"]
 adsorbml = ["dscribe","x3dase","scikit-image"]
 
 [project.scripts]
59 changes: 42 additions & 17 deletions src/fairchem/core/_cli_hydra.py
@@ -12,6 +12,7 @@
 from typing import TYPE_CHECKING
 
 import hydra
+from omegaconf import OmegaConf
 
 if TYPE_CHECKING:
     import argparse
@@ -34,29 +35,48 @@
 
 
 class Submitit(Checkpointable):
-    def __call__(self, dict_config: DictConfig, cli_args: argparse.Namespace) -> None:
+    def __call__(self, dict_config: DictConfig) -> None:
         self.config = dict_config
-        self.cli_args = cli_args
         # TODO: setup_imports is not needed if we stop instantiating models with Registry.
         setup_imports()
         setup_env_vars()
-        try:
-            distutils.setup(map_cli_args_to_dist_config(cli_args))
-            self.runner: Runner = hydra.utils.instantiate(dict_config.runner)
-            self.runner.load_state()
-            self.runner.run()
-        finally:
-            distutils.cleanup()
-
-    def checkpoint(self, *args, **kwargs):
+        distutils.setup(map_cli_args_to_dist_config(dict_config.cli_args))
+        self._init_logger()
+        runner: Runner = hydra.utils.instantiate(dict_config.runner)
+        runner.load_state()
+        runner.run()
+        distutils.cleanup()
+
+    def _init_logger(self) -> None:
+        # optionally instantiate a singleton wandb logger, intentionally only supporting the new wandb logger
+        # don't start logger if in debug mode
+        if (
+            "logger" in self.config
+            and distutils.is_master()
+            and not self.config.cli_args.debug
+        ):
+            # get a partial function from the config and instantiate wandb with it
+            logger_initializer = hydra.utils.instantiate(self.config.logger)
+            simple_config = OmegaConf.to_container(
+                self.config, resolve=True, throw_on_missing=True
+            )
+            logger_initializer(
+                config=simple_config,
+                run_id=self.config.cli_args.timestamp_id,
+                run_name=self.config.cli_args.identifier,
+                log_dir=self.config.cli_args.logdir,
+            )
+
+    def checkpoint(self, *args, **kwargs) -> DelayedSubmission:
         # TODO: this is yet to be tested properly
         logging.info("Submitit checkpointing callback is triggered")
         new_runner = Submitit()
         self.runner.save_state()
         logging.info("Submitit checkpointing callback is completed")
         return DelayedSubmission(new_runner, self.config, self.cli_args)
 
 
-def map_cli_args_to_dist_config(cli_args: argparse.Namespace) -> dict:
+def map_cli_args_to_dist_config(cli_args: DictConfig) -> dict:
     return {
         "world_size": cli_args.num_nodes * cli_args.num_gpus,
         "distributed_backend": "gloo" if cli_args.cpu else "nccl",
@@ -78,8 +98,8 @@ def get_hydra_config_from_yaml(
     return hydra.compose(config_name=config_name, overrides=overrides_args)
 
 
-def runner_wrapper(config: DictConfig, cli_args: argparse.Namespace):
-    Submitit()(config, cli_args)
+def runner_wrapper(config: DictConfig):
+    Submitit()(config)
 
 
 # this is meant as a future replacement for the main entrypoint
@@ -93,6 +113,11 @@ def main(
     cfg = get_hydra_config_from_yaml(args.config_yml, override_args)
     timestamp_id = get_timestamp_uid()
     log_dir = os.path.join(args.run_dir, timestamp_id, "logs")
+    # override timestamp id and logdir
+    args.timestamp_id = timestamp_id
+    args.logdir = log_dir
+    os.makedirs(log_dir)
+    OmegaConf.update(cfg, "cli_args", vars(args), force_add=True)
     if args.submit: # Run on cluster
         executor = AutoExecutor(folder=log_dir, slurm_max_num_timeout=3)
         executor.update_parameters(
@@ -107,7 +132,7 @@
             slurm_qos=args.slurm_qos,
             slurm_account=args.slurm_account,
         )
-        job = executor.submit(runner_wrapper, cfg, args)
+        job = executor.submit(runner_wrapper, cfg)
         logger.info(
            f"Submitted job id: {timestamp_id}, slurm id: {job.job_id}, logs: {log_dir}"
         )
@@ -131,8 +156,8 @@ def main(
                 rdzv_backend="c10d",
                 max_restarts=0,
             )
-            elastic_launch(launch_config, runner_wrapper)(cfg, args)
+            elastic_launch(launch_config, runner_wrapper)(cfg)
         else:
             logger.info("Running in local mode without elastic launch")
             distutils.setup_env_local()
-            runner_wrapper(cfg, args)
+            runner_wrapper(cfg)
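The net effect of this refactor is that the parsed CLI arguments no longer travel as a separate argument; they are attached to the hydra config under a `cli_args` key and read back inside `Submitit` (for the distributed setup and the wandb logger). A standalone sketch of that pattern, with illustrative argument names:

```python
import argparse
from omegaconf import OmegaConf

cfg = OmegaConf.create({"runner": {"_target_": "my_pkg.Runner"}})  # stand-in config

args = argparse.Namespace(
    num_nodes=1, num_gpus=2, cpu=False, debug=False,
    timestamp_id="20241210-abc", identifier="demo", logdir="/tmp/logs",
)

# force_add=True attaches a key the original config schema did not declare
OmegaConf.update(cfg, "cli_args", vars(args), force_add=True)

# downstream code (e.g. Submitit.__call__) can now read everything from one object
print(cfg.cli_args.num_nodes * cfg.cli_args.num_gpus)  # world size
print("gloo" if cfg.cli_args.cpu else "nccl")          # distributed backend choice
```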
8 changes: 7 additions & 1 deletion src/fairchem/core/common/distutils.py
@@ -80,7 +80,8 @@ def setup(config) -> None:
             assign_device_for_local_rank(config["cpu"], config["local_rank"])
         else:
             # in the old code, all ranks can see all devices but need to be assigned a device equal to their local rank
-            # this is dangerous and should be deprecated
+            # this is dangerous and should be deprecated, however, FSDP still requires backwards compatibility with
+            # initializing this way for now so we need to keep it
             torch.cuda.set_device(config["local_rank"])
 
         dist.init_process_group(
@@ -123,6 +124,11 @@ def setup(config) -> None:
         config["local_rank"] = int(os.environ.get("LOCAL_RANK"))
         if config.get("use_cuda_visibile_devices"):
             assign_device_for_local_rank(config["cpu"], config["local_rank"])
+        elif torch.cuda.is_available():
+            # in the old code, all ranks can see all devices but need to be assigned a device equal to their local rank
+            # this is dangerous and should be deprecated, however, FSDP still requires backwards compatibility with
+            # initializing this way for now so we need to keep it
+            torch.cuda.set_device(config["local_rank"])
         dist.init_process_group(
             backend=config["distributed_backend"],
             rank=int(os.environ.get("RANK")),
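For reference, the pattern being kept here for FSDP compatibility, where every rank can see all GPUs and binds the device matching its local rank before joining the process group, looks roughly like this in a bare torch.distributed setup (a sketch, not fairchem's full setup logic):

```python
import os

import torch
import torch.distributed as dist

local_rank = int(os.environ.get("LOCAL_RANK", 0))

if torch.cuda.is_available():
    # all ranks can see all devices; bind this rank to the GPU matching its local rank
    torch.cuda.set_device(local_rank)

dist.init_process_group(
    backend="nccl" if torch.cuda.is_available() else "gloo",
    rank=int(os.environ.get("RANK", 0)),
    world_size=int(os.environ.get("WORLD_SIZE", 1)),
)
```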
13 changes: 13 additions & 0 deletions src/fairchem/core/common/relaxation/__init__.py
@@ -0,0 +1,13 @@
+"""
+Copyright (c) Meta, Inc. and its affiliates.
+
+This source code is licensed under the MIT license found in the
+LICENSE file in the root directory of this source tree.
+"""
+
+from __future__ import annotations
+
+from .ml_relaxation import ml_relax
+from .optimizable import OptimizableBatch, OptimizableUnitCellBatch
+
+__all__ = ["ml_relax", "OptimizableBatch", "OptimizableUnitCellBatch"]
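The practical effect of this new `__init__.py` is that the relaxation utilities can be imported from the subpackage root instead of the individual implementation modules, e.g.:

```python
# public names re-exported by this commit
from fairchem.core.common.relaxation import (
    OptimizableBatch,
    OptimizableUnitCellBatch,
    ml_relax,
)

# rather than reaching into the implementation modules listed above, e.g.
# from fairchem.core.common.relaxation.ml_relaxation import ml_relax
# from fairchem.core.common.relaxation.optimizable import OptimizableBatch
```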