[NSETM-2310] Allow using the cache in readonly mode (#36)

New Features:
- Add cache.readonly, to be able to use an existing cache without exclusive locking [NSETM-2310].
- Add cache.store_type, to change the file format (experimental).
- Add cache.skip_features, to skip writing the features DataFrames (not implemented yet).

Deprecations:
- Deprecate output, use cache.path instead.
- Deprecate clear_cache, use cache.clear instead.
BlueBrain · Apr 22, 2024 · 53bce0b · 53bce0b
1 parent e6dfe33
commit 53bce0b
Show file tree

Hide file tree

Showing 44 changed files with 582 additions and 193 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,23 @@
 Changelog
 =========
 
+Version 0.9.0
+-------------
+
+New Features
+~~~~~~~~~~~~
+
+- Add ``cache.readonly``, to be able to use an existing cache without exclusive locking [NSETM-2310].
+- Add ``cache.store_type``, to change the file format (experimental).
+- Add ``cache.skip_features``, to skip writing the features DataFrames (not implemented yet).
+
+Deprecations
+~~~~~~~~~~~~
+
+- Deprecate ``output``, use ``cache.path`` instead.
+- Deprecate ``clear_cache``, use ``cache.clear`` instead.
+
+
 Version 0.8.3
 -------------
 

diff --git a/doc/source/data/analysis/config.yaml b/doc/source/data/analysis/config.yaml
@@ -1,7 +1,8 @@
 # simple configuration with extraction and analysis, and combination of parameters
-version: 3
+version: 4
 simulation_campaign: ../simulation-campaign/config.json
-output: analysis_output
+cache:
+  path: analysis_output
 analysis:
   spikes:
     extraction:

diff --git a/doc/source/migration.rst b/doc/source/migration.rst
@@ -5,5 +5,6 @@ Migration
    :hidden:
 
    migration/automatic
+   migration/0.9.0
    migration/0.3.0
    migration/0.2.0
diff --git a/doc/source/migration/0.9.0.rst b/doc/source/migration/0.9.0.rst
@@ -0,0 +1,32 @@
+From 0.8.x to 0.9.x (v4)
+------------------------
+
+BlueETL 0.9.x introduces some changes in the configuration (v4), but it's backward compatible with the configuration used in 0.8.x (v3).
+
+1. The ``version`` should be set to ``4``.
+
+2. The following fields have been deprecated and should be replaced, because they will be removed in a future version:
+
+   - ``output``: use ``cache.path`` instead
+   - ``clear_cache``: use ``cache.clear`` instead
+
+For example, if the old configuration contains::
+
+    version: 3
+    simulation_campaign: /path/to/config.json
+    output: analysis_output
+    clear_cache: true
+    ...
+
+then it should be replaced with::
+
+    version: 4
+    simulation_campaign: /path/to/config.json
+    cache:
+      path: analysis_output
+      clear: true
+    ...
+
+You can see an example of a configuration in the new format here:
+
+- https://github.com/BlueBrain/blueetl/blob/blueetl-v0.9.0/tests/functional/data/sonata/config/analysis_config_01.yaml
diff --git a/src/blueetl/analysis.py b/src/blueetl/analysis.py
@@ -2,7 +2,6 @@
 
 import gc
 import logging
-from copy import deepcopy
 from pathlib import Path
 from typing import Any, NamedTuple, Optional
 
@@ -12,7 +11,7 @@
 from blueetl.cache import CacheManager
 from blueetl.campaign.config import SimulationCampaign
 from blueetl.config.analysis import init_multi_analysis_configuration
-from blueetl.config.analysis_model import MultiAnalysisConfig, SingleAnalysisConfig
+from blueetl.config.analysis_model import CacheConfig, MultiAnalysisConfig, SingleAnalysisConfig
 from blueetl.features import FeaturesCollection
 from blueetl.repository import Repository
 from blueetl.resolver import AttrResolver, Resolver
@@ -47,21 +46,21 @@ def from_config(
         cls,
         analysis_config: SingleAnalysisConfig,
         simulations_config: SimulationCampaign,
+        cache_config: CacheConfig,
         resolver: Resolver,
-        clear_cache: bool = False,
     ) -> "Analyzer":
         """Initialize the Analyzer from the given configuration.
 
         Args:
             analysis_config: analysis configuration.
             simulations_config: simulation campaign configuration.
+            cache_config: cache configuration.
             resolver: resolver instance.
-            clear_cache: if True, remove any existing cache.
         """
         cache_manager = CacheManager(
+            cache_config=cache_config,
             analysis_config=analysis_config,
             simulations_config=simulations_config,
-            clear_cache=clear_cache,
         )
         repo = Repository(
             simulations_config=simulations_config,
@@ -193,19 +192,18 @@ def from_config(
         cls,
         global_config: dict,
         base_path: StrOrPath,
-        clear_cache: Optional[bool] = None,
+        extra_params: dict[str, Any],
     ) -> "MultiAnalyzer":
         """Initialize the MultiAnalyzer from the given configuration.
 
         Args:
             global_config: analysis configuration.
             base_path: base path used to resolve relative paths in the configuration.
-            clear_cache: if True, remove any existing cache; if False, reuse the existing cache;
-                if None, use the value from the configuration file.
+            extra_params: dict of overriding parameters.
         """
-        global_config = init_multi_analysis_configuration(global_config, Path(base_path))
-        if clear_cache is not None:
-            global_config.clear_cache = clear_cache
+        global_config = init_multi_analysis_configuration(
+            global_config, base_path=Path(base_path), extra_params=extra_params
+        )
         return cls(global_config=global_config)
 
     def _init_analyzers(self) -> dict[str, Analyzer]:
@@ -216,19 +214,21 @@ def _init_analyzers(self) -> dict[str, Analyzer]:
             name: Analyzer.from_config(
                 analysis_config=analysis_config,
                 simulations_config=simulations_config,
+                cache_config=self.global_config.cache.model_copy(
+                    update={"path": self.global_config.cache.path / name}
+                ),
                 resolver=resolver,
-                clear_cache=self.global_config.clear_cache,
             )
             for name, analysis_config in self.global_config.analysis.items()
         }
 
     @classmethod
-    def from_file(cls, path: StrOrPath, clear_cache: Optional[bool] = None) -> "MultiAnalyzer":
+    def from_file(cls, path: StrOrPath, extra_params: dict[str, Any]) -> "MultiAnalyzer":
         """Return a new instance loaded using the given configuration file."""
         return cls.from_config(
             global_config=load_yaml(path),
             base_path=Path(path).parent,
-            clear_cache=clear_cache,
+            extra_params=extra_params,
         )
 
     @property
@@ -316,7 +316,10 @@ def apply_filter(self, simulations_filter: Optional[dict[str, Any]] = None) -> "
         if not simulations_filter:
             return self
         analyzers = {name: a.apply_filter(simulations_filter) for name, a in self.analyzers.items()}
-        return MultiAnalyzer(global_config=deepcopy(self.global_config), analyzers=analyzers)
+        return MultiAnalyzer(
+            global_config=self.global_config.model_copy(deep=True),
+            analyzers=analyzers,
+        )
 
     def show(self):
         """Print all the DataFrames."""
@@ -333,6 +336,7 @@ def run_from_file(
     calculate: bool = True,
     show: bool = False,
     clear_cache: Optional[bool] = None,
+    readonly_cache: Optional[bool] = None,
     loglevel: Optional[int] = None,
 ) -> MultiAnalyzer:
     """Initialize and return the MultiAnalyzer.
@@ -343,8 +347,12 @@ def run_from_file(
         extract: if True, run the extraction of the repository.
         calculate: if True, run the calculation of the features.
         show: if True, show a short representation of all the Pandas DataFrames, mainly for debug.
-        clear_cache: if True, remove any existing cache; if False, reuse the existing cache;
-            if None, use the value from the configuration file.
+        clear_cache: if None, use the value from the configuration file. Otherwise:
+            if True, remove any existing cache;
+            if False, reuse the existing cache if possible.
+        readonly_cache: if None, use the value from the configuration file. Otherwise:
+            if True, use the existing cache if possible, or raise an error;
+            if False, use the existing cache if possible, or update it.
         loglevel: if specified, used to set up logging.
 
     Returns:
@@ -355,7 +363,13 @@ def run_from_file(
     if seed is not None:
         np.random.seed(seed)
     L.info("MultiAnalyzer configuration: %s", analysis_config_file)
-    ma = MultiAnalyzer.from_file(analysis_config_file, clear_cache=clear_cache)
+    ma = MultiAnalyzer.from_file(
+        analysis_config_file,
+        extra_params={
+            "clear_cache": clear_cache,
+            "readonly_cache": readonly_cache,
+        },
+    )
     if extract:
         ma.extract_repo()
     if calculate:

diff --git a/src/blueetl/apps/migrate.py b/src/blueetl/apps/migrate.py
@@ -10,7 +10,7 @@
 from blueetl.utils import dump_yaml, load_yaml
 
 # it should always match CONFIG_VERSION if the script supports the current version
-MIGRATION_CONFIG_VERSION = 3
+MIGRATION_CONFIG_VERSION = 4
 
 
 def _safe_set(d, key, value):
@@ -81,13 +81,25 @@ def _migrate_v2_to_v3(input_config):
     return output_config
 
 
+def _migrate_v3_to_v4(input_config):
+    """Migrate the configuration from v3 (BlueETL 0.8.x) to v4 (BlueETL 0.9.x)."""
+    output_config = deepcopy(input_config)  # work on a copy: input_config is not modified
+    output_config["version"] = 4
+    cache_config = output_config.setdefault("cache", {})  # reuse or create the "cache" section
+    if (value := output_config.pop("output", None)) is not None:  # deprecated root key
+        _safe_set(cache_config, "path", value)  # moved under "cache" as "path" via _safe_set
+    if (value := output_config.pop("clear_cache", None)) is not None:  # deprecated root key
+        _safe_set(cache_config, "clear", value)  # moved under "cache" as "clear" via _safe_set
+    return output_config
+
+
 def _sort_root_keys(input_config):
     root_keys = [
         "version",
         "simulation_campaign",
         "simulations_filter",
         "simulations_filter_in_memory",
-        "output",
+        "cache",
         "analysis",
         "custom",
     ]
@@ -110,6 +122,8 @@ def migrate_config(input_config_file, output_config_file, sort):
         config = _migrate_v1_to_v2(config)
     if version <= 2:
         config = _migrate_v2_to_v3(config)
+    if version <= 3:
+        config = _migrate_v3_to_v4(config)
     if version == CONFIG_VERSION:
         click.secho(f"The config version {version} doesn't need to be migrated.", fg="yellow")
     if sort:

diff --git a/src/blueetl/apps/run.py b/src/blueetl/apps/run.py
@@ -17,14 +17,29 @@
 @click.option("--show/--no-show", help="Show repository and features dataframes.")
 @click.option(
     "--clear-cache/--no-clear-cache",
-    help="If specified, force clearing or keeping the cache, regardless of the configuration file.",
+    help="If True, force clearing the cache.",
+    default=None,
+)
+@click.option(
+    "--readonly-cache/--no-readonly-cache",
+    help="If True, use the existing cache if possible, or raise an error if not.",
     default=None,
 )
 @click.option("-i", "--interactive/--no-interactive", help="Start an interactive IPython shell.")
 @click.option("-v", "--verbose", count=True, help="-v for INFO, -vv for DEBUG")
-def run(analysis_config_file, seed, extract, calculate, show, clear_cache, interactive, verbose):
+def run(
+    analysis_config_file,
+    seed,
+    extract,
+    calculate,
+    show,
+    clear_cache,
+    readonly_cache,
+    interactive,
+    verbose,
+):
     """Run the analysis."""
-    # pylint: disable=unused-variable,unused-import,import-outside-toplevel
+    # pylint: disable=unused-variable,unused-import,import-outside-toplevel,too-many-arguments
     loglevel = (logging.WARNING, logging.INFO, logging.DEBUG)[min(verbose, 2)]
     # assign the result to a local variable to make it available in the interactive shell
     ma = run_from_file(  # noqa
@@ -34,6 +49,7 @@ def run(analysis_config_file, seed, extract, calculate, show, clear_cache, inter
         calculate=calculate,
         show=show,
         clear_cache=clear_cache,
+        readonly_cache=readonly_cache,
         loglevel=loglevel,
     )
     if interactive: