update docs and rename extractor to aggregator
TjarkMiener committed Aug 2, 2024
1 parent 5275708 commit fbab2d1
Showing 5 changed files with 82 additions and 72 deletions.
11 changes: 11 additions & 0 deletions docs/api-reference/monitoring/aggregator.rst
@@ -0,0 +1,11 @@
.. _stats_aggregator:

*********************
Statistics Aggregator
*********************


Reference/API
=============

.. automodapi:: ctapipe.monitoring.aggregator
11 changes: 0 additions & 11 deletions docs/api-reference/monitoring/extractor.rst

This file was deleted.

8 changes: 5 additions & 3 deletions docs/api-reference/monitoring/index.rst
@@ -6,9 +6,11 @@ Monitoring (`~ctapipe.monitoring`)

.. currentmodule:: ctapipe.monitoring

This module include all the functions and classes needed for the Monitoring of CTA data.
Monitoring data are time-series used to monitor the status or quality of hardware, software algorithms, the environment, or other data products. These contain values recorded periodically at different rates, and can be thought of as a set of tables with rows identified by a time-stamp. They are potentially acquired during the day or nighttime operation of the array and during subsequent data processing, but at average rates much slower than Event data and faster than the length of a typical observation block. Examples include telescope tracking positions, trigger rates, camera sensor conditions, weather conditions, and the status or quality-control data of a particular hardware or software component.

Currently, only code related to :ref:`stats_extractor` is implemented here.
This module provides code to help generate monitoring data from processed event data, particularly for the purposes of calibration and data quality assessment.

Currently, only code related to :ref:`stats_aggregator` is implemented here.
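
As a toy illustration of this data model — a time-stamped table of monitoring values — consider the following sketch (hypothetical column names and values, not part of the commit):

```python
from astropy.table import Table
from astropy.time import Time

# Toy monitoring table: one row per sampling time, columns for monitored values.
monitoring = Table(
    {
        "time": Time(["2024-08-02T01:00:00", "2024-08-02T01:05:00"]),
        "trigger_rate_hz": [310.2, 295.7],  # illustrative numbers
        "wind_speed_m_s": [4.1, 5.3],
    }
)
```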


Submodules
@@ -18,7 +20,7 @@ Submodules
:maxdepth: 1
:glob:

extractor
aggregator


Reference/API
@@ -1,11 +1,17 @@
"""
Extraction algorithms to compute the statistics from a chunk of images
Algorithms to compute aggregated time-series statistics from columns of an event table.
These classes take as input an event table, divide it into time chunks, which
may optionally overlap, and compute various aggregated statistics for each
chunk. The statistics include the count, mean, median, and standard deviation. The result
is a monitoring table with columns describing the start and stop time of the chunk
and the aggregated statistic values.
"""

__all__ = [
"StatisticsExtractor",
"PlainExtractor",
"SigmaClippingExtractor",
"StatisticsAggregator",
"PlainAggregator",
"SigmaClippingAggregator",
]
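
For orientation, a minimal usage sketch of the renamed API (a sketch only: `example_subarray` stands in for an existing `SubarrayDescription`, and the dummy values mirror the tests further down):

```python
import numpy as np
from astropy.table import Table
from astropy.time import Time

from ctapipe.monitoring.aggregator import SigmaClippingAggregator

# Build a dummy event table; column names follow the tests below, and the
# image shape (n_images, n_channels, n_pix) is kept small for illustration.
n_events = 5000
times = Time(np.linspace(60117.911, 60117.9258, num=n_events), scale="tai", format="mjd")
event_ids = np.arange(n_events)
images = np.random.normal(2.0, 5.0, size=(n_events, 2, 10))

table = Table([times, event_ids, images], names=("time_mono", "event_id", "image"))

# `example_subarray` is assumed to be an existing SubarrayDescription
# (e.g. the ctapipe test fixture); constructing one is out of scope here.
aggregator = SigmaClippingAggregator(subarray=example_subarray, chunk_size=2500)
stats = aggregator(table=table)  # one row per chunk: time/event-ID bounds, mean, median, std
```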

from abc import abstractmethod
Expand All @@ -20,15 +26,15 @@
from ctapipe.core.traits import Int


class StatisticsExtractor(TelescopeComponent):
class StatisticsAggregator(TelescopeComponent):
"""
Base component to handle the extraction of the statistics from a table
Base component to handle the computation of aggregated statistic values from a table
containing e.g. charges, peak times and/or charge variances (images).
"""

chunk_size = Int(
2500,
help="Size of the chunk used for the calculation of the statistical values",
help="Size of the chunk used for the computation of aggregated statistic values",
).tag(config=True)
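
Since ``chunk_size`` is tagged as configurable, it should also be settable through the usual traitlets configuration mechanism; a hedged sketch (again with `example_subarray` as a placeholder):

```python
from traitlets.config import Config

from ctapipe.monitoring.aggregator import SigmaClippingAggregator

# Configure the trait by class name, as is standard for ctapipe Components.
config = Config({"SigmaClippingAggregator": {"chunk_size": 1000}})
aggregator = SigmaClippingAggregator(subarray=example_subarray, config=config)
assert aggregator.chunk_size == 1000
```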

def __call__(
@@ -39,40 +45,40 @@ def __call__(
col_name="image",
) -> Table:
"""
Divide table into chunks and extract the statistical values.
Divide table into chunks and compute aggregated statistic values.
This function divides the input table into overlapping or non-overlapping chunks of size ``chunk_size``
and call the relevant function of the particular extractor to extract the statistical values.
and calls the relevant function of the particular aggregator to compute aggregated statistic values.
The chunks are generated in a way that ensures they do not overflow the bounds of the table.
- If ``chunk_shift`` is None, extraction chunks will not overlap, but the last chunk is ensured to be
- If ``chunk_shift`` is None, chunks will not overlap, but the last chunk is ensured to be
of size `chunk_size`, even if it means the last two chunks will overlap.
- If ``chunk_shift`` is provided, it will determine the number of samples to shift between the start
of consecutive chunks resulting in an overlap of extraction chunks. Chunks that overflows the bounds
of consecutive chunks, resulting in overlapping chunks. Chunks that overflow the bounds
of the table are not considered.
Parameters
----------
table : astropy.table.Table
table with images of shape (n_images, n_channels, n_pix)
and timestamps of shape (n_images, ) stored in an astropy Table
and timestamps of shape (n_images, )
masked_pixels_of_sample : ndarray, optional
boolean array of masked pixels of shape (n_pix, ) that are not available for processing
chunk_shift : int, optional
number of samples to shift between the start of consecutive extraction chunks
number of samples to shift between the start of consecutive chunks
col_name : string
column name in the table
Returns
-------
astropy.table.Table
table containing the start and end values as timestamps and event IDs
as well as the extracted statistics (mean, median, std) for each chunk
as well as the aggregated statistic values (mean, median, std) for each chunk
"""

# Check if the statistics of the table is sufficient to extract at least one chunk.
# Check if the table contains enough rows to compute at least one complete chunk.
if len(table) < self.chunk_size:
raise ValueError(
f"The length of the provided table ({len(table)}) is insufficient to meet the required statistics for a single extraction chunk of size ({self.chunk_size})."
f"The length of the provided table ({len(table)}) is insufficient to fill a single chunk of size ({self.chunk_size})."
)
# Check if the chunk_shift is smaller than the chunk_size
if chunk_shift is not None and chunk_shift > self.chunk_size:
@@ -93,11 +99,11 @@ def _get_chunks(table, chunk_shift):
if chunk_shift is None and len(table) % self.chunk_size != 0:
yield table[-self.chunk_size :]
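
The commit page truncates ``_get_chunks``; judging from the tail shown above and the docstring, the full generator (nested inside ``__call__``, hence the closure over ``self``) plausibly looks like this reconstruction — a sketch, not the committed code:

```python
def _get_chunks(table, chunk_shift):
    # Step between chunk starts: the shift if given, else a full chunk.
    step = chunk_shift if chunk_shift is not None else self.chunk_size
    # Yield every chunk that fits entirely within the table.
    for start in range(0, len(table) - self.chunk_size + 1, step):
        yield table[start : start + self.chunk_size]
    # Non-overlapping mode: anchor one final chunk at the end of the table
    # so the trailing remainder is still covered (the tail shown above).
    if chunk_shift is None and len(table) % self.chunk_size != 0:
        yield table[-self.chunk_size :]
```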

# Calculate the statistics for each chunk of images
# Compute aggregated statistic values for each chunk of images
units = {col: table[col_name].unit for col in ("mean", "median", "std")}
data = defaultdict(list)
for chunk in _get_chunks(table, chunk_shift):
stats = self.extract(chunk[col_name].data, masked_pixels_of_sample)
stats = self.compute_stats(chunk[col_name].data, masked_pixels_of_sample)
data["time_start"].append(chunk["time_mono"][0])
data["time_end"].append(chunk["time_mono"][-1])
data["event_id_start"].append(chunk["event_id"][0])
@@ -110,20 +116,20 @@ def _get_chunks(table, chunk_shift):
return Table(data, units=units)
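
The chunking rules described in the docstring can be checked with a standalone arithmetic sketch; assuming, for illustration, a 5500-row table with ``chunk_size=2500`` (numbers consistent with the chunk-shift test below):

```python
n_rows, chunk_size = 5500, 2500

# chunk_shift=None: non-overlapping chunks, plus one final chunk anchored
# at the end of the table so it still holds chunk_size rows.
starts = list(range(0, n_rows - chunk_size + 1, chunk_size))  # [0, 2500]
if n_rows % chunk_size != 0:
    starts.append(n_rows - chunk_size)  # [0, 2500, 3000] -> 3 chunks, last two overlap

# chunk_shift=2000: overlapping chunks; a chunk that would overflow the
# table is dropped entirely.
shift = 2000
starts_shifted = [s for s in range(0, n_rows, shift) if s + chunk_size <= n_rows]
# [0, 2000] -> 2 chunks; the chunk starting at 4000 would overflow and is dropped
```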

@abstractmethod
def extract(self, images, masked_pixels_of_sample) -> StatisticsContainer:
def compute_stats(self, images, masked_pixels_of_sample) -> StatisticsContainer:
pass


class PlainExtractor(StatisticsExtractor):
class PlainAggregator(StatisticsAggregator):
"""
Extract the statistics from a chunk of images using numpy functions
Compute aggregated statistic values from a chunk of images using numpy functions
"""

def extract(self, images, masked_pixels_of_sample) -> StatisticsContainer:
def compute_stats(self, images, masked_pixels_of_sample) -> StatisticsContainer:
# Mask broken pixels
masked_images = np.ma.array(images, mask=masked_pixels_of_sample)

# Calculate the mean, median, and std over the chunk per pixel
# Compute the mean, median, and std over the chunk per pixel
pixel_mean = np.ma.mean(masked_images, axis=0).filled(np.nan)
pixel_median = np.ma.median(masked_images, axis=0).filled(np.nan)
pixel_std = np.ma.std(masked_images, axis=0).filled(np.nan)
@@ -136,9 +142,9 @@ def extract(self, images, masked_pixels_of_sample) -> StatisticsContainer:
)


class SigmaClippingExtractor(StatisticsExtractor):
class SigmaClippingAggregator(StatisticsAggregator):
"""
Extract the statistics from a chunk of images using astropy's sigma clipping functions
Compute aggregated statistic values from a chunk of images using astropy's sigma clipping functions
"""

max_sigma = Int(
@@ -150,11 +156,11 @@ class SigmaClippingExtractor(StatisticsExtractor):
help="Number of iterations for the sigma clipping outlier removal",
).tag(config=True)

def extract(self, images, masked_pixels_of_sample) -> StatisticsContainer:
def compute_stats(self, images, masked_pixels_of_sample) -> StatisticsContainer:
# Mask broken pixels
masked_images = np.ma.array(images, mask=masked_pixels_of_sample)

# Calculate the mean, median, and std over the chunk per pixel
# Compute the mean, median, and std over the chunk per pixel
pixel_mean, pixel_median, pixel_std = sigma_clipped_stats(
masked_images,
sigma=self.max_sigma,
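The practical difference between the two aggregators can be reproduced with ``astropy.stats.sigma_clipped_stats`` directly; a standalone sketch mirroring the outlier test below:

```python
import numpy as np
from astropy.stats import sigma_clipped_stats

rng = np.random.default_rng(42)
data = rng.normal(2.0, 5.0, size=2500)
data[:50] = 2000.0  # inject a few extreme fake outliers

plain_mean = np.mean(data)  # pulled far away from 2.0 by the outliers
clipped_mean, clipped_median, clipped_std = sigma_clipped_stats(
    data, sigma=4, maxiters=5
)
# clipped_mean stays close to 2.0: the outliers are iteratively rejected
```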
@@ -1,17 +1,17 @@
"""
Tests for StatisticsExtractor and related functions
Tests for StatisticsAggregator and related functions
"""

import numpy as np
import pytest
from astropy.table import Table
from astropy.time import Time

from ctapipe.monitoring.extractor import PlainExtractor, SigmaClippingExtractor
from ctapipe.monitoring.aggregator import PlainAggregator, SigmaClippingAggregator


def test_extractors(example_subarray):
"""test basic functionality of the StatisticsExtractors"""
def test_aggregators(example_subarray):
"""test basic functionality of the StatisticsAggregators"""

# Create dummy data for testing
times = Time(
@@ -35,23 +35,25 @@ def test_extractors(example_subarray):
[times, event_ids, time_data],
names=("time_mono", "event_id", "peak_time"),
)
# Initialize the extractors
# Initialize the aggregators
chunk_size = 2500
ped_extractor = SigmaClippingExtractor(
ped_aggregator = SigmaClippingAggregator(
subarray=example_subarray, chunk_size=chunk_size
)
ff_charge_extractor = SigmaClippingExtractor(
ff_charge_aggregator = SigmaClippingAggregator(
subarray=example_subarray, chunk_size=chunk_size
)
ff_time_aggregator = PlainAggregator(
subarray=example_subarray, chunk_size=chunk_size
)
ff_time_extractor = PlainExtractor(subarray=example_subarray, chunk_size=chunk_size)

# Extract the statistical values
ped_stats = ped_extractor(table=ped_table)
charge_stats = ff_charge_extractor(table=charge_table)
time_stats = ff_time_extractor(table=time_table, col_name="peak_time")
# Compute the statistical values
ped_stats = ped_aggregator(table=ped_table)
charge_stats = ff_charge_aggregator(table=charge_table)
time_stats = ff_time_aggregator(table=time_table, col_name="peak_time")

# Check if the start and end values are properly set for the timestamps and event IDs
# and if the number of events used for the extraction of the statistics is equal the size of the chunk
# and if the number of events used for the computation of aggregated statistic values is equal to the size of the chunk
assert ped_stats[0]["time_start"] == times[0]
assert time_stats[0]["event_id_start"] == event_ids[0]
assert ped_stats[1]["time_end"] == times[-1]
@@ -88,25 +90,25 @@ def test_chunk_shift(example_subarray):
[times, event_ids, charge_data],
names=("time_mono", "event_id", "image"),
)
# Initialize the extractor
extractor = SigmaClippingExtractor(subarray=example_subarray, chunk_size=2500)
# Extract the statistical values
chunk_stats = extractor(table=charge_table)
chunk_stats_shift = extractor(table=charge_table, chunk_shift=2000)
# Check if three chunks are used for the extraction as the last chunk overflows
# Initialize the aggregator
aggregator = SigmaClippingAggregator(subarray=example_subarray, chunk_size=2500)
# Compute aggregated statistic values
chunk_stats = aggregator(table=charge_table)
chunk_stats_shift = aggregator(table=charge_table, chunk_shift=2000)
# Check if three chunks are used for the computation of aggregated statistic values as the last chunk overflows
assert len(chunk_stats) == 3
# Check if two chunks are used for the extraction as the last chunk is dropped
# Check if two chunks are used for the computation of aggregated statistic values as the last chunk is dropped
assert len(chunk_stats_shift) == 2
# Check if ValueError is raised when the chunk_size is larger than the length of table
with pytest.raises(ValueError):
_ = extractor(table=charge_table[1000:1500])
_ = aggregator(table=charge_table[1000:1500])
# Check if ValueError is raised when the chunk_shift is smaller than the chunk_size
with pytest.raises(ValueError):
_ = extractor(table=charge_table, chunk_shift=3000)
_ = aggregator(table=charge_table, chunk_shift=3000)


def test_with_outliers(example_subarray):
"""test the robustness of the extractors in the presence of outliers"""
"""test the robustness of the aggregators in the presence of outliers"""

# Create dummy data for testing
times = Time(
@@ -125,19 +127,19 @@ def test_with_outliers(example_subarray):
[times, event_ids, ped_data],
names=("time_mono", "event_id", "image"),
)
# Initialize the extractors
sigmaclipping_extractor = SigmaClippingExtractor(
# Initialize the aggregators
sigmaclipping_aggregator = SigmaClippingAggregator(
subarray=example_subarray, chunk_size=2500
)
plain_extractor = PlainExtractor(subarray=example_subarray, chunk_size=2500)
plain_aggregator = PlainAggregator(subarray=example_subarray, chunk_size=2500)

# Extract the statistical values
sigmaclipping_chunk_stats = sigmaclipping_extractor(table=ped_table)
plain_chunk_stats = plain_extractor(table=ped_table)
# Compute aggregated statistic values
sigmaclipping_chunk_stats = sigmaclipping_aggregator(table=ped_table)
plain_chunk_stats = plain_aggregator(table=ped_table)

# Check if SigmaClippingExtractor is robust to a few fake outliers as expected
# Check if SigmaClippingAggregator is robust to a few fake outliers as expected
np.testing.assert_allclose(sigmaclipping_chunk_stats[0]["mean"], 2.0, atol=1.5)

# Check if PlainExtractor is not robust to a few fake outliers as expected
# Check if PlainAggregator is not robust to a few fake outliers as expected
with pytest.raises(AssertionError):
np.testing.assert_allclose(plain_chunk_stats[0]["mean"], 2.0, atol=1.5)
