From 205b2bb2330ae2c86dd4b2d68924ae98a53949dd Mon Sep 17 00:00:00 2001 From: Reuven Gonzales Date: Fri, 25 Oct 2024 15:14:23 -0700 Subject: [PATCH] Improve Metrics Mesh README (#2419) * Updates readme and some macros to ensure ease of use * fix * fix --- warehouse/metrics_mesh/README.md | 155 ++++++++++++++++++ .../oso_metrics/change_in_developers.sql | 20 ++- .../contributor_activity_classification.sql | 12 +- .../developer_activity_classification.sql | 12 +- .../metrics_tools/lib/factories/factory.py | 4 +- .../metrics_tools/lib/factories/macros.py | 10 +- 6 files changed, 189 insertions(+), 24 deletions(-) diff --git a/warehouse/metrics_mesh/README.md b/warehouse/metrics_mesh/README.md index 4dce3977a..7355c1a2c 100644 --- a/warehouse/metrics_mesh/README.md +++ b/warehouse/metrics_mesh/README.md @@ -32,3 +32,158 @@ cd warehouse/metrics_mesh sqlmesh plan dev --start 2024-07-01 --end 2024-08-01 # to run for specific date rates (fast) sqlmesh plan # to run the entire pipeline (slow) ``` + +## Metrics Overview + +Metrics are generated using the tools in `metrics_tools`. + +### Essential Concepts + +- Metrics Definition + - These are the `.sql` files that act as the starting point for generating + metrics models. The metrics models are generated by taking all + permutations of aggregation intervals and entity types for the given metric. + At this time this isn't a fully generic semantic model but is a pragmatic + approximation for the OSO use case. +- Metric Types + - _time aggregation_ - This is an aggregation that operates by aggregating + within a specific bucket of time. We may sometimes call this a "normal" + aggregation. + - _rolling window aggregation_ - These are aggregations that look back some + user defined number of days at an interval specified by a `cron` parameter + in a rolling fashion. 
So if you have a 30 day rolling window with a daily + cron, the final table is generated by iterating through each day and + aggregating the previous 30 days of each day. +- Entity Types + - These are related to the OSO entities artifact, project and collection. The + default entity type is artifact and _every_ metric model should be written, + for now, as if the aggregation is operating on just an artifact. +- Metric Time Range + - Regardless of the aggregation type the metrics will be passed in start/end times in the form of a custom sqlmesh macro (described below). These start/end times are based on the rolling window or the time aggregation. If the rolling window i + +## Metrics Special Macros + +### `@metrics_sample_date` + +Derives the correct sample date for a metric based on the metric type (normal +aggregation or rolling window aggregation). This is essential to use for first +order metrics. + +Usage: + +``` +@metrics_sample_date(event_table_source.event_table_date_column) +``` + +The passed in date should be the date column of a given time series event +source. In all cases that must use this, this is just +`events_daily_to_artifact.bucket_day`. + +### `@metrics_start` and `@metrics_end` + +For rolling windows or time aggregations that have boundaries that correlate to +times, these provide the proper time ranges. So for a rolling window of 30 days +this will give a 30 day time window where the start day is 30 days _before_ the +interval for the current increment of the given model. So for example if the +increment is `2024-01-30` the start will be `2024-01-01` and the end will be +`2024-01-30`. These macros take a single argument which is the data type to use +for the returned value. + +Usage: + +For a date + +``` +@metrics_start(DATE) +``` + +For a timestamp (the value you use for type will depend on the dialect you're +using) + +``` +@metrics_start(TIMESTAMP) +``` + +### `@metrics_name` + +The metrics name is used to generate a name for a given metric. 
By default this +can be used without any arguments and the metrics model factory will +automatically assign a metric name. However in cases like the developer +classifications we want to create multiple types of metrics in a given query. We +can accomplish this simply by using this macro. The first argument is the "prefix" name +of the metric. This macro will then generate the appropriate suffix. For time +aggregations this will simply be something like `daily`, `weekly`, or `monthly`. +For rolling windows this will be something like `_over_30_days` or +`_over_3_months` (the exact string depends on the rolling window configuration). + +### `@metrics_entity_type_col` + +Provides a way to reference the generated column for a given entity type in a +metrics model. This is required because the metrics definition only acts as the +starting point of a metric from the perspective of an artifact. To get metrics +for projects and collections we need this macro while we aren't doing a fully +generic semantic model. The macro resolves the entity column name based on the currently queried +entity type. When queries are being automatically generated, entity types are +changed so the column used to reference the entity column also changes. This +macro allows for the queries to succeed by accepting a format string where the +variable `entity_type` referenced as `{entity_type}` in the format string is +interpreted as the current entity type (`artifact`, `project`, `collection`). 
+ +Usage: + +``` +@metrics_entity_type_col("to_{entity_type}_id") +``` + +If a query uses it like so: + +``` +SELECT + @metrics_entity_type_col("to_{entity_type}_id", t) +FROM table as t +``` + +The following SQL would be rendered for different entity types (the joins are +approximated for example only) + +Artifact: + +``` +SELECT + t.to_artifact_id +FROM table as t +``` + +Project: + +``` +SELECT + artifact_to_project.project_id as to_project_id +FROM table as t +JOIN artifact_to_project as artifact_to_project + ON artifact_to_project.artifact_id = t.to_artifact_id +``` + +Collection: + +``` +SELECT + project_to_collection.collection_id as to_collection_id +FROM table as t +JOIN artifact_to_project as artifact_to_project + ON artifact_to_project.artifact_id = t.to_artifact_id +JOIN project_to_collection as project_to_collection + ON project_to_collection.to_project_id = artifact_to_project.to_project_id + +``` + +### `@metrics_entity_type_alias` + +Like `@metrics_entity_type_col`, but programmatically aliases any input +expression with one that derives from the entity type. 
+ +Usage: + +``` +@metrics_entity_type_alias(some_column, 'some_{entity_type}_column') +``` diff --git a/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql b/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql index d2ef81c62..e41d74372 100644 --- a/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql +++ b/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql @@ -1,7 +1,10 @@ WITH latest AS ( SELECT classification.metrics_sample_date, classification.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := classification), + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := classification + ), classification.metric, classification.amount FROM metrics_peer_ref( @@ -19,7 +22,10 @@ WITH latest AS ( previous AS ( SELECT classification.metrics_sample_date, classification.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := classification), + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := classification + ), classification.metric, classification.amount FROM metrics_peer_ref( @@ -36,12 +42,12 @@ previous AS ( ) select @metrics_end(DATE) as metrics_sample_date, COALESCE(latest.event_source, previous.event_source) as event_source, - @metrics_alias_by_entity_type( + @metrics_entity_type_alias( COALESCE( - @metrics_entity_type_col('to_%s_id', table_alias := latest), - @metrics_entity_type_col('to_%s_id', table_alias := previous) + @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) ), - 'to_%s_id' + 'to_{entity_type}_id' ), '' as from_artifact_id, @metrics_name( @@ -53,5 +59,5 @@ select @metrics_end(DATE) as metrics_sample_date, latest.amount - previous.amount as amount FROM previous LEFT JOIN latest ON latest.event_source = previous.event_source - AND @metrics_entity_type_col('to_%s_id', table_alias := latest) = @metrics_entity_type_col('to_%s_id', table_alias := previous) + AND 
@metrics_entity_type_col('to_{entity_type}_id', table_alias := latest) = @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) AND latest.metric = previous.metric \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql b/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql index 72826d878..7982384d2 100644 --- a/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql +++ b/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql @@ -1,6 +1,6 @@ select active.metrics_sample_date, active.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), '' as from_artifact_id, @metric_name('full_time_contributors') as metric, COUNT(DISTINCT active.from_artifact_id) as amount @@ -12,13 +12,13 @@ from metrics_peer_ref( where active.amount / @rolling_window >= @full_time_ratio group by metric, from_artifact_id, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), event_source, metrics_sample_date union all select active.metrics_sample_date, active.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), '' as from_artifact_id, @metric_name('part_time_contributors') as metric, COUNT(DISTINCT active.from_artifact_id) as amount @@ -30,13 +30,13 @@ from metrics_peer_ref( where active.amount / @rolling_window < @full_time_ratio group by metric, from_artifact_id, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), event_source, metrics_sample_date union all select active.metrics_sample_date, active.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := active), + 
@metrics_entity_type_col('to_{entity_type}_id', table_alias := active), '' as from_artifact_id, @metric_name('active_contributors') as metric, COUNT(DISTINCT active.from_artifact_id) as amount @@ -47,6 +47,6 @@ from metrics_peer_ref( ) as active group by metric, from_artifact_id, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), event_source, metrics_sample_date \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/developer_activity_classification.sql b/warehouse/metrics_mesh/oso_metrics/developer_activity_classification.sql index 93305f66a..6f85e2c31 100644 --- a/warehouse/metrics_mesh/oso_metrics/developer_activity_classification.sql +++ b/warehouse/metrics_mesh/oso_metrics/developer_activity_classification.sql @@ -1,6 +1,6 @@ select active.metrics_sample_date, active.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), '' as from_artifact_id, @metric_name('full_time_developers') as metric, COUNT(DISTINCT active.from_artifact_id) as amount @@ -12,13 +12,13 @@ from metrics_peer_ref( where active.amount / @rolling_window >= @full_time_ratio group by metric, from_artifact_id, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), event_source, metrics_sample_date union all select active.metrics_sample_date, active.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), '' as from_artifact_id, @metric_name('part_time_developers') as metric, COUNT(DISTINCT active.from_artifact_id) as amount @@ -30,13 +30,13 @@ from metrics_peer_ref( where active.amount / @rolling_window < @full_time_ratio group by metric, from_artifact_id, - @metrics_entity_type_col('to_%s_id', table_alias := active), + 
@metrics_entity_type_col('to_{entity_type}_id', table_alias := active), event_source, metrics_sample_date union all select active.metrics_sample_date, active.event_source, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), '' as from_artifact_id, @metric_name('active_developers') as metric, COUNT(DISTINCT active.from_artifact_id) as amount @@ -47,6 +47,6 @@ from metrics_peer_ref( ) as active group by metric, from_artifact_id, - @metrics_entity_type_col('to_%s_id', table_alias := active), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), event_source, metrics_sample_date \ No newline at end of file diff --git a/warehouse/metrics_tools/lib/factories/factory.py b/warehouse/metrics_tools/lib/factories/factory.py index 790b7c889..93a0fbdb5 100644 --- a/warehouse/metrics_tools/lib/factories/factory.py +++ b/warehouse/metrics_tools/lib/factories/factory.py @@ -22,7 +22,7 @@ metrics_sample_date, metrics_start, relative_window_sample_date, - metrics_alias_by_entity_type, + metrics_entity_type_alias, ) CURR_DIR = os.path.dirname(__file__) @@ -120,7 +120,7 @@ def generate_models_from_query( columns = METRICS_COLUMNS_BY_ENTITY[ref["entity_type"]] additional_macros = [ metrics_entity_type_col, - metrics_alias_by_entity_type, + metrics_entity_type_alias, relative_window_sample_date, (metrics_name, ["metric_name"]), metrics_sample_date, diff --git a/warehouse/metrics_tools/lib/factories/macros.py b/warehouse/metrics_tools/lib/factories/macros.py index 563a7febc..6a25d1d31 100644 --- a/warehouse/metrics_tools/lib/factories/macros.py +++ b/warehouse/metrics_tools/lib/factories/macros.py @@ -236,15 +236,19 @@ def metrics_entity_type_col( names.append(table_alias.this) else: names.append(table_alias.sql()) - column_name = format_str % evaluator.locals.get("entity_type", "artifact") + column_name = format_str.format( + entity_type=evaluator.locals.get("entity_type", 
"artifact") + ) names.append(column_name) return sqlglot.to_column(f"{'.'.join(names)}") -def metrics_alias_by_entity_type( +def metrics_entity_type_alias( evaluator: MacroEvaluator, to_alias: exp.Expression, format_str: str ): if isinstance(format_str, exp.Literal): format_str = format_str.this - alias_name = format_str % evaluator.locals.get("entity_type", "artifact") + alias_name = format_str.format( + entity_type=evaluator.locals.get("entity_type", "artifact") + ) return exp.alias_(to_alias, alias_name)