diff --git a/warehouse/metrics_mesh/models/artifacts_by_project_v1.sql b/warehouse/metrics_mesh/models/artifacts_by_project_v1.sql index a90c9d763..9b8f2c909 100644 --- a/warehouse/metrics_mesh/models/artifacts_by_project_v1.sql +++ b/warehouse/metrics_mesh/models/artifacts_by_project_v1.sql @@ -1,11 +1,10 @@ --- Mirrors the artifacts_by_project_v1 table in the source database. This is --- important for situations like trino and bigquery connections. As trino has no --- ways to optimize queries to bigquery since it's using the storage api +/* Mirrors the artifacts_by_project_v1 table in the source database. This is important for setups such as Trino querying BigQuery: Trino has no way to optimize queries against BigQuery because it reads through the storage API. */ MODEL ( name metrics.artifacts_by_project_v1, kind FULL ); -select + +SELECT artifact_id, artifact_source_id, artifact_source, @@ -15,4 +14,4 @@ select project_source, project_namespace, project_name -from @oso_source('artifacts_by_project_v1') \ No newline at end of file +FROM @oso_source('artifacts_by_project_v1') \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/code/first_contribution_to_artifact.sql b/warehouse/metrics_mesh/models/code/first_contribution_to_artifact.sql new file mode 100644 index 000000000..d593e6949 --- /dev/null +++ b/warehouse/metrics_mesh/models/code/first_contribution_to_artifact.sql @@ -0,0 +1,24 @@ +/* +TODO: This is a hack for now to fix performance issues with the contributor +classifications. 
We should use some kind of factory for this in the future to +get all dimensions +*/ +MODEL ( + name metrics.first_contribution_to_artifact, + kind FULL, + partitioned_by (YEAR("time"), "event_source"), + grain (time, event_source, from_artifact_id, to_artifact_id) +); + +SELECT + MIN(time) AS time, + event_source, + from_artifact_id, + to_artifact_id +FROM metrics.first_of_event_from_artifact +WHERE + event_type IN ('COMMIT_CODE', 'ISSUE_OPENED', 'PULL_REQUEST_OPENED', 'PULL_REQUEST_MERGED') +GROUP BY + event_source, + from_artifact_id, + to_artifact_id \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/code/first_contribution_to_collection.sql b/warehouse/metrics_mesh/models/code/first_contribution_to_collection.sql new file mode 100644 index 000000000..8db5c54c2 --- /dev/null +++ b/warehouse/metrics_mesh/models/code/first_contribution_to_collection.sql @@ -0,0 +1,24 @@ +/* +TODO: This is a hack for now to fix performance issues with the contributor +classifications. 
We should use some kind of factory for this in the future to +get all dimensions +*/ +MODEL ( + name metrics.first_contribution_to_collection, + kind FULL, + partitioned_by (YEAR("time"), "event_source"), + grain (time, event_source, from_artifact_id, to_collection_id) +); + +SELECT + MIN(time) AS time, + first_contribution_to_project.event_source, + first_contribution_to_project.from_artifact_id, + projects_by_collection_v1.collection_id AS to_collection_id +FROM metrics.first_contribution_to_project +INNER JOIN metrics.projects_by_collection_v1 + ON first_contribution_to_project.to_project_id = projects_by_collection_v1.project_id +GROUP BY + event_source, + from_artifact_id, + to_collection_id \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/code/first_contribution_to_project.sql b/warehouse/metrics_mesh/models/code/first_contribution_to_project.sql new file mode 100644 index 000000000..c52bc28fe --- /dev/null +++ b/warehouse/metrics_mesh/models/code/first_contribution_to_project.sql @@ -0,0 +1,24 @@ +/* +TODO: This is a hack for now to fix performance issues with the contributor +classifications. 
We should use some kind of factory for this in the future to +get all dimensions +*/ +MODEL ( + name metrics.first_contribution_to_project, + kind FULL, + partitioned_by (YEAR("time"), "event_source"), + grain (time, event_source, from_artifact_id, to_project_id) +); + +SELECT + MIN(first_contribution_to_artifact.time) AS time, + first_contribution_to_artifact.event_source, + first_contribution_to_artifact.from_artifact_id, + artifacts_by_project_v1.project_id AS to_project_id +FROM metrics.first_contribution_to_artifact +INNER JOIN metrics.artifacts_by_project_v1 + ON first_contribution_to_artifact.to_artifact_id = artifacts_by_project_v1.artifact_id +GROUP BY + event_source, + from_artifact_id, + to_project_id \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/code/issue_event_time_deltas.sql b/warehouse/metrics_mesh/models/code/issue_event_time_deltas.sql index 9c8111f25..6f7980b46 100644 --- a/warehouse/metrics_mesh/models/code/issue_event_time_deltas.sql +++ b/warehouse/metrics_mesh/models/code/issue_event_time_deltas.sql @@ -1,5 +1,4 @@ --- Model that records the delta (in seconds) since the creation of the issue or --- pr. +/* Model that records the delta (in seconds) since the creation of the issue or PR. */
MODEL ( name metrics.issue_event_time_deltas, kind INCREMENTAL_BY_TIME_RANGE ( @@ -9,38 +8,31 @@ MODEL ( ), start '2015-01-01', cron '@daily', - partitioned_by (day("time"), "event_type"), - grain ( - time, - event_type, - event_source, - from_artifact_id, - to_artifact_id - ) + partitioned_by (DAY("time"), "event_type"), + grain (time, event_type, event_source, from_artifact_id, to_artifact_id) ); -select + +SELECT "time", event_type, event_source, - @oso_id( - event_source, - to_artifact_id, - issue_number - ) as issue_id, + @oso_id(event_source, to_artifact_id, issue_number) AS issue_id, issue_number, to_artifact_id, from_artifact_id, - created_at::timestamp, - merged_at::timestamp, - closed_at::timestamp, - date_diff('second', created_at, "time")::double as created_delta, - case - when merged_at is null then null - else date_diff('second', merged_at, "time") - end::double as merged_delta, - case - when closed_at is null then null - else date_diff('second', closed_at, "time") - end::double as closed_delta, + created_at::TIMESTAMP, + merged_at::TIMESTAMP, + closed_at::TIMESTAMP, + DATE_DIFF('SECOND', created_at, "time")::DOUBLE AS created_delta, + CASE + WHEN merged_at IS NULL + THEN NULL + ELSE DATE_DIFF('SECOND', merged_at, "time") + END::DOUBLE AS merged_delta, + CASE + WHEN closed_at IS NULL + THEN NULL + ELSE DATE_DIFF('SECOND', closed_at, "time") + END::DOUBLE AS closed_delta, comments -from @oso_source('timeseries_events_aux_issues_by_artifact_v0') \ No newline at end of file +FROM @oso_source('timeseries_events_aux_issues_by_artifact_v0') \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/events_daily_to_artifact.sql b/warehouse/metrics_mesh/models/events_daily_to_artifact.sql index a05e992a2..ac40d881d 100644 --- a/warehouse/metrics_mesh/models/events_daily_to_artifact.sql +++ b/warehouse/metrics_mesh/models/events_daily_to_artifact.sql @@ -7,33 +7,32 @@ MODEL ( ), start '2015-01-01', cron '@daily', - partitioned_by 
(day("bucket_day"), "event_type"), - grain ( - bucket_day, - event_type, - event_source, - from_artifact_id, - to_artifact_id - ), + partitioned_by (DAY("bucket_day"), "event_type"), + grain (bucket_day, event_type, event_source, from_artifact_id, to_artifact_id) ); + WITH events AS ( - SELECT DISTINCT from_artifact_id, + SELECT DISTINCT + from_artifact_id, to_artifact_id, event_source, event_type, time, amount - from @oso_source('timeseries_events_by_artifact_v0') - where CAST(time AS DATE) between STR_TO_DATE(@start_ds, '%Y-%m-%d')::Date and STR_TO_DATE(@end_ds, '%Y-%m-%d')::Date + FROM @oso_source('timeseries_events_by_artifact_v0') + WHERE + time::DATE BETWEEN STRPTIME(@start_ds, '%Y-%m-%d')::DATE::DATE AND STRPTIME(@end_ds, '%Y-%m-%d')::DATE::DATE ) -SELECT from_artifact_id, +SELECT + from_artifact_id, to_artifact_id, event_source, event_type, DATE_TRUNC('DAY', time::DATE) AS bucket_day, SUM(amount) AS amount FROM events -GROUP BY from_artifact_id, +GROUP BY + from_artifact_id, to_artifact_id, event_source, event_type, diff --git a/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql b/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql index cb3f09bfe..127ed17f8 100644 --- a/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql +++ b/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql @@ -3,26 +3,16 @@ MODEL ( kind FULL, start '2015-01-01', cron '@daily', - partitioned_by (day("bucket_day"), "event_type"), - grain ( - bucket_day, - event_type, - event_source, - from_artifact_id, - to_artifact_id - ), + partitioned_by (DAY("bucket_day"), "event_type"), + grain (bucket_day, event_type, event_source, from_artifact_id, to_artifact_id) ); -SELECT bucket_day, + +SELECT + bucket_day, to_artifact_id, from_artifact_id, event_source, event_type, amount, - LAG(bucket_day) OVER ( - PARTITION BY to_artifact_id, - from_artifact_id, - event_source, - event_type - ORDER BY bucket_day - ) AS last_event + 
LAG(bucket_day) OVER (PARTITION BY to_artifact_id, from_artifact_id, event_source, event_type ORDER BY bucket_day) AS last_event FROM metrics.events_daily_to_artifact \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/first_of_event_from_artifact.sql b/warehouse/metrics_mesh/models/first_of_event_from_artifact.sql index 4184c7aae..70067941e 100644 --- a/warehouse/metrics_mesh/models/first_of_event_from_artifact.sql +++ b/warehouse/metrics_mesh/models/first_of_event_from_artifact.sql @@ -1,22 +1,19 @@ MODEL ( name metrics.first_of_event_from_artifact, kind FULL, - partitioned_by (year("time"), "event_type", "event_source"), - grain ( - time, - event_type, - event_source, - from_artifact_id, - to_artifact_id - ), + partitioned_by (YEAR("time"), "event_type", "event_source"), + grain (time, event_type, event_source, from_artifact_id, to_artifact_id) ); -select MIN(time) as time, + +SELECT + MIN(time) AS time, event_type, event_source, from_artifact_id, to_artifact_id -from @oso_source('timeseries_events_by_artifact_v0') -group by event_type, +FROM @oso_source('timeseries_events_by_artifact_v0') +GROUP BY + event_type, event_source, from_artifact_id, to_artifact_id \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/last_of_event_from_artifact.sql b/warehouse/metrics_mesh/models/last_of_event_from_artifact.sql index 737129812..fe527dbe2 100644 --- a/warehouse/metrics_mesh/models/last_of_event_from_artifact.sql +++ b/warehouse/metrics_mesh/models/last_of_event_from_artifact.sql @@ -1,22 +1,19 @@ MODEL ( name metrics.last_of_event_from_artifact, kind FULL, - partitioned_by (year("time"), "event_type", "event_source"), - grain ( - time, - event_type, - event_source, - from_artifact_id, - to_artifact_id - ), + partitioned_by (YEAR("time"), "event_type", "event_source"), + grain (time, event_type, event_source, from_artifact_id, to_artifact_id) ); -select MAX(time) as time, + +SELECT + MAX(time) AS time, event_type, event_source, 
from_artifact_id, to_artifact_id -from @oso_source('timeseries_events_by_artifact_v0') -group by event_type, +FROM @oso_source('timeseries_events_by_artifact_v0') +GROUP BY + event_type, event_source, from_artifact_id, to_artifact_id \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/metrics_v0.sql b/warehouse/metrics_mesh/models/metrics_v0.sql index 7ad169bfa..e864fe863 100644 --- a/warehouse/metrics_mesh/models/metrics_v0.sql +++ b/warehouse/metrics_mesh/models/metrics_v0.sql @@ -2,22 +2,26 @@ MODEL ( name metrics.metrics_v0, kind FULL ); + WITH unioned_metric_names AS ( - SELECT DISTINCT metric + SELECT DISTINCT + metric FROM metrics.timeseries_metrics_to_artifact UNION ALL - SELECT DISTINCT metric + SELECT DISTINCT + metric FROM metrics.timeseries_metrics_to_project UNION ALL - SELECT DISTINCT metric + SELECT DISTINCT + metric FROM metrics.timeseries_metrics_to_collection -), -all_timeseries_metric_names AS ( - SELECT DISTINCT metric +), all_timeseries_metric_names AS ( + SELECT DISTINCT + metric FROM unioned_metric_names -), -metrics_v0_no_casting AS ( - SELECT @oso_id('OSO', 'oso', metric) AS metric_id, +), metrics_v0_no_casting AS ( + SELECT + @oso_id('OSO', 'oso', metric) AS metric_id, 'OSO' AS metric_source, 'oso' AS metric_namespace, metric AS metric_name, @@ -28,13 +32,14 @@ metrics_v0_no_casting AS ( 'UNKNOWN' AS aggregation_function FROM all_timeseries_metric_names ) -select metric_id::varchar, - metric_source::varchar, - metric_namespace::varchar, - metric_name::varchar, - display_name::varchar, - description::varchar, - raw_definition::varchar, - definition_ref::varchar, - aggregation_function::varchar +SELECT + metric_id::TEXT, + metric_source::TEXT, + metric_namespace::TEXT, + metric_name::TEXT, + display_name::TEXT, + description::TEXT, + raw_definition::TEXT, + definition_ref::TEXT, + aggregation_function::TEXT FROM metrics_v0_no_casting \ No newline at end of file diff --git 
a/warehouse/metrics_mesh/models/projects_by_collection_v1.sql b/warehouse/metrics_mesh/models/projects_by_collection_v1.sql index 181bf199c..c92bb6ed9 100644 --- a/warehouse/metrics_mesh/models/projects_by_collection_v1.sql +++ b/warehouse/metrics_mesh/models/projects_by_collection_v1.sql @@ -1,11 +1,10 @@ --- Mirrors the projects_by_collection_v1 table in the source database. This is --- important for situations like trino and bigquery connections. As trino has no --- ways to optimize queries to bigquery since it's using the storage api +/* Mirrors the projects_by_collection_v1 table in the source database. This is important for setups such as Trino querying BigQuery: Trino has no way to optimize queries against BigQuery because it reads through the storage API. */ MODEL ( name metrics.projects_by_collection_v1, kind FULL ); -select + +SELECT project_id, project_source, project_namespace, @@ -14,4 +13,4 @@ select collection_source, collection_namespace, collection_name -from @oso_source('projects_by_collection_v1') \ No newline at end of file +FROM @oso_source('projects_by_collection_v1') \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/timeseries_metrics_by_artifact_v0.sql b/warehouse/metrics_mesh/models/timeseries_metrics_by_artifact_v0.sql index d3e3b6415..1b9a89f48 100644 --- a/warehouse/metrics_mesh/models/timeseries_metrics_by_artifact_v0.sql +++ b/warehouse/metrics_mesh/models/timeseries_metrics_by_artifact_v0.sql @@ -2,17 +2,20 @@ MODEL ( name metrics.timeseries_metrics_by_artifact_v0, kind VIEW ); + WITH all_timeseries_metrics_by_artifact AS ( - SELECT @oso_id('OSO', 'oso', metric) AS metric_id, + SELECT + @oso_id('OSO', 'oso', metric) AS metric_id, to_artifact_id AS artifact_id, metrics_sample_date AS sample_date, amount AS amount, NULL AS unit FROM metrics.timeseries_metrics_to_artifact ) -SELECT metric_id::varchar, - artifact_id::varchar, - sample_date::date, - amount::double, - unit::varchar +SELECT + metric_id::TEXT, 
artifact_id::TEXT, + sample_date::DATE, + amount::DOUBLE, + unit::TEXT FROM all_timeseries_metrics_by_artifact \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/timeseries_metrics_by_collection_v0.sql b/warehouse/metrics_mesh/models/timeseries_metrics_by_collection_v0.sql index e5969ace0..cac96af99 100644 --- a/warehouse/metrics_mesh/models/timeseries_metrics_by_collection_v0.sql +++ b/warehouse/metrics_mesh/models/timeseries_metrics_by_collection_v0.sql @@ -2,17 +2,20 @@ MODEL ( name metrics.timeseries_metrics_by_collection_v0, kind VIEW ); + WITH all_timeseries_metrics_by_collection AS ( - SELECT @oso_id('OSO', 'oso', metric) AS metric_id, + SELECT + @oso_id('OSO', 'oso', metric) AS metric_id, to_collection_id AS collection_id, metrics_sample_date AS sample_date, amount AS amount, NULL AS unit FROM metrics.timeseries_metrics_to_collection ) -SELECT metric_id::varchar, - collection_id::varchar, - sample_date::date, - amount::double, - unit::varchar +SELECT + metric_id::TEXT, + collection_id::TEXT, + sample_date::DATE, + amount::DOUBLE, + unit::TEXT FROM all_timeseries_metrics_by_collection \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/timeseries_metrics_by_project_v0.sql b/warehouse/metrics_mesh/models/timeseries_metrics_by_project_v0.sql index 7c6667655..4a0b3d651 100644 --- a/warehouse/metrics_mesh/models/timeseries_metrics_by_project_v0.sql +++ b/warehouse/metrics_mesh/models/timeseries_metrics_by_project_v0.sql @@ -2,17 +2,20 @@ MODEL ( name metrics.timeseries_metrics_by_project_v0, kind VIEW ); + WITH all_timeseries_metrics_by_project AS ( - SELECT @oso_id('OSO', 'oso', metric) AS metric_id, + SELECT + @oso_id('OSO', 'oso', metric) AS metric_id, to_project_id AS project_id, metrics_sample_date AS sample_date, amount AS amount, NULL AS unit FROM metrics.timeseries_metrics_to_project ) -SELECT metric_id::varchar, - project_id::varchar, - sample_date::date, - amount::double, - unit::varchar +SELECT + metric_id::TEXT, 
+ project_id::TEXT, + sample_date::DATE, + amount::DOUBLE, + unit::TEXT FROM all_timeseries_metrics_by_project \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql b/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql index c3ddd7902..756c1df8e 100644 --- a/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql +++ b/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql @@ -1,15 +1,27 @@ with first_of_activity_to_entity as ( -- We use this CTE to get the first of a specific type of event to a specific -- entity. - select MIN(time) as `time`, + -- select MIN(time) as `time`, + -- event_source, + -- from_artifact_id, + -- to_artifact_id + -- from metrics.first_of_event_from_artifact + -- where event_type in @activity_event_types + -- group by event_source, + -- from_artifact_id, + -- to_artifact_id + select + `time`, event_source, from_artifact_id, - to_artifact_id - from metrics.first_of_event_from_artifact - where event_type in @activity_event_types - group by event_source, - from_artifact_id, - to_artifact_id + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := first_contribution, + include_column_alias := true + ) + from @metrics_entity_type_table( + 'metrics.first_contribution_to_{entity_type}' + ) as first_contribution ), filtered_first_of as ( -- Filtered first of events to just the current period we are measuring. 
diff --git a/warehouse/metrics_tools/factory/factory.py b/warehouse/metrics_tools/factory/factory.py index 757d437d8..a9fc1af83 100644 --- a/warehouse/metrics_tools/factory/factory.py +++ b/warehouse/metrics_tools/factory/factory.py @@ -23,6 +23,7 @@ metrics_end, metrics_entity_type_alias, metrics_entity_type_col, + metrics_entity_type_table, metrics_name, metrics_peer_ref, metrics_sample_date, @@ -214,6 +215,7 @@ def _generate_metrics_queries( additional_macros = [ metrics_peer_ref, metrics_entity_type_col, + metrics_entity_type_table, metrics_entity_type_alias, relative_window_sample_date, (metrics_name, ["metric_name"]), diff --git a/warehouse/metrics_tools/macros/macros.py b/warehouse/metrics_tools/macros/macros.py index 5387fdb7b..461866846 100644 --- a/warehouse/metrics_tools/macros/macros.py +++ b/warehouse/metrics_tools/macros/macros.py @@ -1,17 +1,15 @@ import typing as t import sqlglot -from sqlglot import expressions as exp -from sqlmesh.core.dialect import MacroVar -from sqlmesh.core.macros import MacroEvaluator -from sqlmesh.core.dialect import parse_one - from metrics_tools.definition import ( PeerMetricDependencyRef, time_suffix, to_actual_table_name, ) from metrics_tools.utils import exp_literal_to_py_literal +from sqlglot import expressions as exp +from sqlmesh.core.dialect import MacroVar, parse_one +from sqlmesh.core.macros import MacroEvaluator def relative_window_sample_date( @@ -311,6 +309,16 @@ def metrics_entity_type_alias( return exp.alias_(to_alias, alias_name) +def metrics_entity_type_table(evaluator: MacroEvaluator, format_str: str): + """Return the table named by format_str after filling {entity_type} from the evaluator's locals (defaults to "artifact").""" + if isinstance(format_str, exp.Literal): + format_str = format_str.this + table_name = format_str.format( + entity_type=evaluator.locals.get("entity_type", "artifact") + ) + return sqlglot.to_table(table_name, quoted=True) + + def metrics_peer_ref( evaluator: MacroEvaluator, name: str,