Skip to content

Commit

Permalink
Metrics model performance tweaks (#2658)
Browse files Browse the repository at this point in the history
* Hopefully increase performance for metrics model

* sqlmesh format
  • Loading branch information
ravenac95 authored Dec 17, 2024
1 parent 76ae3d6 commit 51ab1e1
Show file tree
Hide file tree
Showing 17 changed files with 219 additions and 138 deletions.
9 changes: 4 additions & 5 deletions warehouse/metrics_mesh/models/artifacts_by_project_v1.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
-- Mirrors the artifacts_by_project_v1 table in the source database. This is
-- important for situations like trino and bigquery connections. As trino has no
-- ways to optimize queries to bigquery since it's using the storage api
/* Mirrors the artifacts_by_project_v1 table in the source database. This is important for setups such as a Trino-to-BigQuery connection, because Trino has no way to optimize queries to BigQuery since it uses the storage API. */
MODEL (
name metrics.artifacts_by_project_v1,
kind FULL
);
select

SELECT
artifact_id,
artifact_source_id,
artifact_source,
Expand All @@ -15,4 +14,4 @@ select
project_source,
project_namespace,
project_name
from @oso_source('artifacts_by_project_v1')
FROM @oso_source('artifacts_by_project_v1')
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
  Earliest event time for each (event_source, from_artifact_id, to_artifact_id)
  triple, taken over the COMMIT_CODE, ISSUE_OPENED, PULL_REQUEST_OPENED and
  PULL_REQUEST_MERGED rows of metrics.first_of_event_from_artifact.

  TODO: This is a hack for now to fix performance issues with the contributor
  classifications. We should use some kind of factory for this in the future to
  get all dimensions.
*/
MODEL (
  name metrics.first_contribution_to_artifact,
  kind FULL,
  partitioned_by (YEAR("time"), "event_source"),
  grain (time, event_source, from_artifact_id, to_artifact_id)
);

WITH contribution_events AS (
  /* Keep only the event types this model treats as contributions. */
  SELECT
    first_events.time,
    first_events.event_source,
    first_events.from_artifact_id,
    first_events.to_artifact_id
  FROM metrics.first_of_event_from_artifact AS first_events
  WHERE
    first_events.event_type IN ('COMMIT_CODE', 'ISSUE_OPENED', 'PULL_REQUEST_OPENED', 'PULL_REQUEST_MERGED')
)
/* Collapse the per-event-type rows into a single earliest time per triple. */
SELECT
  MIN(contribution_events.time) AS time,
  contribution_events.event_source,
  contribution_events.from_artifact_id,
  contribution_events.to_artifact_id
FROM contribution_events
GROUP BY
  contribution_events.event_source,
  contribution_events.from_artifact_id,
  contribution_events.to_artifact_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
  Earliest contribution time for each (event_source, from_artifact_id,
  to_collection_id). Every row of metrics.first_contribution_to_project is
  joined to metrics.projects_by_collection_v1 on its project id, and the
  minimum time is kept per collection.

  TODO: This is a hack for now to fix performance issues with the contributor
  classifications. We should use some kind of factory for this in the future to
  get all dimensions.
*/
MODEL (
  name metrics.first_contribution_to_collection,
  kind FULL,
  partitioned_by (YEAR("time"), "event_source"),
  grain (time, event_source, from_artifact_id, to_collection_id)
);

SELECT
  /* Qualified so the reference stays unambiguous across the join. */
  MIN(first_contribution_to_project.time) AS time,
  first_contribution_to_project.event_source,
  first_contribution_to_project.from_artifact_id,
  projects_by_collection_v1.collection_id AS to_collection_id
FROM metrics.first_contribution_to_project
INNER JOIN metrics.projects_by_collection_v1
  ON first_contribution_to_project.to_project_id = projects_by_collection_v1.project_id
/*
  Group by the qualified input columns rather than the output alias
  (to_collection_id): this does not rely on alias resolution in GROUP BY and
  cannot become ambiguous if either joined table gains a same-named column.
*/
GROUP BY
  first_contribution_to_project.event_source,
  first_contribution_to_project.from_artifact_id,
  projects_by_collection_v1.collection_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
  Earliest contribution time for each (event_source, from_artifact_id,
  to_project_id). Every row of metrics.first_contribution_to_artifact is
  joined to metrics.artifacts_by_project_v1 on the target artifact id, and the
  minimum time is kept per project.

  TODO: This is a hack for now to fix performance issues with the contributor
  classifications. We should use some kind of factory for this in the future to
  get all dimensions.
*/
MODEL (
  name metrics.first_contribution_to_project,
  kind FULL,
  partitioned_by (YEAR("time"), "event_source"),
  grain (time, event_source, from_artifact_id, to_project_id)
);

SELECT
  MIN(first_contribution_to_artifact.time) AS time,
  first_contribution_to_artifact.event_source,
  first_contribution_to_artifact.from_artifact_id,
  artifacts_by_project_v1.project_id AS to_project_id
FROM metrics.first_contribution_to_artifact
INNER JOIN metrics.artifacts_by_project_v1
  ON first_contribution_to_artifact.to_artifact_id = artifacts_by_project_v1.artifact_id
/*
  Group by the qualified input columns rather than the output alias
  (to_project_id): this does not rely on alias resolution in GROUP BY and
  cannot become ambiguous if either joined table gains a same-named column.
*/
GROUP BY
  first_contribution_to_artifact.event_source,
  first_contribution_to_artifact.from_artifact_id,
  artifacts_by_project_v1.project_id
50 changes: 21 additions & 29 deletions warehouse/metrics_mesh/models/code/issue_event_time_deltas.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
-- Model that records the delta (in seconds) since the creation of the issue or
-- pr.
/* Model that records the delta (in seconds) since the creation of the issue or PR. */
MODEL (
name metrics.issue_event_time_deltas,
kind INCREMENTAL_BY_TIME_RANGE (
Expand All @@ -9,38 +8,31 @@ MODEL (
),
start '2015-01-01',
cron '@daily',
partitioned_by (day("time"), "event_type"),
grain (
time,
event_type,
event_source,
from_artifact_id,
to_artifact_id
)
partitioned_by (DAY("time"), "event_type"),
grain (time, event_type, event_source, from_artifact_id, to_artifact_id)
);
select

SELECT
"time",
event_type,
event_source,
@oso_id(
event_source,
to_artifact_id,
issue_number
) as issue_id,
@oso_id(event_source, to_artifact_id, issue_number) AS issue_id,
issue_number,
to_artifact_id,
from_artifact_id,
created_at::timestamp,
merged_at::timestamp,
closed_at::timestamp,
date_diff('second', created_at, "time")::double as created_delta,
case
when merged_at is null then null
else date_diff('second', merged_at, "time")
end::double as merged_delta,
case
when closed_at is null then null
else date_diff('second', closed_at, "time")
end::double as closed_delta,
created_at::TIMESTAMP,
merged_at::TIMESTAMP,
closed_at::TIMESTAMP,
DATE_DIFF('SECOND', created_at, "time")::DOUBLE AS created_delta,
CASE
WHEN merged_at IS NULL
THEN NULL
ELSE DATE_DIFF('SECOND', merged_at, "time")
END::DOUBLE AS merged_delta,
CASE
WHEN closed_at IS NULL
THEN NULL
ELSE DATE_DIFF('SECOND', closed_at, "time")
END::DOUBLE AS closed_delta,
comments
from @oso_source('timeseries_events_aux_issues_by_artifact_v0')
FROM @oso_source('timeseries_events_aux_issues_by_artifact_v0')
25 changes: 12 additions & 13 deletions warehouse/metrics_mesh/models/events_daily_to_artifact.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,32 @@ MODEL (
),
start '2015-01-01',
cron '@daily',
partitioned_by (day("bucket_day"), "event_type"),
grain (
bucket_day,
event_type,
event_source,
from_artifact_id,
to_artifact_id
),
partitioned_by (DAY("bucket_day"), "event_type"),
grain (bucket_day, event_type, event_source, from_artifact_id, to_artifact_id)
);

WITH events AS (
SELECT DISTINCT from_artifact_id,
SELECT DISTINCT
from_artifact_id,
to_artifact_id,
event_source,
event_type,
time,
amount
from @oso_source('timeseries_events_by_artifact_v0')
where CAST(time AS DATE) between STR_TO_DATE(@start_ds, '%Y-%m-%d')::Date and STR_TO_DATE(@end_ds, '%Y-%m-%d')::Date
FROM @oso_source('timeseries_events_by_artifact_v0')
WHERE
time::DATE BETWEEN STRPTIME(@start_ds, '%Y-%m-%d')::DATE::DATE AND STRPTIME(@end_ds, '%Y-%m-%d')::DATE::DATE
)
SELECT from_artifact_id,
SELECT
from_artifact_id,
to_artifact_id,
event_source,
event_type,
DATE_TRUNC('DAY', time::DATE) AS bucket_day,
SUM(amount) AS amount
FROM events
GROUP BY from_artifact_id,
GROUP BY
from_artifact_id,
to_artifact_id,
event_source,
event_type,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,16 @@ MODEL (
kind FULL,
start '2015-01-01',
cron '@daily',
partitioned_by (day("bucket_day"), "event_type"),
grain (
bucket_day,
event_type,
event_source,
from_artifact_id,
to_artifact_id
),
partitioned_by (DAY("bucket_day"), "event_type"),
grain (bucket_day, event_type, event_source, from_artifact_id, to_artifact_id)
);
SELECT bucket_day,

SELECT
bucket_day,
to_artifact_id,
from_artifact_id,
event_source,
event_type,
amount,
LAG(bucket_day) OVER (
PARTITION BY to_artifact_id,
from_artifact_id,
event_source,
event_type
ORDER BY bucket_day
) AS last_event
LAG(bucket_day) OVER (PARTITION BY to_artifact_id, from_artifact_id, event_source, event_type ORDER BY bucket_day) AS last_event
FROM metrics.events_daily_to_artifact
19 changes: 8 additions & 11 deletions warehouse/metrics_mesh/models/first_of_event_from_artifact.sql
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
MODEL (
name metrics.first_of_event_from_artifact,
kind FULL,
partitioned_by (year("time"), "event_type", "event_source"),
grain (
time,
event_type,
event_source,
from_artifact_id,
to_artifact_id
),
partitioned_by (YEAR("time"), "event_type", "event_source"),
grain (time, event_type, event_source, from_artifact_id, to_artifact_id)
);
select MIN(time) as time,

SELECT
MIN(time) AS time,
event_type,
event_source,
from_artifact_id,
to_artifact_id
from @oso_source('timeseries_events_by_artifact_v0')
group by event_type,
FROM @oso_source('timeseries_events_by_artifact_v0')
GROUP BY
event_type,
event_source,
from_artifact_id,
to_artifact_id
19 changes: 8 additions & 11 deletions warehouse/metrics_mesh/models/last_of_event_from_artifact.sql
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
MODEL (
name metrics.last_of_event_from_artifact,
kind FULL,
partitioned_by (year("time"), "event_type", "event_source"),
grain (
time,
event_type,
event_source,
from_artifact_id,
to_artifact_id
),
partitioned_by (YEAR("time"), "event_type", "event_source"),
grain (time, event_type, event_source, from_artifact_id, to_artifact_id)
);
select MAX(time) as time,

SELECT
MAX(time) AS time,
event_type,
event_source,
from_artifact_id,
to_artifact_id
from @oso_source('timeseries_events_by_artifact_v0')
group by event_type,
FROM @oso_source('timeseries_events_by_artifact_v0')
GROUP BY
event_type,
event_source,
from_artifact_id,
to_artifact_id
41 changes: 23 additions & 18 deletions warehouse/metrics_mesh/models/metrics_v0.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,26 @@ MODEL (
name metrics.metrics_v0,
kind FULL
);

WITH unioned_metric_names AS (
SELECT DISTINCT metric
SELECT DISTINCT
metric
FROM metrics.timeseries_metrics_to_artifact
UNION ALL
SELECT DISTINCT metric
SELECT DISTINCT
metric
FROM metrics.timeseries_metrics_to_project
UNION ALL
SELECT DISTINCT metric
SELECT DISTINCT
metric
FROM metrics.timeseries_metrics_to_collection
),
all_timeseries_metric_names AS (
SELECT DISTINCT metric
), all_timeseries_metric_names AS (
SELECT DISTINCT
metric
FROM unioned_metric_names
),
metrics_v0_no_casting AS (
SELECT @oso_id('OSO', 'oso', metric) AS metric_id,
), metrics_v0_no_casting AS (
SELECT
@oso_id('OSO', 'oso', metric) AS metric_id,
'OSO' AS metric_source,
'oso' AS metric_namespace,
metric AS metric_name,
Expand All @@ -28,13 +32,14 @@ metrics_v0_no_casting AS (
'UNKNOWN' AS aggregation_function
FROM all_timeseries_metric_names
)
select metric_id::varchar,
metric_source::varchar,
metric_namespace::varchar,
metric_name::varchar,
display_name::varchar,
description::varchar,
raw_definition::varchar,
definition_ref::varchar,
aggregation_function::varchar
SELECT
metric_id::TEXT,
metric_source::TEXT,
metric_namespace::TEXT,
metric_name::TEXT,
display_name::TEXT,
description::TEXT,
raw_definition::TEXT,
definition_ref::TEXT,
aggregation_function::TEXT
FROM metrics_v0_no_casting
9 changes: 4 additions & 5 deletions warehouse/metrics_mesh/models/projects_by_collection_v1.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
-- Mirrors the projects_by_collection_v1 table in the source database. This is
-- important for situations like trino and bigquery connections. As trino has no
-- ways to optimize queries to bigquery since it's using the storage api
/* Mirrors the projects_by_collection_v1 table in the source database. This is important for setups such as a Trino-to-BigQuery connection, because Trino has no way to optimize queries to BigQuery since it uses the storage API. */
MODEL (
name metrics.projects_by_collection_v1,
kind FULL
);
select

SELECT
project_id,
project_source,
project_namespace,
Expand All @@ -14,4 +13,4 @@ select
collection_source,
collection_namespace,
collection_name
from @oso_source('projects_by_collection_v1')
FROM @oso_source('projects_by_collection_v1')
Loading

0 comments on commit 51ab1e1

Please sign in to comment.