Skip to content

Commit

Permalink
add: sqlmesh metrics (#2483)
Browse files Browse the repository at this point in the history
* add: `active_developers` sqlmesh metric

* add: `closed_issues` sqlmesh metric

* add: `contributors` sqlmesh metric

* add: `first_commit_date` sqlmesh metric

* add: `last_commit` sqlmesh metric

* add: `new_contributors` sqlmesh metric

* add: `repositories` sqlmesh metric

* add: developer_states sqlmesh metric models

* fix: remove `duplicated` model

* add: point in time and interval `models`

* Adds dbt point in time metrics from source data

* Disable first and last commits for now

* Add point_in_time_v0

* fix

* more fixes and tests

---------

Co-authored-by: Reuven V. Gonzales <[email protected]>
  • Loading branch information
Jabolol and ravenac95 authored Nov 22, 2024
1 parent 2d9c4f2 commit bf53eb7
Show file tree
Hide file tree
Showing 13 changed files with 297 additions and 15 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{#
  Exposes point in time metrics exactly as they were captured from the raw
  sources. No point in time aggregations (running sums/averages) happen here.

  Only some of the collected data carries historical state information. Where
  it does, it is included here for greater accuracy when rendering metrics.

  Particularly useful for:
    * Star Count (STARRED events alone don't capture accurate star counts)
    * Watcher Count
    * Repository Count (an aggregated metric for a project/collection)

  Other things will likely be useful here in the future, but for now this only
  covers repository related metrics that aren't timeseries by nature.
#}
SELECT
  pit.`time`,
  pit.artifact_source,
  {{ oso_id("artifact_source", "artifact_source_id") }} AS artifact_id,
  pit.metric,
  pit.amount
FROM {{ ref("stg_ossd__repository_point_in_time") }} AS pit
14 changes: 14 additions & 0 deletions warehouse/dbt/models/marts/point_in_time/point_in_time_v0.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{#
  Mart model that republishes the columns of int_point_in_time_from_sources
  unchanged. The `meta` config flags the model with `sync_to_db` and declares
  the ordering columns consumed by whatever reads that metadata.
#}
{{
  config(
    meta = {
      'sync_to_db': True,
      'order_by': [ 'artifact_source', 'metric', 'artifact_id', 'time' ]
    }
  )
}}

SELECT
  pit.time,
  pit.artifact_source,
  pit.artifact_id,
  pit.metric,
  pit.amount
FROM {{ ref("int_point_in_time_from_sources") }} AS pit
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{#
  Point in time view for repository data.

  Each source repository row is unpivoted into three rows, one per counter
  (fork_count, star_count, watcher_count), stamped with the row's
  ingestion_time.
#}
SELECT
  repo.ingestion_time AS `time`,
  "GITHUB" AS artifact_source,
  "REPOSITORY" AS artifact_type,
  repo.owner AS artifact_namespace,
  repo.name AS artifact_name,
  repo.id AS artifact_source_id,
  unpivoted.metric AS metric,
  unpivoted.amount AS amount
FROM {{ oso_source('ossd', 'repositories') }} AS repo
CROSS JOIN
  UNNEST([
    STRUCT("fork_count" AS metric, repo.fork_count AS amount),
    STRUCT("star_count" AS metric, repo.star_count AS amount),
    STRUCT("watcher_count" AS metric, repo.watcher_count AS amount)
  ]) AS unpivoted
80 changes: 74 additions & 6 deletions warehouse/metrics_mesh/models/metrics_factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@
ref="gas_fees.sql",
time_aggregations=["daily", "weekly", "monthly"],
),
"repositories": MetricQueryDef(
ref="repositories.sql",
time_aggregations=["daily", "weekly", "monthly"],
),
"contributors": MetricQueryDef(
ref="contributors.sql",
time_aggregations=["daily", "weekly", "monthly"],
),
"active_developers": MetricQueryDef(
ref="active_developers.sql",
time_aggregations=["daily", "weekly", "monthly"],
),
# This defines something with a rolling option that allows you to look back
# to some arbitrary window. So you specify the window and specify the unit.
# The unit and the window are used to pass in variables to the query. So it's
Expand All @@ -57,7 +69,7 @@
"activity_event_types": ["COMMIT_CODE"],
},
rolling=RollingConfig(
windows=[30, 60, 90],
windows=[30, 90, 180],
unit="day",
cron="@daily", # This determines how often this is calculated
),
Expand All @@ -70,7 +82,7 @@
"full_time_ratio": 10 / 30,
},
rolling=RollingConfig(
windows=[30, 60, 90],
windows=[30, 90, 180],
unit="day",
cron="@daily",
),
Expand All @@ -79,12 +91,12 @@
ref="contributor_activity_classification.sql",
vars={"full_time_ratio": 10 / 30},
rolling=RollingConfig(
windows=[30, 60, 90],
windows=[30, 90, 180],
unit="day",
cron="@daily",
),
),
"change_in_30_developer_activity": MetricQueryDef(
"change_in_30_day_developer_activity": MetricQueryDef(
vars={
"comparison_interval": 30,
},
Expand All @@ -95,9 +107,9 @@
cron="@daily",
),
),
"change_in_60_developer_activity": MetricQueryDef(
"change_in_90_day_developer_activity": MetricQueryDef(
vars={
"comparison_interval": 60,
"comparison_interval": 90,
},
ref="change_in_developers.sql",
rolling=RollingConfig(
Expand All @@ -106,6 +118,62 @@
cron="@daily",
),
),
"change_in_180_day_developer_activity": MetricQueryDef(
vars={
"comparison_interval": 180,
},
ref="change_in_developers.sql",
rolling=RollingConfig(
windows=[2],
unit="period",
cron="@daily",
),
),
"commits_rolling": MetricQueryDef(
ref="commits.sql",
rolling=RollingConfig(
windows=[180],
unit="day",
cron="@daily",
),
entity_types=["artifact", "project", "collection"],
),
"opened_pull_requests": MetricQueryDef(
ref="prs_opened.sql",
rolling=RollingConfig(
windows=[180],
unit="day",
cron="@daily",
),
entity_types=["artifact", "project", "collection"],
),
"merged_pull_requests": MetricQueryDef(
ref="prs_merged.sql",
rolling=RollingConfig(
windows=[180],
unit="day",
cron="@daily",
),
entity_types=["artifact", "project", "collection"],
),
"opened_issues": MetricQueryDef(
ref="issues_opened.sql",
rolling=RollingConfig(
windows=[180],
unit="day",
cron="@daily",
),
entity_types=["artifact", "project", "collection"],
),
"closed_issues_6_months": MetricQueryDef(
ref="issues_closed.sql",
rolling=RollingConfig(
windows=[180],
unit="day",
cron="@daily",
),
entity_types=["artifact", "project", "collection"],
),
},
default_dialect="clickhouse",
)
14 changes: 14 additions & 0 deletions warehouse/metrics_mesh/oso_metrics/active_developers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Counts the distinct from_artifact_id values that have COMMIT_CODE events,
-- per sample-date bucket, to_artifact_id and event_source. Only bucket days
-- between the metrics start and end dates (inclusive) are considered.
SELECT
  @metrics_sample_date(events.bucket_day) AS metrics_sample_date,
  events.event_source,
  events.to_artifact_id AS to_artifact_id,
  '' AS from_artifact_id,
  @metric_name() AS metric,
  COUNT(DISTINCT events.from_artifact_id) AS amount
FROM metrics.events_daily_to_artifact AS events
WHERE
  events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
  AND events.event_type = 'COMMIT_CODE'
GROUP BY
  1,
  event_source,
  to_artifact_id,
  from_artifact_id,
  metric
19 changes: 19 additions & 0 deletions warehouse/metrics_mesh/oso_metrics/contributors.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Counts the distinct from_artifact_id values that committed code, opened an
-- issue, or opened/merged a pull request, per sample-date bucket,
-- to_artifact_id and event_source. Only bucket days between the metrics start
-- and end dates (inclusive) are considered.
SELECT
  @metrics_sample_date(events.bucket_day) AS metrics_sample_date,
  events.event_source,
  events.to_artifact_id AS to_artifact_id,
  '' AS from_artifact_id,
  @metric_name() AS metric,
  COUNT(DISTINCT events.from_artifact_id) AS amount
FROM metrics.events_daily_to_artifact AS events
WHERE
  events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
  AND events.event_type IN (
    'COMMIT_CODE',
    'ISSUE_OPENED',
    'PULL_REQUEST_OPENED',
    'PULL_REQUEST_MERGED'
  )
GROUP BY
  1,
  event_source,
  to_artifact_id,
  from_artifact_id,
  metric
17 changes: 17 additions & 0 deletions warehouse/metrics_mesh/oso_metrics/first_commit_date.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- TODO (@ravenac95): kept for now because it might prove useful, but first
-- commit data likely needs a different kind of model.
--
-- For each sample-date bucket, to_artifact_id and event_source this emits a
-- constant amount of 1 together with the earliest COMMIT_CODE bucket_day found
-- between the metrics start and end dates (inclusive).
SELECT
  @metrics_sample_date(events.bucket_day) AS metrics_sample_date,
  events.event_source,
  events.to_artifact_id AS to_artifact_id,
  '' AS from_artifact_id,
  @metric_name() AS metric,
  1 AS amount,
  MIN(events.bucket_day) AS first_commit_date
FROM metrics.events_daily_to_artifact AS events
WHERE
  events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
  AND events.event_type = 'COMMIT_CODE'
GROUP BY
  1,
  event_source,
  to_artifact_id,
  from_artifact_id,
  metric
17 changes: 17 additions & 0 deletions warehouse/metrics_mesh/oso_metrics/last_commit_date.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- TODO (@ravenac95): kept for now because it might prove useful, but last
-- commit data likely needs a different kind of model.
--
-- For each sample-date bucket, to_artifact_id and event_source this emits a
-- constant amount of 1 together with the latest COMMIT_CODE bucket_day found
-- between the metrics start and end dates (inclusive).
SELECT
  @metrics_sample_date(events.bucket_day) AS metrics_sample_date,
  events.event_source,
  events.to_artifact_id AS to_artifact_id,
  '' AS from_artifact_id,
  @metric_name() AS metric,
  1 AS amount,
  MAX(events.bucket_day) AS last_commit_date
FROM metrics.events_daily_to_artifact AS events
WHERE
  events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
  AND events.event_type = 'COMMIT_CODE'
GROUP BY
  1,
  event_source,
  to_artifact_id,
  from_artifact_id,
  metric
25 changes: 25 additions & 0 deletions warehouse/metrics_mesh/oso_metrics/repositories.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Counts distinct to_artifact_id values that received any of the listed
-- issue, pull request, star, fork or commit events, per sample-date bucket and
-- event_source. Only bucket days between the metrics start and end dates
-- (inclusive) are considered.
--
-- NOTE(review): amount is COUNT(DISTINCT to_artifact_id) while the query also
-- groups by to_artifact_id, so as written each output row looks like it will
-- carry amount = 1. Confirm this is intended (e.g. that the value only becomes
-- meaningful once rolled up to a wider entity).
select @metrics_sample_date(events.bucket_day) as metrics_sample_date,
  events.event_source,
  events.to_artifact_id as to_artifact_id,
  '' as from_artifact_id,
  @metric_name() as metric,
  COUNT(distinct events.to_artifact_id) as amount
from metrics.events_daily_to_artifact as events
where events.event_type in (
    'ISSUE_OPENED',
    'STARRED',
    'PULL_REQUEST_OPENED',
    'FORKED',
    'PULL_REQUEST_REOPENED',
    'PULL_REQUEST_CLOSED',
    'COMMIT_CODE',
    'ISSUE_REOPENED',
    'PULL_REQUEST_MERGED',
    -- No trailing comma after the last element: it is not portable SQL and
    -- the other metric queries do not use one.
    'ISSUE_CLOSED'
  )
  and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
group by 1,
  metric,
  from_artifact_id,
  to_artifact_id,
  event_source
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
test_change_in_30_day_developer_activity_to_artifact_over_2_period_window_full_time_devs:
# Tests the change in full-time developers for repo_0 between two samples of
# the 30 day window metric: 30 on 2024-01-01 and 10 on 2024-01-31, so the
# change expected for the 2024-01-31 sample is -20.
gateway: local
model: metrics.change_in_30_day_developer_activity_to_artifact_over_2_period_window
vars:
start: 2024-01-31
end: 2024-01-31
inputs:
metrics.developer_classifications_to_artifact_over_30_day_window:
rows:
- to_artifact_id: repo_0
from_artifact_id: null
event_source: SOURCE_PROVIDER
metrics_sample_date: 2024-01-01
metric: full_time_developers_over_30_day_window
amount: 30
- to_artifact_id: repo_0
from_artifact_id: null
event_source: SOURCE_PROVIDER
metrics_sample_date: 2024-01-31
metric: full_time_developers_over_30_day_window
amount: 10
outputs:
partial: true
query:
partial: true
rows:
- metrics_sample_date: 2024-01-31
to_artifact_id: repo_0
from_artifact_id: ""
amount: -20

Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
test_developer_active_days_to_artifact_over_60_day_window_with_cumulative_active_days:
test_developer_active_days_to_artifact_over_30_day_window_with_cumulative_active_days:
# Tests rolling count of active days when the user is active 4 of the 5 days
# in the test interval
gateway: local
model: metrics.developer_active_days_to_artifact_over_60_day_window
model: metrics.developer_active_days_to_artifact_over_30_day_window
vars:
start: 2024-01-01
end: 2024-01-05
Expand Down Expand Up @@ -59,10 +59,10 @@ test_developer_active_days_to_artifact_over_60_day_window_with_cumulative_active
metrics_sample_date: 2024-01-05
amount: 4

test_developer_active_days_to_artifact_over_60_day_window_with_1_active_day:
test_developer_active_days_to_artifact_over_30_day_window_with_1_active_day:
# Tests rolling count of active days when the user is active 1 in the test interval
gateway: local
model: metrics.developer_active_days_to_artifact_over_60_day_window
model: metrics.developer_active_days_to_artifact_over_30_day_window
vars:
start: 2024-01-01
end: 2024-01-03
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
test_developer_classifications_to_artifact_over_30_day_window_full_time_devs:
# Tests developer classification when two developers (dev_0 and dev_1) each
# have a developer_active_days amount of 30 on 2024-01-01: both are expected to
# be counted in the full-time and the active developer metrics for repo_0
# (amount 2 each).
gateway: local
model: metrics.developer_classifications_to_artifact_over_30_day_window
vars:
start: 2024-01-01
end: 2024-01-01
inputs:
metrics.developer_active_days_to_artifact_over_30_day_window:
rows:
- to_artifact_id: repo_0
from_artifact_id: dev_0
event_source: SOURCE_PROVIDER
metrics_sample_date: 2024-01-01
metric: developer_active_days
amount: 30
- to_artifact_id: repo_0
from_artifact_id: dev_1
event_source: SOURCE_PROVIDER
metrics_sample_date: 2024-01-01
metric: developer_active_days
amount: 30
outputs:
partial: true
query:
partial: true
rows:
- metrics_sample_date: 2024-01-01
to_artifact_id: repo_0
from_artifact_id: ""
metric: full_time_developers_over_30_day_window
amount: 2
- metrics_sample_date: 2024-01-01
to_artifact_id: repo_0
from_artifact_id: ""
metric: active_developers_over_30_day_window
amount: 2

Loading

0 comments on commit bf53eb7

Please sign in to comment.