Merge branch 'main' into dbt/open-collective-models

opensource-observer · Dec 19, 2024 · afd0cff · afd0cff
2 parents 15af465 + dc907a0
commit afd0cff
Show file tree

Hide file tree

Showing 21 changed files with 587 additions and 129 deletions.
diff --git a/apps/docs/docs/tutorials/gitcoin-social-networks.md b/apps/docs/docs/tutorials/gitcoin-social-networks.md
@@ -0,0 +1,42 @@
+---
+title: Funding in a Social Network
+sidebar_position: 5
+---
+
+Analyze Gitcoin grants funding in a social network. New to OSO? Check out our [Getting Started guide](../get-started/index.md) to set up your BigQuery or API access.
+
+This tutorial combines Farcaster and Gitcoin data to identify popular projects within a social network.
+
+## BigQuery
+
+If you haven't already, then the first step is to subscribe to OSO public datasets in BigQuery. You can do this by clicking the "Subscribe" button on our [Datasets page](../integrate/datasets/#oso-production-data-pipeline). For this tutorial, you'll need to subscribe to the Gitcoin and Karma3/OpenRank datasets. (You can also use the Farcaster dataset in place of OpenRank.)
+
+The following queries should work if you copy-paste them into your [BigQuery console](https://console.cloud.google.com/bigquery).
+
+### Identify popular projects within your social network
+
+```sql
+-- Gitcoin donations from Farcaster users whose fid is among the top 150 `j`
+-- values (ordered by `v`) for `i = 5650` in `karma3.localtrust`.
+-- NOTE(review): presumably `i` is the seed user's fid and `v` the trust score;
+-- replace 5650 with your own fid.
+select distinct
+  donations.donor_address,
+  users.user_source_id as fid,
+  users.user_name as username,
+  donations.project_name,
+  amount_in_usd,
+  timestamp
+from `gitcoin.all_donations` as donations
+join `oso_production.artifacts_by_user_v1` as users
+  on lower(donations.donor_address) = users.artifact_name
+where
+  user_source = 'FARCASTER'
+  and users.user_source_id in (
+    with max_date as (
+      select max(date) as last_date
+      from `karma3.localtrust`
+    )
+    select cast(j as string) as fid
+    from `karma3.localtrust`
+    where i = 5650
+      -- Rank only the most recent snapshot; without this filter the same `j`
+      -- can appear once per date and use up several of the 150 slots.
+      and date = (select last_date from max_date)
+    order by v desc
+    limit 150
+  )
+```
diff --git a/ops/k8s-apps/base/trino/trino.yaml b/ops/k8s-apps/base/trino/trino.yaml
@@ -112,7 +112,7 @@ spec:
         hive.metastore-cache-ttl=0s
         hive.metastore-refresh-interval=5s
         hive.metastore.thrift.client.connect-timeout=10s
-        hive.metastore.thrift.client.read-timeout=10s
+        hive.metastore.thrift.client.read-timeout=30s
         iceberg.use-file-size-from-metadata=false
         fs.native-gcs.enabled=true
         fs.cache.enabled=true

diff --git a/ops/k8s-apps/production/metrics-calculation-service/custom-helm-values.yaml b/ops/k8s-apps/production/metrics-calculation-service/custom-helm-values.yaml
@@ -32,8 +32,8 @@ spec:
         worker:
           threads: "16"
           memory:
-            limit: "96Gi"
-            request: "90Gi"
+            limit: "390Gi"
+            request: "375Gi"
           poolType: "mcs-worker"
           duckdb_path: "/scratch/mcs-local.db"
       trino:

diff --git a/ops/tf-modules/warehouse-cluster/main.tf b/ops/tf-modules/warehouse-cluster/main.tf
@@ -158,12 +158,12 @@ locals {
     # MCS Workers
     {
       name                              = "${var.cluster_name}-mcs-worker-node-pool"
-      machine_type                      = "n1-highmem-16"
+      machine_type                      = "n1-highmem-64"
       node_locations                    = join(",", var.cluster_zones)
       min_count                         = 0
-      max_count                         = 20
+      max_count                         = 50
       local_ssd_count                   = 0
-      local_ssd_ephemeral_storage_count = 2
+      local_ssd_ephemeral_storage_count = 3
       spot                              = true
       disk_size_gb                      = 100
       disk_type                         = "pd-standard"

diff --git a/...test_change_in_developers_over_window.yml → ...test_change_in_developers_over_window.yml b/...test_change_in_developers_over_window.yml → ...test_change_in_developers_over_window.yml
diff --git a/warehouse/metrics_mesh/models/metrics_factories.py b/warehouse/metrics_mesh/models/metrics_factories.py
@@ -88,6 +88,8 @@
                 windows=[30, 90, 180],
                 unit="day",
                 cron="@daily",  # This determines how often this is calculated
+                model_batch_size=90,
+                slots=32,
             ),
             entity_types=["artifact", "project", "collection"],
             is_intermediate=True,
@@ -103,23 +105,23 @@
                 cron="@daily",
             ),
         ),
-        "contributor_classifications": MetricQueryDef(
-            ref="contributor_activity_classification.sql",
-            vars={
-                "full_time_ratio": 10 / 30,
-                "activity_event_types": [
-                    "COMMIT_CODE",
-                    "ISSUE_OPENED",
-                    "PULL_REQUEST_OPENED",
-                    "PULL_REQUEST_MERGED",
-                ],
-            },
-            rolling=RollingConfig(
-                windows=[30, 90, 180],
-                unit="day",
-                cron="@daily",
-            ),
-        ),
+        # "contributor_classifications": MetricQueryDef(
+        #     ref="contributor_activity_classification.sql",
+        #     vars={
+        #         "full_time_ratio": 10 / 30,
+        #         "activity_event_types": [
+        #             "COMMIT_CODE",
+        #             "ISSUE_OPENED",
+        #             "PULL_REQUEST_OPENED",
+        #             "PULL_REQUEST_MERGED",
+        #         ],
+        #     },
+        #     rolling=RollingConfig(
+        #         windows=[30, 90, 180],
+        #         unit="day",
+        #         cron="@daily",
+        #     ),
+        # ),
         # Currently this query performs really poorly. We need to do some debugging on it
         # "user_retention_classifications": MetricQueryDef(
         #     ref="user_retention_classification.sql",
@@ -133,14 +135,14 @@
         #     ),
         #     entity_types=["artifact", "project", "collection"],
         # ),
-        "change_in_developer_activity": MetricQueryDef(
-            ref="change_in_developers.sql",
-            rolling=RollingConfig(
-                windows=[30, 90, 180],
-                unit="day",
-                cron="@daily",
-            ),
-        ),
+        # "change_in_developer_activity": MetricQueryDef(
+        #     ref="change_in_developers.sql",
+        #     rolling=RollingConfig(
+        #         windows=[30, 90, 180],
+        #         unit="day",
+        #         cron="@daily",
+        #     ),
+        # ),
         "commits_rolling": MetricQueryDef(
             ref="commits.sql",
             rolling=RollingConfig(
@@ -228,6 +230,7 @@
                 windows=[30, 90, 180],
                 unit="day",
                 cron="@daily",
+                slots=8,
             ),
             entity_types=["artifact", "project", "collection"],
         ),
@@ -237,26 +240,27 @@
                 windows=[30, 90, 180],
                 unit="day",
                 cron="@daily",
+                slots=32,
             ),
             entity_types=["artifact", "project", "collection"],
         ),
-        "contributors_lifecycle": MetricQueryDef(
-            ref="lifecycle.sql",
-            vars={
-                "activity_event_types": [
-                    "COMMIT_CODE",
-                    "ISSUE_OPENED",
-                    "PULL_REQUEST_OPENED",
-                    "PULL_REQUEST_MERGED",
-                ],
-            },
-            rolling=RollingConfig(
-                windows=[30, 90, 180],
-                unit="day",
-                cron="@daily",
-            ),
-            entity_types=["artifact", "project", "collection"],
-        ),
+        # "contributors_lifecycle": MetricQueryDef(
+        #     ref="lifecycle.sql",
+        #     vars={
+        #         "activity_event_types": [
+        #             "COMMIT_CODE",
+        #             "ISSUE_OPENED",
+        #             "PULL_REQUEST_OPENED",
+        #             "PULL_REQUEST_MERGED",
+        #         ],
+        #     },
+        #     rolling=RollingConfig(
+        #         windows=[30, 90, 180],
+        #         unit="day",
+        #         cron="@monthly",
+        #     ),
+        #     entity_types=["artifact", "project", "collection"],
+        # ),
         "funding_received": MetricQueryDef(
             ref="funding_received.sql",
             rolling=RollingConfig(

diff --git a/warehouse/metrics_tools/compute/app.py b/warehouse/metrics_tools/compute/app.py
@@ -91,6 +91,7 @@ async def initialize_app(app: FastAPI):
             cluster_spec = make_new_cluster_with_defaults(config)
             cluster_factory = KubeClusterFactory(
                 config.cluster_namespace,
+                config.worker_resources,
                 cluster_spec=cluster_spec,
                 shutdown_on_close=not config.debug_cluster_no_shutdown,
             )

diff --git a/warehouse/metrics_tools/compute/client.py b/warehouse/metrics_tools/compute/client.py
@@ -130,6 +130,8 @@ def calculate_metrics(
         cluster_min_size: int = 6,
         cluster_max_size: int = 6,
         job_retries: int = 3,
+        slots: int = 1,
+        execution_time: t.Optional[datetime] = None,
     ):
         """Calculate metrics for a given period and write the results to a gcs
         folder. This method is a high level method that triggers all of the
@@ -151,6 +153,7 @@ def calculate_metrics(
             locals (t.Dict[str, t.Any]): The local variables to use
             dependent_tables_map (t.Dict[str, str]): The dependent tables map
             job_retries (int): The number of retries for a given job in the worker queue. Defaults to 3.
+            execution_time (t.Optional[datetime]): The execution time for the job
 
         Returns:
             ExportReference: The export reference for the resulting calculation
@@ -172,6 +175,8 @@ def calculate_metrics(
             locals,
             dependent_tables_map,
             job_retries,
+            slots=slots,
+            execution_time=execution_time,
         )
         job_id = job_response.job_id
         export_reference = job_response.export_reference
@@ -240,6 +245,8 @@ def submit_job(
         locals: t.Dict[str, t.Any],
         dependent_tables_map: t.Dict[str, str],
         job_retries: t.Optional[int] = None,
+        slots: int = 2,
+        execution_time: t.Optional[datetime] = None,
     ):
         """Submit a job to the metrics calculation service
 
@@ -268,8 +275,9 @@ def submit_job(
             ref=ref,
             locals=locals,
             dependent_tables_map=dependent_tables_map,
+            slots=slots,
             retries=job_retries,
-            execution_time=datetime.now(),
+            execution_time=execution_time or datetime.now(),
         )
         job_response = self.service_post_with_input(
             JobSubmitResponse, "/job/submit", request

diff --git a/warehouse/metrics_tools/compute/cluster.py b/warehouse/metrics_tools/compute/cluster.py
@@ -46,6 +46,7 @@ def start_duckdb_cluster(
 
 async def start_duckdb_cluster_async(
     namespace: str,
+    resources: t.Dict[str, int],
     cluster_spec: t.Optional[dict] = None,
     min_size: int = 6,
     max_size: int = 6,
@@ -55,24 +56,30 @@ async def start_duckdb_cluster_async(
     a thread. The "async" version of dask's KubeCluster doesn't work as
     expected. So for now we do this."""
 
-    options: t.Dict[str, t.Any] = {"namespace": namespace}
+    worker_command = ["dask", "worker"]
+    resources_to_join = []
+
+    for resource, value in resources.items():
+        resources_to_join.append(f"{resource}={value}")
+    if resources_to_join:
+        resources_str = f'{",".join(resources_to_join)}'
+        worker_command.extend(["--resources", resources_str])
+
+    options: t.Dict[str, t.Any] = {
+        "namespace": namespace,
+        "worker_command": worker_command,
+    }
     options.update(kwargs)
     if cluster_spec:
         options["custom_cluster_spec"] = cluster_spec
 
     # loop = asyncio.get_running_loop()
     cluster = await KubeCluster(asynchronous=True, **options)
-    print(f"is cluster awaitable?: {inspect.isawaitable(cluster)}")
     adapt_response = cluster.adapt(minimum=min_size, maximum=max_size)
-    print(f"is adapt_response awaitable?: {inspect.isawaitable(adapt_response)}")
     if inspect.isawaitable(adapt_response):
         await adapt_response
     return cluster
 
-    # return await asyncio.to_thread(
-    #     start_duckdb_cluster, namespace, cluster_spec, min_size, max_size
-    # )
-
 
 class ClusterProxy(abc.ABC):
     async def client(self) -> Client:
@@ -155,26 +162,35 @@ def workers(self):
 class LocalClusterFactory(ClusterFactory):
     async def create_cluster(self, min_size: int, max_size: int) -> ClusterProxy:
         return LocalClusterProxy(
-            await LocalCluster(n_workers=max_size, asynchronous=True)
+            await LocalCluster(
+                n_workers=max_size, resources={"slots": 10}, asynchronous=True
+            )
         )
 
 
 class KubeClusterFactory(ClusterFactory):
     def __init__(
         self,
         namespace: str,
+        resources: t.Dict[str, int],
         cluster_spec: t.Optional[dict] = None,
         log_override: t.Optional[logging.Logger] = None,
         **kwargs: t.Any,
     ):
         self._namespace = namespace
         self.logger = log_override or logger
         self._cluster_spec = cluster_spec
+        self._resources = resources
         self.kwargs = kwargs
 
     async def create_cluster(self, min_size: int, max_size: int):
         cluster = await start_duckdb_cluster_async(
-            self._namespace, self._cluster_spec, min_size, max_size, **self.kwargs
+            self._namespace,
+            self._resources,
+            self._cluster_spec,
+            min_size,
+            max_size,
+            **self.kwargs,
         )
         return KubeClusterProxy(cluster)
 

diff --git a/warehouse/metrics_tools/compute/debug.py b/warehouse/metrics_tools/compute/debug.py
@@ -17,6 +17,7 @@ def async_test_setup_cluster(config: AppConfig):
 
     cluster_factory = KubeClusterFactory(
         config.cluster_namespace,
+        config.worker_resources,
         cluster_spec=cluster_spec,
         log_override=logger,
     )

diff --git a/warehouse/metrics_tools/compute/manual_testing_utils.py b/warehouse/metrics_tools/compute/manual_testing_utils.py
@@ -102,7 +102,11 @@ def run_local_test(
             ("amount", "NUMERIC"),
         ],
         ref=PeerMetricDependencyRef(
-            name="", entity_type="artifact", window=30, unit="day"
+            name="",
+            entity_type="artifact",
+            window=30,
+            unit="day",
+            cron="@daily",
         ),
         locals={},
         dependent_tables_map={