-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Fix and clean up int_derived_contracts * implements small fixes * fix linting error * Fix lint errors * Add cloudquery as separate pipeline --------- Co-authored-by: Carl Cervone <[email protected]>
- Loading branch information
Showing 2 changed files with 181 additions and 31 deletions.
There are no files selected for viewing
128 changes: 128 additions & 0 deletions
128
.github/workflows/warehouse-run-data-pipeline-cloudquery.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Runs the cloudquery portion of the warehouse data pipeline:
# syncs oss-directory, resolves GitHub repos via a sidecar plugin
# container, and uploads the resulting artifacts.
name: warehouse-run-data-pipeline-cloudquery

env:
  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  # Quoted so the version is always parsed as a string scalar.
  CLOUDQUERY_VERSION: "5.5.0"
  # Working directory shared between cloudquery plugin runs.
  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}

# For now this only runs manually. Once we have made some of the
# plugin workflows more incremental we will run this on _every_ commit to main
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      docker_tag:
        description: The docker tag to use for cloudquery plugins (only)
      # FIX: this input is referenced by several `if:` conditions below but
      # was never declared, so it always evaluated to the empty string and
      # the steps could never be skipped.
      skip_cloudquery_plugins:
        description: Set to 'true' to skip the cloudquery plugin steps
        default: 'false'

jobs:
  warehouse-run-data-pipeline-cloudquery:
    name: warehouse-run-data-pipeline-cloudquery
    environment: indexer
    runs-on: ubuntu-latest

    permissions:
      contents: 'read'
      id-token: 'write'

    env:
      # Fall back to the commit SHA when no explicit docker tag is supplied.
      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: 'Login to GitHub Container Registry'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: "Setup Python, Poetry and Dependencies"
        uses: packetcoders/action-setup-cache-python-poetry@main
        with:
          # Quoted: an unquoted 3.12 would survive, but e.g. 3.10 would be
          # read as the float 3.1 — always quote version matrices.
          python-version: "3.12"
          poetry-version: "1.8.2"

      - name: Run poetry install
        run: |
          poetry install

      # At this time this auth isn't working for dbt
      # - uses: 'google-github-actions/auth@v2'
      #   with:
      #     service_account: [email protected]
      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
      #     create_credentials_file: true
      #     access_token_lifetime: 3600s

      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
          create_credentials_file: true

      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
        with:
          version: '>= 363.0.0'

      - name: Download and install cloudquery
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        # FIX: download to a path distinct from CLOUDQUERY_FILE_DIRECTORY
        # (/tmp/cloudquery) so the binary never collides with the plugin
        # output directory, and pre-create that directory for later steps.
        run: |
          curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery-cli &&
          chmod a+x /tmp/cloudquery-cli &&
          mv /tmp/cloudquery-cli /usr/local/bin/cloudquery &&
          mkdir -p ${CLOUDQUERY_FILE_DIRECTORY}

      # For now this is a bit of a hack for the oss-directory plugins as the
      # output from one plugin is the input to another. Ideally we would simply
      # tell whatever system to run and it will handle dependencies.
      - name: Run cloudquery for oss-directory
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console

      - name: Concat the project jsonl files (if there are many)
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
          find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
          head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json

      - uses: actions/upload-artifact@v4
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        with:
          name: projects.json
          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json

      # This is currently a punt on how to run this properly because a previous
      # cloudquery plugin's output can't be used as input into a different one.
      # We start the github-resolve-repos container with a volume that can
      # access the project file. Ideally we'd either have a plugin that can act
      # as both a destination/source (so we can chain multiple plugins) or
      # potentially we use something else that can achieve a similar thing.
      - name: Run cloudquery for github-resolve-directory
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          docker run -d --rm -p 7777:7777 \
            -v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
            --name github-resolve-repos \
            ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
            serve --address 0.0.0.0:7777 &&
          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
          docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
          docker stop github-resolve-repos

      - uses: actions/upload-artifact@v4
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        with:
          name: github-resolve-repos.log
          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
84 changes: 53 additions & 31 deletions
84
warehouse/dbt/models/intermediate/blockchain_artifacts/int_derived_contracts.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,73 @@ | ||
{#
  int_derived_contracts — reconstructed post-commit model.
  NOTE(review): the source was a scraped diff with old and new lines
  interleaved; old/new pairs were resolved by diff ordering (e.g. the join
  on `originating_address` was replaced by `originating_contract`) —
  confirm against the repository before relying on this.

  Unions three sources of (network, deployer_address, contract_address)
  and deduplicates the result.
#}
with contracts_deployed_no_factory as (
  {#
    This gets all of the contracts that weren't deployed with a factory
  #}
  select
    block_timestamp,
    transaction_hash,
    network,
    deployer_address,
    contract_address
  from {{ ref("int_deployers") }}
  where contract_address is not null
),

contracts_deployed_via_factory as (
  {#
    This gets all of the contracts deployed by any factory.
    Deployer Address is the EOA address that started the transaction
  #}
  select
    network,
    originating_address as deployer_address,
    contract_address as contract_address
  from {{ ref("int_factories") }}
  where contract_address is not null
),

contracts_deployed_by_safe_or_known_proxy as (
  {#
    This gets all of the contracts deployed by a safe or other known proxy.
    Deployer address is a proxy (safe or other known proxy) that deployed
    the contract
  #}
  select
    factories.network,
    proxies.address as deployer_address,
    factories.contract_address as contract_address
  from {{ ref("int_factories") }} as factories
  inner join {{ ref("int_proxies") }} as proxies
    on
      factories.originating_contract = proxies.address
      and factories.network = proxies.network
  where contract_address is not null
),

derived_contracts as (
  select
    network,
    deployer_address,
    contract_address
  from contracts_deployed_no_factory
  union all
  select
    network,
    deployer_address,
    contract_address
  from contracts_deployed_via_factory
  union all
  select
    network,
    deployer_address,
    contract_address
  from contracts_deployed_by_safe_or_known_proxy
)

select distinct
  network,
  deployer_address,
  contract_address
from derived_contracts