-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Fix and clean up int_derived_contracts * implements small fixes * fix linting error * Fix lint errors * Add cloudquery as separate pipeline --------- Co-authored-by: Carl Cervone <[email protected]>
- Loading branch information
Showing 2 changed files with 181 additions and 31 deletions.
There are no files selected for viewing
128 changes: 128 additions & 0 deletions
128
.github/workflows/warehouse-run-data-pipeline-cloudquery.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Runs the cloudquery portion of the warehouse data pipeline:
# syncs oss-directory, resolves GitHub repos via a sidecar plugin
# container, and uploads the resulting artifacts.
name: warehouse-run-data-pipeline-cloudquery

env:
  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  # Quoted so the version is always parsed as a string scalar.
  CLOUDQUERY_VERSION: "5.5.0"
  # Working directory shared between cloudquery plugin runs.
  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}

# For now this only runs manually. Once we have made some of the
# plugin workflows more incremental we will run this on _every_ commit to main
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      docker_tag:
        description: The docker tag to use for cloudquery plugins (only)
      # FIX: this input is referenced by several `if:` conditions below but
      # was never declared, so it always evaluated to the empty string and
      # the steps could never be skipped.
      skip_cloudquery_plugins:
        description: Set to 'true' to skip the cloudquery plugin steps
        default: 'false'

jobs:
  warehouse-run-data-pipeline-cloudquery:
    name: warehouse-run-data-pipeline-cloudquery
    environment: indexer
    runs-on: ubuntu-latest

    permissions:
      contents: 'read'
      id-token: 'write'

    env:
      # Fall back to the commit SHA when no explicit docker tag is supplied.
      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: 'Login to GitHub Container Registry'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: "Setup Python, Poetry and Dependencies"
        uses: packetcoders/action-setup-cache-python-poetry@main
        with:
          # Quoted: an unquoted 3.12 would survive, but e.g. 3.10 would be
          # read as the float 3.1 — always quote version matrices.
          python-version: "3.12"
          poetry-version: "1.8.2"

      - name: Run poetry install
        run: |
          poetry install

      # At this time this auth isn't working for dbt
      # - uses: 'google-github-actions/auth@v2'
      #   with:
      #     service_account: [email protected]
      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
      #     create_credentials_file: true
      #     access_token_lifetime: 3600s

      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
          create_credentials_file: true

      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
        with:
          version: '>= 363.0.0'

      - name: Download and install cloudquery
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        # FIX: download to a path distinct from CLOUDQUERY_FILE_DIRECTORY
        # (/tmp/cloudquery) so the binary never collides with the plugin
        # output directory, and pre-create that directory for later steps.
        run: |
          curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery-cli &&
          chmod a+x /tmp/cloudquery-cli &&
          mv /tmp/cloudquery-cli /usr/local/bin/cloudquery &&
          mkdir -p ${CLOUDQUERY_FILE_DIRECTORY}

      # For now this is a bit of a hack for the oss-directory plugins as the
      # output from one plugin is the input to another. Ideally we would simply
      # tell whatever system to run and it will handle dependencies.
      - name: Run cloudquery for oss-directory
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console

      - name: Concat the project jsonl files (if there are many)
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
          find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
          head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json

      - uses: actions/upload-artifact@v4
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        with:
          name: projects.json
          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json

      # This is currently a punt on how to run this properly because a previous
      # cloudquery plugin's output can't be used as input into a different one.
      # We start the github-resolve-repos container with a volume that can
      # access the project file. Ideally we'd either have a plugin that can act
      # as both a destination/source (so we can chain multiple plugins) or
      # potentially we use something else that can achieve a similar thing.
      - name: Run cloudquery for github-resolve-directory
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          docker run -d --rm -p 7777:7777 \
            -v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
            --name github-resolve-repos \
            ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
            serve --address 0.0.0.0:7777 &&
          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
          docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
          docker stop github-resolve-repos

      - uses: actions/upload-artifact@v4
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        with:
          name: github-resolve-repos.log
          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
84 changes: 53 additions & 31 deletions
84
warehouse/dbt/models/intermediate/blockchain_artifacts/int_derived_contracts.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,73 @@ | ||
{#
  int_derived_contracts — reconstructed post-commit model.
  NOTE(review): the source was a scraped diff with old and new lines
  interleaved; old/new pairs were resolved by diff ordering (e.g. the join
  on `originating_address` was replaced by `originating_contract`) —
  confirm against the repository before relying on this.

  Unions three sources of (network, deployer_address, contract_address)
  and deduplicates the result.
#}
with contracts_deployed_no_factory as (
  {#
    This gets all of the contracts that weren't deployed with a factory
  #}
  select
    block_timestamp,
    transaction_hash,
    network,
    deployer_address,
    contract_address
  from {{ ref("int_deployers") }}
  where contract_address is not null
),

contracts_deployed_via_factory as (
  {#
    This gets all of the contracts deployed by any factory.
    Deployer Address is the EOA address that started the transaction
  #}
  select
    network,
    originating_address as deployer_address,
    contract_address as contract_address
  from {{ ref("int_factories") }}
  where contract_address is not null
),

contracts_deployed_by_safe_or_known_proxy as (
  {#
    This gets all of the contracts deployed by a safe or other known proxy.
    Deployer address is a proxy (safe or other known proxy) that deployed
    the contract
  #}
  select
    factories.network,
    proxies.address as deployer_address,
    factories.contract_address as contract_address
  from {{ ref("int_factories") }} as factories
  inner join {{ ref("int_proxies") }} as proxies
    on
      factories.originating_contract = proxies.address
      and factories.network = proxies.network
  where contract_address is not null
),

derived_contracts as (
  select
    network,
    deployer_address,
    contract_address
  from contracts_deployed_no_factory
  union all
  select
    network,
    deployer_address,
    contract_address
  from contracts_deployed_via_factory
  union all
  select
    network,
    deployer_address,
    contract_address
  from contracts_deployed_by_safe_or_known_proxy
)

select distinct
  network,
  deployer_address,
  contract_address
from derived_contracts