From 7123997e8f0d05c660c6ac6d0bd6d5804fdf5fb7 Mon Sep 17 00:00:00 2001
From: Reuven Gonzales
Date: Fri, 2 Aug 2024 01:08:06 -0700
Subject: [PATCH] More cloudquery clean up (#1905)

---
 ...warehouse-run-data-pipeline-cloudquery.yml | 128 --------------
 .../workflows/warehouse-run-data-pipeline.yml | 163 ------------------
 docker/images/dagster-dask/Dockerfile         |   1 -
 ops/tf-modules/warehouse/main.tf              |   3 +-
 .../stg_ossd__current_collections.sql         |   2 +-
 .../stg_ossd__current_projects.sql            |   2 +-
 .../stg_ossd__current_repositories.sql        |   2 +-
 7 files changed, 4 insertions(+), 297 deletions(-)
 delete mode 100644 .github/workflows/warehouse-run-data-pipeline-cloudquery.yml
 delete mode 100644 .github/workflows/warehouse-run-data-pipeline.yml

diff --git a/.github/workflows/warehouse-run-data-pipeline-cloudquery.yml b/.github/workflows/warehouse-run-data-pipeline-cloudquery.yml
deleted file mode 100644
index 1aad26bcc..000000000
--- a/.github/workflows/warehouse-run-data-pipeline-cloudquery.yml
+++ /dev/null
@@ -1,128 +0,0 @@
-name: warehouse-run-data-pipeline-cloudquery
-env:
-  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
-  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDQUERY_VERSION: 5.5.0
-  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
-  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
-  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
-  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
-  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
-  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
-  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
-  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
-
-# For now this only runs on a schedule once a day. Once we have made some of the
-# plugin workflows more incremental we will run this on _every_ commit to main
-on:
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-    inputs:
-      docker_tag:
-        description: The docker tag to use for cloudquery plugins (only)
-
-jobs:
-  warehouse-run-data-pipeline-cloudquery:
-    name: warehouse-run-data-pipeline-cloudquery
-    environment: indexer
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: 'read'
-      id-token: 'write'
-
-    env:
-      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: 'Login to GitHub Container Registry'
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: "Setup Python, Poetry and Dependencies"
-        uses: packetcoders/action-setup-cache-python-poetry@main
-        with:
-          python-version: 3.12
-          poetry-version: 1.8.2
-
-      - name: Run poetry install
-        run: |
-          poetry install
-
-      # At this time this auth isn't working for dbt
-      # - uses: 'google-github-actions/auth@v2'
-      #   with:
-      #     service_account: oso-github-actions@oso-production.iam.gserviceaccount.com
-      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
-      #     create_credentials_file: true
-      #     access_token_lifetime: 3600s
-
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
-          create_credentials_file: true
-
-      - name: 'Set up Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v2'
-        with:
-          version: '>= 363.0.0'
-
-      - name: Download and install cloudquery
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery &&
-          chmod a+x /tmp/cloudquery &&
-          mv /tmp/cloudquery /usr/local/bin/cloudquery
-
-      # For now this is a bit of a hack for the oss-directory plugins, as the output from one plugin is the input to
-      # another. Ideally we would simply tell the system what to run and it would handle the dependencies.
-      - name: Run cloudquery for oss-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console
-
-      - name: Concat the project jsonl files (if there are many)
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
-          find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
-          head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: projects.json
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json
-
-      # This is currently a punt on how to run this properly, because a previous cloudquery
-      # plugin's output can't be used as input into a different one.
-      # We start the github-resolve-repos container with a volume that can access the project file and use it as the source.
-      # Ideally we'd either have a plugin that can act as both a destination and a source (so we can chain multiple plugins),
-      # or potentially we use something else that can achieve similar things.
-      - name: Run cloudquery for github-resolve-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          docker run -d --rm -p 7777:7777 \
-            -v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
-            --name github-resolve-repos \
-            ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
-            serve --address 0.0.0.0:7777 &&
-          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
-          docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
-          docker stop github-resolve-repos
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: github-resolve-repos.log
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
diff --git a/.github/workflows/warehouse-run-data-pipeline.yml b/.github/workflows/warehouse-run-data-pipeline.yml
deleted file mode 100644
index f65237048..000000000
--- a/.github/workflows/warehouse-run-data-pipeline.yml
+++ /dev/null
@@ -1,163 +0,0 @@
-name: warehouse-run-data-pipeline
-env:
-  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
-  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDQUERY_VERSION: 5.5.0
-  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
-  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
-  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
-  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
-  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
-  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
-  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
-  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
-
-# For now this only runs on a schedule once a day. Once we have made some of the
-# plugin workflows more incremental we will run this on _every_ commit to main
-on:
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-    inputs:
-      docker_tag:
-        description: The docker tag to use for cloudquery plugins (only)
-      skip_cloudquery_plugins:
-        description: Skip CloudQuery plugins (run dbt only)
-        default: 'false'
-        required: false
-  schedule:
-
-    # Schedule every day at 2AM UTC. This is so we ensure anything that is
-    # committed daily has completed writing from whatever data source. This likely
-    # isn't necessary in the future if we do everything incrementally
-    - cron: '0 2 * * *'
-
-jobs:
-  warehouse-run-data-pipeline:
-    name: warehouse-run-data-pipeline
-    environment: indexer
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: 'read'
-      id-token: 'write'
-
-    env:
-      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: 'Login to GitHub Container Registry'
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: "Setup Python, Poetry and Dependencies"
-        uses: packetcoders/action-setup-cache-python-poetry@main
-        with:
-          python-version: 3.12
-          poetry-version: 1.8.2
-
-      - name: Run poetry install
-        run: |
-          poetry install
-
-      # At this time this auth isn't working for dbt
-      # - uses: 'google-github-actions/auth@v2'
-      #   with:
-      #     service_account: oso-github-actions@oso-production.iam.gserviceaccount.com
-      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
-      #     create_credentials_file: true
-      #     access_token_lifetime: 3600s
-
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
-          create_credentials_file: true
-
-      - name: 'Set up Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v2'
-        with:
-          version: '>= 363.0.0'
-
-      - name: Download and install cloudquery
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery &&
-          chmod a+x /tmp/cloudquery &&
-          mv /tmp/cloudquery /usr/local/bin/cloudquery
-
-      # For now this is a bit of a hack for the oss-directory plugins, as the output from one plugin is the input to
-      # another. Ideally we would simply tell the system what to run and it would handle the dependencies.
-      - name: Run cloudquery for oss-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console
-
-      - name: Concat the project jsonl files (if there are many)
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
-          find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
-          head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: projects.json
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json
-
-      # This is currently a punt on how to run this properly, because a previous cloudquery
-      # plugin's output can't be used as input into a different one.
-      # We start the github-resolve-repos container with a volume that can access the project file and use it as the source.
-      # Ideally we'd either have a plugin that can act as both a destination and a source (so we can chain multiple plugins),
-      # or potentially we use something else that can achieve similar things.
-      - name: Run cloudquery for github-resolve-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          docker run -d --rm -p 7777:7777 \
-            -v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
-            --name github-resolve-repos \
-            ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
-            serve --address 0.0.0.0:7777 &&
-          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
-          docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
-          docker stop github-resolve-repos
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: github-resolve-repos.log
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
-
-      - name: Setup dbt
-        run: |
-          bash .github/scripts/create-dbt-profile.sh ${GOOGLE_APPLICATION_CREDENTIALS} &&
-          gcloud auth list
-
-      - name: Run dbt for production
-        run: |
-          poetry run dbt run --target production
-
-      - name: Run dbt for the base_playground
-        run: |
-          poetry run dbt run --target base_playground
-
-      - name: Run dbt for the playground
-        run: |
-          poetry run dbt run --target playground --full-refresh
-        env:
-          PLAYGROUND_DAYS: 30
-
-      - name: Copy the bigquery tables to cloudsql
-        run: |
-          poetry run bq2cloudsql
-        env:
-          DBT_TARGET: production
diff --git a/docker/images/dagster-dask/Dockerfile b/docker/images/dagster-dask/Dockerfile
index bd6da0bf0..79cb3f141 100644
--- a/docker/images/dagster-dask/Dockerfile
+++ b/docker/images/dagster-dask/Dockerfile
@@ -32,7 +32,6 @@ RUN mkdir -p /usr/src/app && \
 
 WORKDIR /usr/src/app
 COPY pyproject.toml poetry.lock /usr/src/app/
-COPY warehouse/cloudquery-example-plugin /usr/src/app/warehouse/cloudquery-example-plugin
 
 # Install everything onto the system path
 RUN poetry config virtualenvs.create false && \
diff --git a/ops/tf-modules/warehouse/main.tf b/ops/tf-modules/warehouse/main.tf
index abdf8fb85..fc5150bf7 100644
--- a/ops/tf-modules/warehouse/main.tf
+++ b/ops/tf-modules/warehouse/main.tf
@@ -81,8 +81,7 @@ resource "google_bigquery_dataset" "dataset" {
 }
 
 ###
-# A dataset for receiving data from airbyte/cloudquery/or any other data
-# connections
+# A dataset for receiving data
 ###
 resource "google_bigquery_dataset" "raw_dataset" {
   dataset_id = local.raw_dataset_id
diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql
index bbd1f4843..1a90a7921 100644
--- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql
+++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql
@@ -1,5 +1,5 @@
 {#
-  The most recent view of collections from the ossd cloudquery plugin.
+  The most recent view of collections from the ossd dagster source.
 #}
 
 select
diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql
index 95a145bc9..73a31dd72 100644
--- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql
+++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql
@@ -1,5 +1,5 @@
 {#
-  The most recent view of projects from the ossd cloudquery plugin.
+  The most recent view of projects from the ossd dagster source.
 #}
 select
   {#
diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql
index 7ac6e88c0..257b0c980 100644
--- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql
+++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql
@@ -1,5 +1,5 @@
 {#
-  The most recent view of repositories from the github-resolve-repos cloudquery plugin.
+  The most recent view of repositories from the ossd repositories dagster source.
 #}
 with ranked_repositories as (
   select
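
For context on the three staging models whose headers change above: each one keeps only the latest snapshot of its source table, which is what a CTE like `ranked_repositories` is for. A minimal sketch of that "most recent view" pattern follows; the `source()` reference and the `id`/`sync_time` column names are illustrative assumptions, not taken from this diff or the OSO repo.

-- Sketch only: keep the newest copy of each row per repository.
-- `id` and `sync_time` are assumed column names; each sync appends a
-- full snapshot, so we rank snapshots per id and keep rank 1.
with ranked_repositories as (
  select
    *,
    row_number() over (
      partition by id          -- one surviving row per repository
      order by sync_time desc  -- newest snapshot first
    ) as row_rank
  from {{ source('ossd', 'repositories') }}
)

select * from ranked_repositories
where row_rank = 1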