From 7123997e8f0d05c660c6ac6d0bd6d5804fdf5fb7 Mon Sep 17 00:00:00 2001
From: Reuven Gonzales
Date: Fri, 2 Aug 2024 01:08:06 -0700
Subject: [PATCH] More cloudquery clean up (#1905)

---
 ...warehouse-run-data-pipeline-cloudquery.yml | 128 --------------
 .../workflows/warehouse-run-data-pipeline.yml | 163 ------------------
 docker/images/dagster-dask/Dockerfile         |   1 -
 ops/tf-modules/warehouse/main.tf              |   3 +-
 .../stg_ossd__current_collections.sql         |   2 +-
 .../stg_ossd__current_projects.sql            |   2 +-
 .../stg_ossd__current_repositories.sql        |   2 +-
 7 files changed, 4 insertions(+), 297 deletions(-)
 delete mode 100644 .github/workflows/warehouse-run-data-pipeline-cloudquery.yml
 delete mode 100644 .github/workflows/warehouse-run-data-pipeline.yml

diff --git a/.github/workflows/warehouse-run-data-pipeline-cloudquery.yml b/.github/workflows/warehouse-run-data-pipeline-cloudquery.yml
deleted file mode 100644
index 1aad26bcc..000000000
--- a/.github/workflows/warehouse-run-data-pipeline-cloudquery.yml
+++ /dev/null
@@ -1,128 +0,0 @@
-name: warehouse-run-data-pipeline-cloudquery
-env:
-  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
-  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDQUERY_VERSION: 5.5.0
-  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
-  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
-  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
-  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
-  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
-  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
-  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
-  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
-
-# For now this only runs on a schedule once a day. Once we have made some of the
-# plugin workflows more incremental we will run this on _every_ commit to main
-on:
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-    inputs:
-      docker_tag:
-        description: The docker tag to use for cloudquery plugins (only)
-
-jobs:
-  warehouse-run-data-pipeline-cloudquery:
-    name: warehouse-run-data-pipeline-cloudquery
-    environment: indexer
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: 'read'
-      id-token: 'write'
-
-    env:
-      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: 'Login to GitHub Container Registry'
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: "Setup Python, Poetry and Dependencies"
-        uses: packetcoders/action-setup-cache-python-poetry@main
-        with:
-          python-version: 3.12
-          poetry-version: 1.8.2
-
-      - name: Run poetry install
-        run: |
-          poetry install
-
-      # At this time this auth isn't working for dbt
-      # - uses: 'google-github-actions/auth@v2'
-      #   with:
-      #     service_account: oso-github-actions@oso-production.iam.gserviceaccount.com
-      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
-      #     create_credentials_file: true
-      #     access_token_lifetime: 3600s
-
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
-          create_credentials_file: true
-
-      - name: 'Set up Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v2'
-        with:
-          version: '>= 363.0.0'
-
-      - name: Download and install cloudquery
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery &&
-          chmod a+x /tmp/cloudquery &&
-          mv /tmp/cloudquery /usr/local/bin/cloudquery
-
-      # For now this is a bit of a hack for the oss-directory plugins, as the output from one plugin is the input to
-      # another. Ideally we would simply tell the system what to run and it would handle the dependencies.
-      - name: Run cloudquery for oss-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console
-
-      - name: Concat the project jsonl files (if there are many)
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
-          find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
-          head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: projects.json
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json
-
-      # This is currently a punt on how to run this properly, because a previous cloudquery
-      # plugin's output can't be used as input into a different one.
-      # We start the github-resolve-repos container with a volume that can access the project file and use it as the source.
-      # Ideally we'd either have a plugin that can act as both a destination and a source (so we can chain multiple plugins),
-      # or potentially we use something else that can achieve similar things.
-      - name: Run cloudquery for github-resolve-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          docker run -d --rm -p 7777:7777 \
-            -v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
-            --name github-resolve-repos \
-            ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
-            serve --address 0.0.0.0:7777 &&
-          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
-          docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
-          docker stop github-resolve-repos
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: github-resolve-repos.log
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
diff --git a/.github/workflows/warehouse-run-data-pipeline.yml b/.github/workflows/warehouse-run-data-pipeline.yml
deleted file mode 100644
index f65237048..000000000
--- a/.github/workflows/warehouse-run-data-pipeline.yml
+++ /dev/null
@@ -1,163 +0,0 @@
-name: warehouse-run-data-pipeline
-env:
-  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
-  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDQUERY_VERSION: 5.5.0
-  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
-  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
-  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
-  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
-  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
-  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
-  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
-  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
-
-# For now this only runs on a schedule once a day. Once we have made some of the
-# plugin workflows more incremental we will run this on _every_ commit to main
-on:
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-    inputs:
-      docker_tag:
-        description: The docker tag to use for cloudquery plugins (only)
-      skip_cloudquery_plugins:
-        description: Skip CloudQuery plugins (run dbt only)
-        default: 'false'
-        required: false
-  schedule:
-
-    # Schedule every day at 2AM UTC. This is so we ensure anything that is
-    # committed daily has completed writing from whatever data source. This likely
-    # isn't necessary in the future if we do everything incrementally
-    - cron: '0 2 * * *'
-
-jobs:
-  warehouse-run-data-pipeline:
-    name: warehouse-run-data-pipeline
-    environment: indexer
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: 'read'
-      id-token: 'write'
-
-    env:
-      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: 'Login to GitHub Container Registry'
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: "Setup Python, Poetry and Dependencies"
-        uses: packetcoders/action-setup-cache-python-poetry@main
-        with:
-          python-version: 3.12
-          poetry-version: 1.8.2
-
-      - name: Run poetry install
-        run: |
-          poetry install
-
-      # At this time this auth isn't working for dbt
-      # - uses: 'google-github-actions/auth@v2'
-      #   with:
-      #     service_account: oso-github-actions@oso-production.iam.gserviceaccount.com
-      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
-      #     create_credentials_file: true
-      #     access_token_lifetime: 3600s
-
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
-          create_credentials_file: true
-
-      - name: 'Set up Cloud SDK'
-        uses: 'google-github-actions/setup-gcloud@v2'
-        with:
-          version: '>= 363.0.0'
-
-      - name: Download and install cloudquery
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery &&
-          chmod a+x /tmp/cloudquery &&
-          mv /tmp/cloudquery /usr/local/bin/cloudquery
-
-      # For now this is a bit of a hack for the oss-directory plugins, as the output from one plugin is the input to
-      # another. Ideally we would simply tell the system what to run and it would handle the dependencies.
-      - name: Run cloudquery for oss-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console
-
-      - name: Concat the project jsonl files (if there are many)
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
-          find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
-          head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: projects.json
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json
-
-      # This is currently a punt on how to run this properly, because a previous cloudquery
-      # plugin's output can't be used as input into a different one.
-      # We start the github-resolve-repos container with a volume that can access the project file and use it as the source.
-      # Ideally we'd either have a plugin that can act as both a destination and a source (so we can chain multiple plugins),
-      # or potentially we use something else that can achieve similar things.
-      - name: Run cloudquery for github-resolve-directory
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        run: |
-          docker run -d --rm -p 7777:7777 \
-            -v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
-            --name github-resolve-repos \
-            ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
-            serve --address 0.0.0.0:7777 &&
-          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
-          docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
-          docker stop github-resolve-repos
-
-      - uses: actions/upload-artifact@v4
-        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
-        with:
-          name: github-resolve-repos.log
-          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
-
-      - name: Setup dbt
-        run: |
-          bash .github/scripts/create-dbt-profile.sh ${GOOGLE_APPLICATION_CREDENTIALS} &&
-          gcloud auth list
-
-      - name: Run dbt for production
-        run: |
-          poetry run dbt run --target production
-
-      - name: Run dbt for the base_playground
-        run: |
-          poetry run dbt run --target base_playground
-
-      - name: Run dbt for the playground
-        run: |
-          poetry run dbt run --target playground --full-refresh
-        env:
-          PLAYGROUND_DAYS: 30
-
-      - name: Copy the bigquery tables to cloudsql
-        run: |
-          poetry run bq2cloudsql
-        env:
-          DBT_TARGET: production
diff --git a/docker/images/dagster-dask/Dockerfile b/docker/images/dagster-dask/Dockerfile
index bd6da0bf0..79cb3f141 100644
--- a/docker/images/dagster-dask/Dockerfile
+++ b/docker/images/dagster-dask/Dockerfile
@@ -32,7 +32,6 @@ RUN mkdir -p /usr/src/app && \
 
 WORKDIR /usr/src/app
 COPY pyproject.toml poetry.lock /usr/src/app/
-COPY warehouse/cloudquery-example-plugin /usr/src/app/warehouse/cloudquery-example-plugin
 
 # Install everything onto the system path
 RUN poetry config virtualenvs.create false && \
diff --git a/ops/tf-modules/warehouse/main.tf b/ops/tf-modules/warehouse/main.tf
index abdf8fb85..fc5150bf7 100644
--- a/ops/tf-modules/warehouse/main.tf
+++ b/ops/tf-modules/warehouse/main.tf
@@ -81,8 +81,7 @@ resource "google_bigquery_dataset" "dataset" {
 }
 
 ###
-# A dataset for receiving data from airbyte/cloudquery/or any other data
-# connections
+# A dataset for receiving data
 ###
 resource "google_bigquery_dataset" "raw_dataset" {
   dataset_id = local.raw_dataset_id
diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql
index bbd1f4843..1a90a7921 100644
--- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql
+++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql
@@ -1,5 +1,5 @@
 {#
-  The most recent view of collections from the ossd cloudquery plugin.
+  The most recent view of collections from the ossd dagster source.
 #}
 
 select
diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql
index 95a145bc9..73a31dd72 100644
--- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql
+++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql
@@ -1,5 +1,5 @@
 {#
-  The most recent view of projects from the ossd cloudquery plugin.
+  The most recent view of projects from the ossd dagster source.
 #}
 select
   {#
diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql
index 7ac6e88c0..257b0c980 100644
--- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql
+++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql
@@ -1,5 +1,5 @@
 {#
-  The most recent view of repositories from the github-resolve-repos cloudquery plugin.
+  The most recent view of repositories from the ossd repositories dagster source.
 #}
 with ranked_repositories as (
   select
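
For context on the three staging models whose headers change above: each one keeps only the latest snapshot of its source table, which is what a CTE like `ranked_repositories` is for. A minimal sketch of that "most recent view" pattern follows; the `source()` reference and the `id`/`sync_time` column names are illustrative assumptions, not taken from this diff or the OSO repo.

-- Sketch only: keep the newest copy of each row per repository.
-- `id` and `sync_time` are assumed column names; each sync appends a
-- full snapshot, so we rank snapshots per id and keep rank 1.
with ranked_repositories as (
  select
    *,
    row_number() over (
      partition by id          -- one surviving row per repository
      order by sync_time desc  -- newest snapshot first
    ) as row_rank
  from {{ source('ossd', 'repositories') }}
)

select * from ranked_repositories
where row_rank = 1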