# warehouse-run-data-pipeline #81
# Display name of the workflow.
name: warehouse-run-data-pipeline
# Workflow-level environment variables, shared by every job and step.
# Values come from repository/environment `vars` and `secrets`.
env:
  # GitHub API endpoint and token.
  X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
  X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  # CloudQuery CLI release to download (quoted so it is always a string) and
  # the directory the jsonl output is read from by the later steps.
  CLOUDQUERY_VERSION: '5.5.0'
  CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery

  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  # Cloud SQL connection settings.
  CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
  CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
  CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
  CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
  CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}

  # Google Cloud project, storage bucket and BigQuery dataset.
  GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
  CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
  BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
# For now this only runs on a schedule once a day. Once we have made some of the
# plugin workflows more incremental we will run this on _every_ commit to main
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      docker_tag:
        description: The docker tag to use for cloudquery plugins (only)
        required: false
        type: string
      skip_cloudquery_plugins:
        description: Skip CloudQuery plugins (run dbt only)
        # Kept as a string on purpose: the steps compare this input against
        # the string 'true'.
        type: string
        default: 'false'
        required: false

  schedule:
    # Schedule every day at 2AM UTC. This is so we ensure anything that is
    # committed daily has completed writing from whatever data source. This
    # likely isn't necessary in the future if we do everything incrementally
    - cron: '0 2 * * *'
jobs:
  # Single job: optionally runs the CloudQuery plugins, then runs dbt for the
  # production and playground targets and finally runs bq2cloudsql.
  warehouse-run-data-pipeline:
    name: warehouse-run-data-pipeline
    environment: indexer
    runs-on: ubuntu-latest

    permissions:
      contents: 'read'
      id-token: 'write'

    env:
      # Use the manually supplied docker tag when one is given, otherwise fall
      # back to the SHA of the commit that triggered the run.
      DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: 'Login to GitHub Container Registry'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): this action is pinned to the mutable `main` branch;
      # consider pinning to a release tag or commit SHA.
      - name: "Setup Python, Poetry and Dependencies"
        uses: packetcoders/action-setup-cache-python-poetry@main
        with:
          # Quoted so the versions are read as strings, not numbers.
          python-version: '3.12'
          poetry-version: '1.7.1'

      - name: Run poetry install
        run: |
          poetry install

      # At this time this auth isn't working for dbt
      # - uses: 'google-github-actions/auth@v2'
      #   with:
      #     service_account: [email protected]
      #     workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
      #     create_credentials_file: true
      #     access_token_lifetime: 3600s

      - name: 'Authenticate to Google Cloud'
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
          create_credentials_file: true

      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
        with:
          version: '>= 363.0.0'

      - name: Download and install cloudquery
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        # --fail makes curl exit non-zero on an HTTP error instead of saving
        # the error page as the "binary".
        run: |
          curl --fail --location \
            "https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64" \
            --output /tmp/cloudquery
          chmod a+x /tmp/cloudquery
          mv /tmp/cloudquery /usr/local/bin/cloudquery

      # For now this is a bit of a hack for the oss-directory plugins as the
      # output from one plugin is the input to another. Ideally we would simply
      # tell whatever system to run and it will handle dependencies.
      - name: Run cloudquery for oss-directory
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console

      - name: Concat the project jsonl files (if there are many)
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          ls -laht "${CLOUDQUERY_FILE_DIRECTORY}/"
          find "${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd" -name "*.json" -type f -exec cat {} + \
            > "${CLOUDQUERY_FILE_DIRECTORY}/projects.json"
          head -n 5 "${CLOUDQUERY_FILE_DIRECTORY}/projects.json"

      - name: Upload projects.json
        uses: actions/upload-artifact@v4
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        with:
          name: projects.json
          path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json

      # This is currently a punt on how to run this properly because a previous
      # cloudquery plugin's output can't be used as input into a different one.
      # We start the github-resolve-repos container with a volume that can
      # access the project file and then run `cloudquery sync` while it is up.
      # Ideally we'd either have a plugin that can act as both a
      # destination/source (so we can chain multiple plugins), or potentially
      # we use something else that can achieve a similar thing.
      - name: Run cloudquery for github-resolve-directory
        if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
        run: |
          docker run -d --rm -p 7777:7777 \
            -v "${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY}" \
            --name github-resolve-repos \
            "ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG}" \
            serve --address 0.0.0.0:7777
          # Stop the plugin container whether or not the sync succeeds.
          trap 'docker stop github-resolve-repos' EXIT
          cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console

      - name: Setup dbt
        # GOOGLE_APPLICATION_CREDENTIALS is expected to point at the
        # credentials file requested from the auth step above
        # (create_credentials_file: true).
        run: |
          bash .github/scripts/create-dbt-profile.sh "${GOOGLE_APPLICATION_CREDENTIALS}"
          gcloud auth list

      - name: Run dbt for production
        run: |
          poetry run dbt run --target production

      - name: Run dbt for the playground
        run: |
          poetry run dbt run --target playground

      - name: Copy the bigquery tables to cloudsql
        run: |
          poetry run bq2cloudsql
        env:
          DBT_TARGET: production