From 7f0c96df43405e9cf4cc98728c60718097c9fc75 Mon Sep 17 00:00:00 2001 From: Reuven Gonzales Date: Fri, 19 Apr 2024 08:09:56 -0700 Subject: [PATCH] Creates a generic meltano pipeline to bigquery (#1250) * update meltano path * fix * wip mapping * add additional user to psql db * fix mapping * start cloudsql proxy manually * fix * fix * fix * fix * fix * fix * ignore clean up errors * more fixes * fix * fix * fix * fix --- .github/workflows/meltano/action.yml | 46 +++++++++++++++++-- .../warehouse-meltano-ecosystems-ost.yml | 11 +++-- ops/tf-modules/warehouse/main.tf | 25 ++++++++++ ops/tf-modules/warehouse/outputs.tf | 4 ++ warehouse/meltano-setup/meltano.yml | 5 +- 5 files changed, 81 insertions(+), 10 deletions(-) diff --git a/.github/workflows/meltano/action.yml b/.github/workflows/meltano/action.yml index a5857fb19..fe7c11b12 100644 --- a/.github/workflows/meltano/action.yml +++ b/.github/workflows/meltano/action.yml @@ -13,26 +13,62 @@ inputs: meltano_path: description: The path to meltano required: true + meltano_state_backend_uri: + description: The state backend uri for meltano + required: true + google_credentials_path: + description: The path to the application secrets + required: true + db_instance_id: + description: DB instance Id + required: true + db_user: + description: DB password + required: true + db_name: + description: DB password + required: true + db_password: + description: DB password + required: true runs: using: "composite" steps: - uses: actions/setup-python@v5 with: - python-version: '3.10.13' + python-version: '3.11.9' - uses: snok/install-poetry@v1 with: version: '1.7.1' + + - name: Runner temp + shell: bash + run: | + mkdir -p ${{ runner.temp }}/meltano-tmp + + - name: Run cloudsql proxy + shell: bash + run: | + curl -o cloud-sql-proxy https://storage.googleapis.com/cloud-sql-connectors/cloud-sql-proxy/v2.11.0/cloud-sql-proxy.linux.amd64 && + chmod +x cloud-sql-proxy + ./cloud-sql-proxy ${{ inputs.db_instance_id }} & + - name: Install poetry deps for meltano shell: bash run: | - cd ${{ inputs.meltano_path }} && poetry install && meltano install + cd ${{ inputs.meltano_path }} && poetry install --no-root && poetry run meltano install - name: Run tap:${{ inputs.tap }} into target:${{ inputs.target }} shell: bash # This is for testing for now run: | - env - - \ No newline at end of file + env && + cd ${{ inputs.meltano_path }} && + poetry run meltano run ${{ inputs.tap }} ${{ inputs.target }} + env: + MELTANO_DATABASE_URI: postgresql+psycopg://${{ inputs.db_user }}:${{ inputs.db_password }}@127.0.0.1:5432/${{ inputs.db_name }} + MELTANO_STATE_BACKEND_URI: ${{ inputs.meltano_state_backend_uri }} + TMPDIR: ${{ runner.temp }}/meltano-tmp + \ No newline at end of file diff --git a/.github/workflows/warehouse-meltano-ecosystems-ost.yml b/.github/workflows/warehouse-meltano-ecosystems-ost.yml index b4ffce193..7161b790e 100644 --- a/.github/workflows/warehouse-meltano-ecosystems-ost.yml +++ b/.github/workflows/warehouse-meltano-ecosystems-ost.yml @@ -8,7 +8,7 @@ env: TAP_ECOSYSTEMS_OST_AIRBYTE_CONFIG_DATABASE: ${{ secrets.TAP_ECOSYSTEMS_OST_AIRBYTE_CONFIG_DATABASE }} TAP_ECOSYSTEMS_OST_AIRBYTE_CONFIG_PORT: ${{ secrets.TAP_ECOSYSTEMS_OST_AIRBYTE_CONFIG_PORT }} TAP_ECOSYSTEMS_OST_AIRBYTE_CONFIG_USERNAME: ${{ secrets.TAP_ECOSYSTEMS_OST_AIRBYTE_CONFIG_USERNAME }} - MELTANO_STATE_BACKEND_URI: ${{ secrets.MELTANO_STATE_BACKEND_URI }} + TARGET_BIGQUERY_PROJECT: ${{ vars.TARGET_BIGQUERY_PROJECT }} on: workflow_dispatch: @@ -39,8 +39,13 @@ jobs: with: tap: tap-ecosystems-ost target: target-bigquery + meltano_path: warehouse/meltano-setup + db_user: ${{ secrets.METADATA_DB_USER }} + db_name: ${{ secrets.METADATA_DB_NAME }} + db_password: ${{ secrets.METADATA_DB_PASSWORD }} + db_instance_id: ${{ secrets.METADATA_DB_INSTANCE_ID }} + meltano_state_backend_uri: ${{ secrets.MELTANO_STATE_BACKEND_URI }} env: TARGET_BIGQUERY_CREDENTIALS_PATH: ${{ env.GOOGLE_APPLICATION_CREDENTIALS }} TARGET_BIGQUERY_BUCKET: oso-dataset-transfer-bucket - TARGET_BIGQUERY_PROJECT: opensource-observer - TARGET_BIGQUERY_DATASET: ecosystems-ost \ No newline at end of file + TARGET_BIGQUERY_DATASET: ecosystems_ost \ No newline at end of file diff --git a/ops/tf-modules/warehouse/main.tf b/ops/tf-modules/warehouse/main.tf index 9d1b32261..c26dd8049 100644 --- a/ops/tf-modules/warehouse/main.tf +++ b/ops/tf-modules/warehouse/main.tf @@ -19,6 +19,8 @@ locals { readonly_service_account_name = "${var.name}-readonly" cloudsql_name = "${var.name}-psql" cloudsql_db_user = "${var.name}-admin" + metadata_db_name = "${var.name}-metadata" + metadata_db_user = "${var.name}-metadata-admin" dataset_id = replace(var.name, "-", "_") raw_dataset_id = replace("${var.name}_raw_sources", "-", "_") } @@ -108,6 +110,11 @@ resource "google_storage_bucket" "dataset_transfer" { uniform_bucket_level_access = true } +resource "random_password" "metadata_user_temp_password" { + length = 24 + special = true +} + ### # CloudSQL instance ### @@ -125,6 +132,24 @@ module "warehouse_cloudsql" { dw_name = var.name } ip_configuration = var.cloudsql_ip_configuration + + additional_databases = [ + { + name = local.metadata_db_name + charset = "UTF8" + collation = "en_US.UTF8" + }, + ] + + # At the moment this user needs to be manual configured with permissions to + # the metadata database + additional_users = [ + { + name = local.metadata_db_user, + password = random_password.metadata_user_temp_password.result + random_password = false + } + ] } ### diff --git a/ops/tf-modules/warehouse/outputs.tf b/ops/tf-modules/warehouse/outputs.tf index 1f3cd171d..4d5b001a4 100644 --- a/ops/tf-modules/warehouse/outputs.tf +++ b/ops/tf-modules/warehouse/outputs.tf @@ -1,3 +1,7 @@ output "dataset_id" { value = google_bigquery_dataset.dataset.id } + +output "metadata_user_temp_password" { + value = random_password.metadata_user_temp_password.result +} diff --git a/warehouse/meltano-setup/meltano.yml b/warehouse/meltano-setup/meltano.yml index a8f8cf05f..390c3fe85 100644 --- a/warehouse/meltano-setup/meltano.yml +++ b/warehouse/meltano-setup/meltano.yml @@ -15,7 +15,7 @@ plugins: pip_url: git+https://github.com/opensource-observer/tap-airbyte-wrapper.git config: nullable_generated_fields: - - "*._ab_cdc_deleted_at" + - '*._ab_cdc_deleted_at' airbyte_spec: image: airbyte/source-postgres tag: 3.3.26 @@ -31,6 +31,7 @@ plugins: publication: oso_publication replication_slot: oso_slot initial_waiting_seconds: 10 + force_docker_as_current_user: true select: - projects.* - issues.* @@ -43,7 +44,7 @@ plugins: pip_url: git+https://github.com/opensource-observer/tap-airbyte-wrapper.git config: nullable_generated_fields: - - "*._ab_cdc_deleted_at" + - '*._ab_cdc_deleted_at' airbyte_spec: image: airbyte/source-postgres tag: 3.3.26