Spike data export for restoring prod db to local #2469

Open · wants to merge 10 commits into base: master · Changes from 1 commit
1 change: 1 addition & 0 deletions deploy/data_exporter/.gitignore
@@ -0,0 +1 @@
.aws-sam
Empty file.
14 changes: 14 additions & 0 deletions deploy/data_exporter/data_export_function/Dockerfile
@@ -0,0 +1,14 @@
FROM public.ecr.aws/docker/library/ubuntu:24.04

RUN apt update && \
apt install -y postgresql-client-16 python3.12 python3-pip curl unzip && \
rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip3 install --break-system-packages -r requirements.txt
RUN pip3 install --break-system-packages awslambdaric

COPY . .

ENTRYPOINT ["python3", "-m", "awslambdaric" ]
CMD [ "app.lambda_handler" ]
178 changes: 178 additions & 0 deletions deploy/data_exporter/data_export_function/app.py
@@ -0,0 +1,178 @@
import os
import subprocess
from datetime import datetime, timedelta, timezone

import boto3
import psycopg
from psycopg import sql

ssm = boto3.client("ssm")
s3 = boto3.client("s3", region_name="eu-west-1")
bucket_name = "dc-ynr-short-term-backups"
Member:

Presumably this S3 bucket is just manually provisioned.
If we want to roll this out to another project, we just manually make another bucket in the relevant AWS account?

Also I'd probably make bucket_name BUCKET_NAME here.

Member Author:

Yeah. This bucket already existed so I just used that. I didn't include it in SAM for that reason, but there's no particular reason we can't set up a bucket in SAM.

I guess we can also have a single DC-wide bucket for this: the user doesn't need bucket write permissions because of the pre-signed URL, so there's no problem with, e.g., limiting a user to only getting backups from one project but not others.
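
For illustration only (not part of this PR): anyone holding the pre-signed URL can fetch that one object over plain HTTPS, with no AWS credentials or bucket permissions at all. A minimal sketch, with a made-up URL:

import shutil
import urllib.request

# Hypothetical pre-signed URL returned by the Lambda; it embeds a time-limited
# signature for a single GET on a single key, so it grants no other bucket access.
presigned_url = "https://dc-ynr-short-term-backups.s3.eu-west-1.amazonaws.com/ynr-export-example.dump?X-Amz-Signature=..."

with urllib.request.urlopen(presigned_url) as resp, open("ynr-export-example.dump", "wb") as out:
    shutil.copyfileobj(resp, out)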

current_time = datetime.now().isoformat()
PREFIX = "ynr-export"
FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump"


def get_parameter(name):
response = ssm.get_parameter(Name=name)
return response["Parameter"]["Value"]


SOURCE_DATABASE = "ynr"
TMP_DATABASE_NAME = "ynr-for-dev-export"
DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST")
DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME")
DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD")
Comment on lines +21 to +25
Member:

I don't want to get too into prematurely trying to generalise. To start with, let's just copy & paste code to the 3 repos where we need it.

That said...
There are parts of this that could potentially be a shared library, and parts that are application-specific.

For example, this block here, the scrubbing rules, etc. will be different for every project.

I think let's not get too bogged down in it at this point, but having implemented it in a few places it might be worth having a think about which bits ended up the same/different and whether it is worth trying to generalise any of it.

Member Author:

Actually I think the parameters could be generic. No reason we can't enforce some conventions, especially when we have one application per account. These settings are only at this path because we're still in a shared AWS account.
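
For example, a purely hypothetical sketch of such a convention (the APP_NAME environment variable and the generic parameter path are illustrative, not something this PR sets up):

import os

import boto3

ssm = boto3.client("ssm")
# Hypothetical convention: one application per AWS account, named via APP_NAME.
APP_NAME = os.environ.get("APP_NAME", "ynr")


def get_app_parameter(name):
    # Reads /ynr/production/POSTGRES_HOST today; the same code would read
    # /other-app/production/POSTGRES_HOST in another account.
    return ssm.get_parameter(Name=f"/{APP_NAME}/production/{name}")["Parameter"]["Value"]


DB_HOST = get_app_parameter("POSTGRES_HOST")
DB_USER = get_app_parameter("POSTGRES_USERNAME")
DB_PASSWORD = get_app_parameter("POSTGRES_PASSWORD")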

DB_PORT = "5432"
os.environ["PGPASSWORD"] = DB_PASSWORD


def get_db_conn(db_name):
conn = psycopg.connect(
dbname=db_name,
user=DB_USER,
password=DB_PASSWORD,
host=DB_HOST,
port=DB_PORT,
)
conn.autocommit = True
return conn


def create_database_from_template():
# Connect to the PostgreSQL server (usually to the 'postgres' database for administrative tasks)
conn = get_db_conn(SOURCE_DATABASE)
# Enable autocommit to run CREATE DATABASE commands
try:
with conn.cursor() as cur:
print(f"Deleting {TMP_DATABASE_NAME}")
cur.execute(
sql.SQL("DROP DATABASE IF EXISTS {};").format(
sql.Identifier(TMP_DATABASE_NAME)
)
)
with conn.cursor() as cur:
# SQL to create the new database from the template
print(f"Creating {TMP_DATABASE_NAME}")
cur.execute(
sql.SQL("CREATE DATABASE {} TEMPLATE {};").format(
sql.Identifier(TMP_DATABASE_NAME),
sql.Identifier(SOURCE_DATABASE),
)
)
print(
f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'."
)
except psycopg.Error as e:
print(f"Error creating database: {e}")
finally:
conn.close()


def clean_database():
conn = get_db_conn(db_name=TMP_DATABASE_NAME)
with conn.cursor() as cur:
print("Cleaning Users table")
cur.execute(
"""UPDATE auth_user SET
email = CONCAT('anon_', id, '@example.com'),
password = md5(random()::text);
"""
)
print("Cleaning Account email table")
cur.execute(
"""UPDATE auth_user SET
email = CONCAT('anon_', id, '@example.com');
"""
)
print("Cleaning IP addresses from LoggedActions")
cur.execute(
"""UPDATE candidates_loggedaction SET
ip_address = '127.0.0.1';
"""
)
print("Cleaning API tokens")
cur.execute(
"""UPDATE authtoken_token SET
key = md5(random()::text);
"""
)
print("Cleaning sessions")
cur.execute("""TRUNCATE TABLE django_session;""")


def dump_and_export():
dump_file = "/tmp/db_dump.sql" # Temporary file for the dump
Member:

I'd suggest using tempfile.NamedTemporaryFile for this rather than hard-coding the location.
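
Roughly like this, assuming the rest of dump_and_export() stays as in this PR (a sketch, not the committed change):

import subprocess
import tempfile

def dump_and_export():
    # NamedTemporaryFile puts the dump under the Lambda's /tmp and removes it
    # when the with-block exits, instead of hard-coding /tmp/db_dump.sql.
    with tempfile.NamedTemporaryFile(suffix=".dump") as tmp:
        subprocess.run(
            ["pg_dump", "-h", DB_HOST, "-U", DB_USER, "-d", TMP_DATABASE_NAME, "-Fc", "-f", tmp.name],
            check=True,
        )
        s3.upload_file(tmp.name, bucket_name, FILENAME)
    return s3.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket_name, "Key": FILENAME},
        ExpiresIn=3600,  # URL expires in 1 hour
    )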

Member Author:

Happy to do so, but to be clear this is writing to the Lambda temp directory, not the local dev one. That directory is limited to the invocation of the function, so there's no way it can ever interact with anything outside of the scope of the invocation.


# Database credentials and parameters

print("Run pg_dump to create the database dump")
try:
subprocess.run(
[
"pg_dump",
"-h",
DB_HOST,
"-U",
DB_USER,
"-d",
TMP_DATABASE_NAME,
"-Fc",
"-f",
dump_file,
],
check=True,
)

print("Upload the dump to S3")
s3.upload_file(dump_file, bucket_name, FILENAME)

print("Generate a presigned URL for downloading the dump")
presigned_url = s3.generate_presigned_url(
"get_object",
Params={"Bucket": bucket_name, "Key": FILENAME},
ExpiresIn=3600, # URL expires in 1 hour
)
print("Finished")
return presigned_url

except subprocess.CalledProcessError as e:
return f"Error generating database dump: {str(e)}"


def check_for_recent_exports():
"""
If we've exported a file in the last hour, don't export another one

"""
one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX)
if "Contents" in response:
recent_files = [
obj
for obj in response["Contents"]
if obj["LastModified"] >= one_hour_ago
]

recent_files.sort(key=lambda obj: obj["LastModified"], reverse=True)

if recent_files:
return s3.generate_presigned_url(
"get_object",
Params={"Bucket": bucket_name, "Key": recent_files[0]["Key"]},
ExpiresIn=3600, # URL expires in 1 hour
)
return None


def lambda_handler(event, context):
if recent_export := check_for_recent_exports():
return recent_export

print("Creating temp database")
create_database_from_template()
print("Cleaning temp database")
clean_database()
print("Dumping and exporting")
return dump_and_export()
2 changes: 2 additions & 0 deletions deploy/data_exporter/data_export_function/requirements.txt
@@ -0,0 +1,2 @@
boto3==1.35.56
psycopg[binary]==3.2.3
33 changes: 33 additions & 0 deletions deploy/data_exporter/samconfig.toml
@@ -0,0 +1,33 @@
# More information about the configuration file can be found here:
# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html
version = 0.1

[default.global.parameters]
stack_name = "ynr-data-exporter"

[default.build.parameters]
cached = true
parallel = true

[default.validate.parameters]
lint = true

[default.deploy.parameters]
capabilities = "CAPABILITY_IAM"
confirm_changeset = true
resolve_s3 = true
s3_prefix = "ynr-data-exporter"
region = "eu-west-2"
image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"]

[default.package.parameters]
resolve_s3 = true

[default.sync.parameters]
watch = true

[default.local_start_api.parameters]
warm_containers = "EAGER"

[default.local_start_lambda.parameters]
warm_containers = "EAGER"
49 changes: 49 additions & 0 deletions deploy/data_exporter/template.yaml
@@ -0,0 +1,49 @@
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Description: >
data_exporter

Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket

Globals:
Function:
Timeout: 600 # 10 minutes
MemorySize: 1024

LoggingConfig:
LogFormat: JSON
Resources:
DataExportFunction:
Type: AWS::Serverless::Function
Properties:
FunctionName: ynr-data-exporter
PackageType: Image
ImageUri: data_export_function
# Needs to be at least as big as the DB export, currently at around 350mb
EphemeralStorage:
Size: 1024
# Don't allow more than one export job to run at a time
ReservedConcurrentExecutions: 1
Policies:
- Statement:
- Sid: S3Access
Effect: Allow
Action:
- s3:*
Resource:
- 'arn:aws:s3:::dc-ynr-short-term-backups'
- 'arn:aws:s3:::dc-ynr-short-term-backups/*'
- Sid: SSM
Effect: Allow
Action:
- ssm:*
Resource:
- 'arn:aws:ssm:*:*:parameter/ynr/*'

Outputs:
DataExportFunction:
Description: Hello World Lambda Function ARN
Value: !GetAtt DataExportFunction.Arn
DataExportFunctionIamRole:
Description: Implicit IAM Role created for Hello World function
Value: !GetAtt DataExportFunctionRole.Arn
Comment on lines +43 to +49
Member:

Oh. Hi there, world 👋
Can we replace with sensible descriptions?

Member:

this one is still relevant

57 changes: 57 additions & 0 deletions scripts/get-prod-db.sh
@@ -0,0 +1,57 @@
#!/bin/sh
set -eux

# This script invokes an AWS Lambda function to retrieve a URL for downloading
# a cleaned version of the production database and then restores
# that data locally. By default the db name is "ynr-prod" but you can change the
# local name by passing it as the first argument to the script.
#
# This script requires access to the YNR production AWS account
#
# Usage:
# ./scripts/get-prod-db.sh [LOCAL_DB_NAME]
#
# Arguments:
# LOCAL_DB_NAME: Optional. Name of the local database to restore data to.
# Defaults to 'ynr-prod' if not specified.

# Configurable variables
LAMBDA_FUNCTION_NAME="ynr-data-exporter"
LOCAL_DB_NAME="${1:-ynr-prod}"

# Check for required tools
REQUIRED_TOOLS="aws dropdb createdb pg_restore wget"
for tool in $REQUIRED_TOOLS; do
if ! command -v "$tool" >/dev/null 2>&1; then
echo "Error: $tool is required but not installed." >&2
exit 1
fi
done

# Create a temporary file and set up clean up on script exit
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT

# Invoke AWS Lambda and store the result in the temp file
# The result is a presigned URL to the dump file on S3
echo "Invoking Lambda to get DB URL. This might take a few minutes..."
aws lambda invoke \
--function-name "$LAMBDA_FUNCTION_NAME" \
--cli-read-timeout=0 \
--no-cli-pager \
--output text \
--query 'Payload' \
"$TEMP_FILE"

# Extract the URL from the response
# This is because the response is quoted, so we just need to remove the quotation marks
URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")
Comment on lines +42 to +52
Member @chris48s (Nov 13, 2024):

I think we should add some error handling to make sure what we got back appears to be a URL. When I ran this, I got

URL={"errorMessage": "connection failed: connection to server at \"13.41.39.44\", port 5432 failed: FATAL:  database \"ynr-for-dev-export\" does not exist", "errorType": "OperationalError", "requestId": "9c15f87a-f4ca-4081-b3f5-ecd2a97aaf7c", "stackTrace": ["  File \"/app.py\", line 176, in lambda_handler\n    clean_database()\n", "  File \"/app.py\", line 74, in clean_database\n    conn = get_db_conn(db_name=TMP_DATABASE_NAME)\n", "  File \"/app.py\", line 32, in get_db_conn\n    conn = psycopg.connect(\n", "  File \"/usr/local/lib/python3.12/dist-packages/psycopg/connection.py\", line 119, in connect\n    raise last_ex.with_traceback(None)\n"]}

but the script just tried to plough on regardless

Member Author:

I forgot to pull the commit out (I imagine we'll squash them all later anyway), but this is done here: 51b3570#diff-388985bd50a1bb1aefbf516845f5019a2f244d5ba5cc1f10f0f6df6e98dddc1fR53

echo "Got URL: $(URL)"

echo "Dropping DB $(LOCAL_DB_NAME)"
dropdb --if-exists "$LOCAL_DB_NAME"
echo "Creating DB $(LOCAL_DB_NAME)"
createdb "$LOCAL_DB_NAME"
Member:

If I try to run this locally (using AWS_PROFILE=sym-roe scripts/get-prod-db.sh), I get

+ dropdb --if-exists ynr-prod
dropdb: error: connection to server on socket "/var/run/postgresql/.s.PGSQL.5432" failed: FATAL:  role "chris" does not exist

Normally, I would sudo -u postgres dropdb or sudo -u postgres createdb <dbname> in order to drop or create a DB. Do you have your local user set up as a postgres admin, or do you run the whole script as postgres? I wonder if it would be useful to allow the user to pass an option to use sudo here?

Member Author:

I reckon this should be sorted with 51b3570


echo "Downloading and restoring DB $(LOCAL_DB_NAME)"
wget -qO- "$URL" | pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges