Merge pull request #15 from moj-analytical-services/security-update
Security update
isichei authored Feb 8, 2019
2 parents 07e120b + 6d01108 commit ea4059a
Showing 10 changed files with 50 additions and 35 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -12,7 +12,8 @@ RoxygenNote: 6.0.1
 Imports:
     reticulate (>= 1.10),
     s3tools,
-    readr,
+    readr
+Suggests:
     data.table (>= 1.11.8)
 Remotes:
     moj-analytical-services/s3tools
1 change: 0 additions & 1 deletion NAMESPACE
@@ -2,7 +2,6 @@
 
 export(get_athena_query_response)
 export(read_sql)
-import(data.table)
 import(readr)
 import(reticulate)
 import(s3tools)
10 changes: 3 additions & 7 deletions R/get_athena_query_response.R
@@ -10,10 +10,6 @@
 #'
 #'@param sql_query A string specifying the SQL query you want to send to Athena. See the package's GitHub README for info on the flavour of SQL Athena uses.
 #'
-#'@param bucket The s3 bucket the query results will be written into. You must have read and write permissions to this folder.
-#'
-#'@param output_folder The folder path where you want your athena query to be written to. If not specified the output folder is "__athena_temp__" which is recommended.
-#'
 #'@param return_athena_types Specifies whether the list describing the data's metadata types should be defined using Athena datatypes (TRUE) or the data engineering team's generic metadata types (FALSE). Defaults to FALSE.
 #'
 #'@param timeout Specifies how long (in seconds) your SQL query should wait before giving up. Defaults to NULL, meaning the query will not time out and could wait forever if an issue occurs.
@@ -22,7 +18,7 @@
 #'
 #'@examples
 #'# Send an SQL query to Athena and get the query response
-#'response <- dbtools::get_athena_query_response("SELECT * from crest_v1.flatfile limit 10000", "my-bucket")
+#'response <- dbtools::get_athena_query_response("SELECT * from crest_v1.flatfile limit 10000")
 #'
 #'# print out path to athena query output (as a csv)
 #'print(response$s3_path)
@@ -34,14 +30,14 @@
 #'s3_path_stripped = gsub("s3://", "", response$s3_path)
 #'df <- s3tools::read_using(FUN = read.csv, s3_path=s3_path_stripped)
 
-get_athena_query_response <- function(sql_query, bucket, output_folder="__athena_temp__/", return_athena_types=FALSE, timeout = NULL){
+get_athena_query_response <- function(sql_query, return_athena_types=FALSE, timeout = NULL){
 
   # Annoyingly I think you have to pull it in as the source_python function doesn't seem to be exported properly
   # require(reticulate)
 
   python_script <- system.file("extdata", "get_athena_query_response.py", package = "dbtools")
   reticulate::source_python(python_script)
   s3tools::get_credentials()
-  response <- get_athena_query_response(sql_query=sql_query, bucket=bucket, output_folder=output_folder, return_athena_types=return_athena_types, timeout=timeout)
+  response <- get_athena_query_response(sql_query=sql_query, return_athena_types=return_athena_types, timeout=timeout)
   return(response)
 }
17 changes: 9 additions & 8 deletions R/read_sql.R
@@ -2,7 +2,7 @@
 #'
 #'@description Uses boto3 (in Python) to send an SQL query to Athena and return an R dataframe, tibble or data.table based on user preference.
 #'
-#'@import reticulate s3tools readr data.table
+#'@import reticulate s3tools readr
 #'
 #'@export
 #'
@@ -11,10 +11,6 @@
 #'
 #'@param sql_query A string specifying the SQL query you want to send to Athena. See the package's GitHub README for info on the flavour of SQL Athena uses.
 #'
-#'@param bucket The s3 bucket the query results will be written into. You must have read and write permissions to this folder.
-#'
-#'@param output_folder The folder path where you want your athena query to be written to. If not specified the output folder is "__athena_temp__" which is recommended.
-#'
 #'@param return_df_as A string specifying what the table should be returned as: 'dataframe' (reads data using read.csv), 'tibble' (reads data using readr::read_csv) or 'data.table' (reads data using data.table::fread). Default is 'tibble'; note that not all returned tables are of class data.frame.
 #' Only return_df_as set to 'tibble' maintains date and datetime formats; 'dataframe' and 'data.table' will convert dates and datetimes to characters.
 #'
@@ -24,10 +20,10 @@
 #'
 #'@examples
 #'# Read an SQL query using readr::read_csv i.e. returning a tibble
-#'df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000", 'my-bucket')
+#'df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000")
 #'df
 
-read_sql <- function(sql_query, bucket, output_folder="__athena_temp__/", return_df_as='tibble', timeout = NULL){
+read_sql <- function(sql_query, return_df_as='tibble', timeout = NULL){
 
   # Annoyingly I think you have to pull it in as the source_python function doesn't seem to be exported properly
   # require(reticulate)
@@ -37,8 +33,9 @@ read_sql <- function(sql_query, bucket, output_folder="__athena_temp__/", return_df_as='tibble', timeout = NULL){
     stop("input var return_df_as must be one of the following 'dataframe', 'tibble' or 'data.table'")
   }
 
-  response <- dbtools::get_athena_query_response(sql_query=sql_query, bucket=bucket, output_folder=output_folder, return_athena_types=FALSE, timeout=timeout)
+  response <- dbtools::get_athena_query_response(sql_query=sql_query, return_athena_types=FALSE, timeout=timeout)
   s3_path_stripped <- gsub("s3://", "", response$s3_path)
+  bucket <- unlist(strsplit(s3_path_stripped, '/'))[1]
   s3_key <- gsub(paste0(bucket,"/"), "", s3_path_stripped)
 
   data_conversion <- dbtools:::get_data_conversion(return_df_as)
@@ -54,6 +51,10 @@ read_sql <- function(sql_query, bucket, output_folder="__athena_temp__/", return_df_as='tibble', timeout = NULL){
     df <- s3tools::read_using(FUN=readr::read_csv, s3_path=s3_path_stripped, col_names=TRUE, col_types=col_types)
 
   } else if(return_df_as == 'data.table'){
+    dt_ver <- packageVersion("data.table")
+    if(dt_ver < '1.11.8'){
+      warning("Your version of data.table must be 1.11.8 or above. Please install a newer version, otherwise outputs of type data.table may not convert data types properly.")
+    }
     df <- s3tools::read_using(FUN=data.table::fread, s3_path=s3_path_stripped, header=TRUE, colClasses=col_classes_vec)
   } else {
     df <- s3tools::read_using(FUN=read.csv, s3_path=s3_path_stripped, header=TRUE, colClasses=col_classes_vec)
12 changes: 11 additions & 1 deletion R/utils.R
@@ -21,4 +21,14 @@ delete_object <- function(bucket, key){
   reticulate::source_python(python_script)
   s3tools::get_credentials()
   delete_object(bucket=bucket, key=key)
-}
+}
+
+get_iam_role <- function(){
+  user <- Sys.getenv("USER")
+  if(user == ""){
+    stop("Could not find username in env vars. Please raise an issue on the GitHub repo for dbtools.")
+  }
+  iam_role <- paste0("alpha_user_", user)
+  return(iam_role)
+}
10 changes: 10 additions & 0 deletions README.md
@@ -117,9 +117,19 @@ print(response$meta)
- To specify a string literal in your SQL query, always use single quotes ('') rather than double quotes (""). Double quotes are interpreted as references to a database, table or column (see the example below).
- When data is pulled back into RStudio, the column types are either R characters (for any column that was a date, datetime or character) or doubles (for everything else).
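
For example (a sketch; `some_col` and `'some_value'` are placeholder names):

```r
# Single quotes make a string literal; double quotes would be read as identifiers
df <- dbtools::read_sql("SELECT * FROM crest_v1.flatfile WHERE some_col = 'some_value' LIMIT 10")
```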

#### Under The Hood

When you run an SQL query against our databases, you are using Athena. When Athena produces the output of an SQL query, it always writes it to a location in S3 as a CSV. dbtools derives that S3 location from your AWS role: it writes the output CSV into a folder only you have read/write access to, then reads it in using `s3tools`. Once the data has been read into a dataframe, dbtools deletes the CSV from your folder.

**Note:** dbtools requires you to have the StandardDatabaseAccess group policy attached. If you want to use dbtools, please ask the data engineering team (on Slack, ideally via the #analytical_platform channel).
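
A rough sketch of that flow in R (illustrative only; the bucket name and key layout come from this commit's Python helper and may change):

```r
# Run the query; Athena writes the results to a per-user S3 location, e.g.
# s3://alpha-athena-query-dump/<your-sts-user-id>/__athena_temp__/<query-id>.csv
response <- dbtools::get_athena_query_response("SELECT * from crest_v1.flatfile limit 10")

# Read the CSV back with s3tools (read_sql does this for you, then deletes the CSV)
s3_path_stripped <- gsub("s3://", "", response$s3_path)
df <- s3tools::read_using(FUN = readr::read_csv, s3_path = s3_path_stripped)
```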

#### Changelog:

## v2.0.0 - 2019-02-07

- Removed input parameters `bucket` and `output_folder` from `read_sql` and `get_athena_query_response`; see the before/after sketch below. The new README section 'Under The Hood' explains why.
- Note: the package now requires the group policy `StandardDatabaseAccess` to be attached to any role that uses this package.
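
A minimal before/after for existing callers (taken from the updated examples in this commit):

```r
# v1.x (bucket had to be passed explicitly):
# df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000", 'my-bucket')

# v2.0.0 (the output location is derived from your role automatically):
df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000")
```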

## v1.0.0 - 2019-01-14

- Added function `read_sql` which reads an SQL query directly into an R dataframe. See R documentation (i.e. `?read_sql`)
2 changes: 1 addition & 1 deletion inst/extdata/boto_utils.py
@@ -2,4 +2,4 @@
 
 def delete_object(bucket, key):
     s3_client = boto3.client('s3')
-    s3_client.delete_object(Bucket=bucket, Key=key)
+    s3_client.delete_object(Bucket=bucket, Key=key)
12 changes: 10 additions & 2 deletions inst/extdata/get_athena_query_response.py
@@ -4,7 +4,7 @@
 import time
 import os
 
-def get_athena_query_response(sql_query, bucket, output_folder = "__athena_temp__/", return_athena_types = False, timeout = None) :
+def get_athena_query_response(sql_query, return_athena_types = False, timeout = None) :
 
     type_dictionary = {
         "char" : "character",
@@ -18,10 +18,18 @@ def get_athena_query_response(sql_query, bucket, output_folder = "__athena_temp__/", return_athena_types = False, timeout = None) :
         "double" : "double"
     }
 
-    out_path = os.path.join('s3://', bucket, output_folder)
+    # Get role specific path for athena output
+    bucket = "alpha-athena-query-dump"
+
+    sts_client = boto3.client('sts')
+    sts_resp = sts_client.get_caller_identity()
+
+    out_path = os.path.join('s3://', bucket, sts_resp['UserId'], "__athena_temp__/")
+
     if out_path[-1] != '/':
         out_path += '/'
 
+    # Run the athena query
     athena_client = boto3.client('athena', 'eu-west-1')
     response = athena_client.start_query_execution(
         QueryString=sql_query,
9 changes: 2 additions & 7 deletions man/get_athena_query_response.Rd

Some generated files are not rendered by default.

9 changes: 2 additions & 7 deletions man/read_sql.Rd

Some generated files are not rendered by default.
