Merge pull request #15 from moj-analytical-services/security-update
Security update
isichei authored Feb 8, 2019
2 parents 07e120b + 6d01108 commit ea4059a
Showing 10 changed files with 50 additions and 35 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -12,7 +12,8 @@ RoxygenNote: 6.0.1
 Imports:
     reticulate (>= 1.10),
     s3tools,
-    readr,
+    readr
+Suggests:
     data.table (>= 1.11.8)
 Remotes:
     moj-analytical-services/s3tools
1 change: 0 additions & 1 deletion NAMESPACE
@@ -2,7 +2,6 @@
 
 export(get_athena_query_response)
 export(read_sql)
-import(data.table)
 import(readr)
 import(reticulate)
 import(s3tools)
10 changes: 3 additions & 7 deletions R/get_athena_query_response.R
@@ -10,10 +10,6 @@
 #'
 #'@param sql_query A string specifying the SQL query you want to send to Athena. See the package's GitHub README for info on the flavour of SQL Athena uses.
 #'
-#'@param bucket The s3 bucket the query results will be written into. You must have read and write permissions to this folder.
-#'
-#'@param output_folder The folder path where you want your athena query to be written to. If not specified the output folder is "__athena_temp__" which is recommended.
-#'
 #'@param return_athena_types Specifies whether the list describing the data's metadata types should be defined using Athena datatypes (TRUE) or the data engineering team's generic metadata types (FALSE). Defaults to FALSE.
 #'
 #'@param timeout Specifies how long (in seconds) your SQL query should wait before giving up. Defaults to NULL, meaning the query will not time out and could wait forever if an issue occurs.
@@ -22,7 +18,7 @@
 #'
 #'@examples
 #'# Send an SQL query to Athena and get the query response
-#'response <- dbtools::get_athena_query_response("SELECT * from crest_v1.flatfile limit 10000", "my-bucket")
+#'response <- dbtools::get_athena_query_response("SELECT * from crest_v1.flatfile limit 10000")
 #'
 #'# print out path to athena query output (as a csv)
 #'print(response$s3_path)
@@ -34,14 +30,14 @@
 #'s3_path_stripped = gsub("s3://", "", response$s3_path)
 #'df <- s3tools::read_using(FUN = read.csv, s3_path=s3_path_stripped)
 
-get_athena_query_response <- function(sql_query, bucket, output_folder="__athena_temp__/", return_athena_types=FALSE, timeout = NULL){
+get_athena_query_response <- function(sql_query, return_athena_types=FALSE, timeout = NULL){
 
   # Annoyingly I think you have to pull it in as the source_python function doesn't seem to be exported properly
   # require(reticulate)
 
   python_script <- system.file("extdata", "get_athena_query_response.py", package = "dbtools")
   reticulate::source_python(python_script)
   s3tools::get_credentials()
-  response <- get_athena_query_response(sql_query=sql_query, bucket=bucket, output_folder=output_folder, return_athena_types=return_athena_types, timeout=timeout)
+  response <- get_athena_query_response(sql_query=sql_query, return_athena_types=return_athena_types, timeout=timeout)
   return(response)
 }
17 changes: 9 additions & 8 deletions R/read_sql.R
@@ -2,7 +2,7 @@
 #'
 #'@description Uses boto3 (in Python) to send an SQL query to Athena and return an R dataframe, tibble or data.table based on user preference.
 #'
-#'@import reticulate s3tools readr data.table
+#'@import reticulate s3tools readr
 #'
 #'@export
 #'
@@ -11,10 +11,6 @@
 #'
 #'@param sql_query A string specifying the SQL query you want to send to Athena. See the package's GitHub README for info on the flavour of SQL Athena uses.
 #'
-#'@param bucket The s3 bucket the query results will be written into. You must have read and write permissions to this folder.
-#'
-#'@param output_folder The folder path where you want your athena query to be written to. If not specified the output folder is "__athena_temp__" which is recommended.
-#'
 #'@param return_df_as A string specifying what the table should be returned as: 'dataframe' (reads data using read.csv), 'tibble' (reads data using readr::read_csv) or 'data.table' (reads data using data.table::fread). Default is 'tibble'; note that not all returned tables are of class data.frame.
 #' Only return_df_as set to 'tibble' maintains date and datetime formats; 'dataframe' and 'data.table' will convert dates and datetimes to characters.
 #'
@@ -24,10 +20,10 @@
 #'
 #'@examples
 #'# Read an SQL query using readr::read_csv i.e. returning a tibble
-#'df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000", 'my-bucket')
+#'df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000")
 #'df
 
-read_sql <- function(sql_query, bucket, output_folder="__athena_temp__/", return_df_as='tibble', timeout = NULL){
+read_sql <- function(sql_query, return_df_as='tibble', timeout = NULL){
 
   # Annoyingly I think you have to pull it in as the source_python function doesn't seem to be exported properly
   # require(reticulate)
@@ -37,8 +33,9 @@ read_sql <- function(sql_query, bucket, output_folder="__athena_temp__/", return_df_as='tibble', timeout = NULL){
     stop("input var return_df_as must be one of the following 'dataframe', 'tibble' or 'data.table'")
   }
 
-  response <- dbtools::get_athena_query_response(sql_query=sql_query, bucket=bucket, output_folder=output_folder, return_athena_types=FALSE, timeout=timeout)
+  response <- dbtools::get_athena_query_response(sql_query=sql_query, return_athena_types=FALSE, timeout=timeout)
   s3_path_stripped <- gsub("s3://", "", response$s3_path)
+  bucket <- unlist(strsplit(s3_path_stripped, '/'))[1]
   s3_key <- gsub(paste0(bucket,"/"), "", s3_path_stripped)
 
   data_conversion <- dbtools:::get_data_conversion(return_df_as)
@@ -54,6 +51,10 @@ read_sql <- function(sql_query, bucket, output_folder="__athena_temp__/", return_df_as='tibble', timeout = NULL){
     df <- s3tools::read_using(FUN=readr::read_csv, s3_path=s3_path_stripped, col_names=TRUE, col_types=col_types)
 
   } else if(return_df_as == 'data.table'){
+    dt_ver <- packageVersion("data.table")
+    if(dt_ver < '1.11.8'){
+      warning("Your version of data.table must be 1.11.8 or above. Please install a newer version, otherwise outputs of type data.table may not convert data types properly.")
+    }
     df <- s3tools::read_using(FUN=data.table::fread, s3_path=s3_path_stripped, header=TRUE, colClasses=col_classes_vec)
   } else {
     df <- s3tools::read_using(FUN=read.csv, s3_path=s3_path_stripped, header=TRUE, colClasses=col_classes_vec)
12 changes: 11 additions & 1 deletion R/utils.R
@@ -21,4 +21,14 @@ delete_object <- function(bucket, key){
   reticulate::source_python(python_script)
   s3tools::get_credentials()
   delete_object(bucket=bucket, key=key)
-}
+}
+
+get_iam_role <- function(){
+  user <- Sys.getenv("USER")
+  if(user == ""){
+    stop("Could not find username in env vars. Please raise an issue on the GitHub repo for dbtools.")
+  }
+  iam_role <- paste0("alpha_user_", user)
+  return(iam_role)
+}
10 changes: 10 additions & 0 deletions README.md
@@ -117,9 +117,19 @@ print(response$meta)
- To specify a string literal in your SQL query, always use single quotes ('') rather than double quotes (""). Double quotes are interpreted as references to a database, table or column (see the example below).
- When data is pulled back into RStudio, the column types are either R characters (for any column that was a date, datetime or character) or doubles (for everything else).
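
For example (a sketch; `some_col` and `'some_value'` are placeholder names):

```r
# Single quotes make a string literal; double quotes would be read as identifiers
df <- dbtools::read_sql("SELECT * FROM crest_v1.flatfile WHERE some_col = 'some_value' LIMIT 10")
```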

#### Under The Hood

When you run an SQL query against our databases, you are using Athena. When Athena produces the output of an SQL query, it always writes it to a location in S3 as a CSV. dbtools derives that S3 location from your AWS role: it writes the output CSV into a folder only you have read/write access to, then reads it in using `s3tools`. Once the data has been read into a dataframe, dbtools deletes the CSV from your folder.

**Note:** dbtools requires you to have the StandardDatabaseAccess group policy attached. If you want to use dbtools, please ask the data engineering team (on Slack, ideally via the #analytical_platform channel).
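
A rough sketch of that flow in R (illustrative only; the bucket name and key layout come from this commit's Python helper and may change):

```r
# Run the query; Athena writes the results to a per-user S3 location, e.g.
# s3://alpha-athena-query-dump/<your-sts-user-id>/__athena_temp__/<query-id>.csv
response <- dbtools::get_athena_query_response("SELECT * from crest_v1.flatfile limit 10")

# Read the CSV back with s3tools (read_sql does this for you, then deletes the CSV)
s3_path_stripped <- gsub("s3://", "", response$s3_path)
df <- s3tools::read_using(FUN = readr::read_csv, s3_path = s3_path_stripped)
```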

#### Changelog:

## v2.0.0 - 2019-02-07

- Removed input parameters `bucket` and `output_folder` from `read_sql` and `get_athena_query_response`; see the before/after sketch below. The new README section 'Under The Hood' explains why.
- Note: the package now requires the group policy `StandardDatabaseAccess` to be attached to any role that uses this package.
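
A minimal before/after for existing callers (taken from the updated examples in this commit):

```r
# v1.x (bucket had to be passed explicitly):
# df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000", 'my-bucket')

# v2.0.0 (the output location is derived from your role automatically):
df <- dbtools::read_sql("SELECT * from crest_v1.flatfile limit 10000")
```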

## v1.0.0 - 2019-01-14

- Added function `read_sql` which reads an SQL query directly into an R dataframe. See R documentation (i.e. `?read_sql`)
2 changes: 1 addition & 1 deletion inst/extdata/boto_utils.py
@@ -2,4 +2,4 @@
 
 def delete_object(bucket, key):
     s3_client = boto3.client('s3')
-    s3_client.delete_object(Bucket=bucket, Key=key)
+    s3_client.delete_object(Bucket=bucket, Key=key)
12 changes: 10 additions & 2 deletions inst/extdata/get_athena_query_response.py
@@ -4,7 +4,7 @@
 import time
 import os
 
-def get_athena_query_response(sql_query, bucket, output_folder = "__athena_temp__/", return_athena_types = False, timeout = None) :
+def get_athena_query_response(sql_query, return_athena_types = False, timeout = None) :
 
     type_dictionary = {
         "char" : "character",
@@ -18,10 +18,18 @@ def get_athena_query_response(sql_query, bucket, output_folder = "__athena_temp__/", return_athena_types = False, timeout = None) :
         "double" : "double"
     }
 
-    out_path = os.path.join('s3://', bucket, output_folder)
+    # Get role specific path for athena output
+    bucket = "alpha-athena-query-dump"
+
+    sts_client = boto3.client('sts')
+    sts_resp = sts_client.get_caller_identity()
+
+    out_path = os.path.join('s3://', bucket, sts_resp['UserId'], "__athena_temp__/")
+
     if out_path[-1] != '/':
         out_path += '/'
 
+    # Run the athena query
     athena_client = boto3.client('athena', 'eu-west-1')
     response = athena_client.start_query_execution(
         QueryString=sql_query,
9 changes: 2 additions & 7 deletions man/get_athena_query_response.Rd

Some generated files are not rendered by default.

9 changes: 2 additions & 7 deletions man/read_sql.Rd

Some generated files are not rendered by default.
