diff --git a/docs/S3-batchsource.md b/docs/S3-batchsource.md index 7db7469..53297fe 100644 --- a/docs/S3-batchsource.md +++ b/docs/S3-batchsource.md @@ -19,14 +19,27 @@ the credentials does not need to be provided. **Path:** Path to read from. For example, s3a://<bucket>/path/to/input **Format:** Format of the data to read. -The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', or 'tsv'. +The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', 'tsv', or 'xls'. If the format is 'blob', every input file will be read into a separate record. The 'blob' format also requires a schema that contains a field named 'body' of type 'bytes'. If the format is 'text', the schema must contain a field named 'body' of type 'string'. +**Sample Size:** The maximum number of rows that will be investigated for automatic data type detection. +The default value is 1000. + +**Override:** A list of columns with the corresponding data types for which the automatic data type detection is +skipped. + +**Terminate If Empty Row:** Specify whether to stop reading after encountering the first empty row. Defaults to false. + +**Select Sheet Using:** Select the sheet by name or number. Default is 'Sheet Number'. + +**Sheet Value:** The name/number of the sheet to read from. If not specified, the first sheet will be read. +Sheet numbers are 0-based, i.e. the first sheet is 0. + **Delimiter:** Delimiter to use when the format is 'delimited'. This will be ignored for other formats. -**Use First Row as Header:** Whether to use first row as header. Supported formats are 'text', 'csv', 'tsv', 'delimited'. +**Use First Row as Header:** Whether to use first row as header. Supported formats are 'text', 'csv', 'tsv', 'delimited', 'xls'. **Authentication Method:** Authentication method to access S3. The default value is Access Credentials. IAM can only be used if the plugin is run in an AWS environment, such as on EMR. 
diff --git a/src/main/java/io/cdap/plugin/aws/s3/source/S3BatchSource.java b/src/main/java/io/cdap/plugin/aws/s3/source/S3BatchSource.java index 8e14f26..0b7e29e 100644 --- a/src/main/java/io/cdap/plugin/aws/s3/source/S3BatchSource.java +++ b/src/main/java/io/cdap/plugin/aws/s3/source/S3BatchSource.java @@ -147,6 +147,9 @@ public static class S3BatchConfig extends AbstractFileSourceConfig { public static final String NAME_PATH = "path"; private static final String NAME_FILE_SYSTEM_PROPERTIES = "fileSystemProperties"; private static final String NAME_DELIMITER = "delimiter"; + private static final String NAME_SHEET = "sheet"; + private static final String NAME_SHEET_VALUE = "sheetValue"; + private static final String NAME_TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow"; private static final Gson GSON = new Gson(); private static final Type MAP_STRING_STRING_TYPE = new TypeToken<Map<String, String>>() { }.getType(); @@ -180,6 +183,36 @@ public static class S3BatchConfig extends AbstractFileSourceConfig { "fail, if credentials are incorrect. The default value is false.") private Boolean verifyCredentials; + @Macro + @Nullable + @Description("The maximum number of rows that will get investigated for automatic data type detection.") + private Long sampleSize; + + @Macro + @Nullable + @Description("A list of columns with the corresponding data types for whom the automatic data type detection gets" + + " skipped.") + private String override; + + @Name(NAME_SHEET) + @Macro + @Nullable + @Description("Select the sheet by name or number. Default is 'Sheet Number'.") + private String sheet; + + @Name(NAME_SHEET_VALUE) + @Macro + @Nullable + @Description("The name/number of the sheet to read from. If not specified, the first sheet will be read." + + " Sheet Number are 0 based, ie first sheet is 0.") + private String sheetValue; + + @Name(NAME_TERMINATE_IF_EMPTY_ROW) + @Macro + @Nullable + @Description("Specify whether to stop reading after encountering the first empty row. 
Defaults to false.") + private String terminateIfEmptyRow; + private S3BatchConfig(String path, @Nullable S3ConnectorConfig connection, String fileSystemProperties, Boolean verifyCredentials) { super(); diff --git a/widgets/S3-batchsource.json b/widgets/S3-batchsource.json index 0156239..82f5104 100644 --- a/widgets/S3-batchsource.json +++ b/widgets/S3-batchsource.json @@ -120,7 +120,8 @@ "json", "parquet", "text", - "tsv" + "tsv", + "xls" ], "default": "text" }, @@ -135,6 +136,36 @@ "plugin-method": "getSchema" } }, + { + "widget-type": "number", + "label": "Sample Size", + "name": "sampleSize", + "widget-attributes": { + "default": "1000", + "minimum": "1" + } + }, + { + "widget-type": "keyvalue-dropdown", + "label": "Override", + "name": "override", + "widget-attributes": { + "key-placeholder": "Field Name", + "value-placeholder": "Data Type", + "dropdownOptions": [ + "boolean", + "bytes", + "double", + "float", + "int", + "long", + "string", + "date", + "time", + "timestamp" + ] + } + }, { "widget-type": "textbox", "label": "Delimiter", @@ -174,6 +205,42 @@ "label": "False" } } + }, + { + "widget-type": "toggle", + "label": "Terminate If Empty Row", + "name": "terminateIfEmptyRow", + "widget-attributes": { + "default": "false", + "on": { + "value": "true", + "label": "True" + }, + "off": { + "value": "false", + "label": "False" + } + } + }, + { + "widget-type": "select", + "label": "Select Sheet Using", + "name": "sheet", + "widget-attributes": { + "values": [ + "Sheet Name", + "Sheet Number" + ], + "default": "Sheet Number" + } + }, + { + "widget-type": "textbox", + "label": "Sheet Value", + "name": "sheetValue", + "widget-attributes": { + "default": "0" + } } ] }, @@ -674,13 +741,46 @@ { "name": "skipHeader", "condition": { - "expression": "format == 'delimited' || format == 'csv' || format == 'tsv'" + "expression": "format == 'delimited' || format == 'csv' || format == 'tsv' || format == 'xls'" }, "show": [ { "name": "skipHeader" } ] + }, + { + "name": 
"sheet", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "sheet" + } + ] + }, + { + "name": "sheetValue", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "sheetValue" + } + ] + }, + { + "name": "terminateIfEmptyRow", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "terminateIfEmptyRow" + } + ] } ] }