Skip to content

Commit

Permalink
Merge branch 'numeric-stats-and-unique-stats'
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxim Moinat committed Sep 23, 2019
2 parents 8b305a4 + ec52d29 commit 3a68fb7
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.ohdsi.utilities.ScanFieldName;
import org.ohdsi.utilities.files.QuickAndDirtyXlsxReader;
import org.ohdsi.utilities.files.QuickAndDirtyXlsxReader.Sheet;

Expand Down Expand Up @@ -130,43 +131,34 @@ public static Database generateModelFromCSV(InputStream stream, String dbName) {

public static Database generateModelFromScanReport(String filename) {
Database database = new Database();
Map<String, Table> nameToTable = new HashMap<String, Table>();
Map<String, Table> nameToTable = new HashMap<>();
QuickAndDirtyXlsxReader workbook = new QuickAndDirtyXlsxReader(filename);
Sheet sheet = workbook.get(0);
Iterator<org.ohdsi.utilities.files.QuickAndDirtyXlsxReader.Row> iterator = sheet.iterator();
Map<String, Integer> fieldName2ColumnIndex = new HashMap<String, Integer>();
for (String header : iterator.next())
fieldName2ColumnIndex.put(header, fieldName2ColumnIndex.size());

iterator.next(); // Skip header
while (iterator.hasNext()) {
org.ohdsi.utilities.files.QuickAndDirtyXlsxReader.Row row = iterator.next();
String tableName = row.get(fieldName2ColumnIndex.get("Table"));
String tableName = row.getStringByHeaderName(ScanFieldName.TABLE);
if (tableName.length() != 0) {
Table table = nameToTable.get(tableName);
if (table == null) {
table = new Table();
table.setName(tableName.toLowerCase());
table.setRowCount((int) Double.parseDouble(row.get(fieldName2ColumnIndex.get("N rows"))));
table.setRowsCheckedCount((int) Double.parseDouble(row.get(fieldName2ColumnIndex.get("N rows checked"))));
Integer nRows = row.getIntByHeaderName(ScanFieldName.N_ROWS);
Integer nRowChecked = row.getIntByHeaderName(ScanFieldName.N_ROWS_CHECKED);
table.setRowCount((nRows == null || nRows == -1) ? nRowChecked : nRows);
nameToTable.put(tableName, table);
database.tables.add(table);
}
String fieldName = row.get(fieldName2ColumnIndex.get("Field"));
String fieldName = row.getStringByHeaderName(ScanFieldName.FIELD);
Field field = new Field(fieldName.toLowerCase(), table);
Integer index;
// Someone may have manually deleted data, so can't assume this
// is always there:
index = fieldName2ColumnIndex.get("Fraction empty");
if (index != null && index < row.size())
field.setNullable(!row.get(index).equals("0"));

index = fieldName2ColumnIndex.get("Type");
if (index != null && index < row.size())
field.setType(row.get(index));

index = fieldName2ColumnIndex.get("Max length");
if (index != null && index >= 0 && index < row.size())
field.setMaxLength((int) (Double.parseDouble(row.get(index))));

String fractionEmpty = row.getByHeaderName(ScanFieldName.FRACTION_EMPTY);
field.setNullable(fractionEmpty == null || !fractionEmpty.equals("0"));
field.setType(row.getByHeaderName(ScanFieldName.TYPE));
field.setMaxLength(row.getIntByHeaderName(ScanFieldName.MAX_LENGTH));

field.setValueCounts(getValueCounts(workbook, tableName, fieldName));
table.getFields().add(field);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class Field implements MappableItem {
private boolean isNullable;
private String type;
private String description = "";
private int maxLength;
private Integer maxLength;
private boolean isStem;

public Field(String name, Table table) {
Expand Down Expand Up @@ -111,11 +111,11 @@ public String getComment() {
return comment;
}

public int getMaxLength() {
/**
 * Returns the maximum length stored for this field.
 *
 * @return the stored maximum length; may be {@code null}, since the backing
 *         field is a boxed {@code Integer} that has no initial value
 */
public Integer getMaxLength() {
return maxLength;
}

public void setMaxLength(int maxLength) {
/**
 * Stores the maximum length for this field.
 *
 * @param maxLength the maximum length to store; {@code null} is accepted and
 *                  stored as-is
 */
public void setMaxLength(Integer maxLength) {
this.maxLength = maxLength;
}

Expand Down
38 changes: 38 additions & 0 deletions rabbit-core/src/main/java/org/ohdsi/utilities/ScanFieldName.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.ohdsi.utilities;

/**
 * Column header names used to look up cell values by name in a scan report
 * sheet. Each constant holds the exact header text as it is expected to
 * appear in the first row of the sheet.
 */
public interface ScanFieldName {
// Identification of the scanned table and field
String TABLE = "Table";
String FIELD = "Field";
String TYPE = "Type";
String MAX_LENGTH = "Max length";
// Row counts per table
String N_ROWS = "N rows";
String N_ROWS_CHECKED = "N rows checked";
String FRACTION_EMPTY = "Fraction empty";
// NOTE(review): presumably the numeric summary statistics columns of the
// scan report -- confirm against the code that writes the report
String AVERAGE = "Average";
String STDEV = "Standard Deviation";
String MIN = "Min";
String Q1 = "25%";
String Q2 = "Median";
String Q3 = "75%";
String MAX = "Max";
}


//public enum ScanField {
// TABLE = "Table";
// FIELD = "Field";
// TYPE = "Type";
// MAX_LENGTH = "Max length";
// N_ROWS = "N rows";
// N_ROWS_CHECKED = "N rows checked";
// FRACTION_EMPTY = "Fraction empty";
// AVERAGE = "Average";
// STDEV = "Standard Deviation";
// MIN = "Min";
// Q1 = "25%";
// Q2 = "Median";
// Q3 = "75%";
// MAX = "Max";
//
// private final String fieldName;
//}
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ private void processSheet(String filename, ZipInputStream inputStream) throws IO
fullSheet.append(line);

for (String rowLine : StringUtilities.multiFindBetween(fullSheet.toString(), "<row", "</row>")) {
Row row = new Row();
Row row = new Row(sheet);
row.addAll(findCellValues(rowLine));
if (row.size() != 0)
sheet.add(row);
Expand Down Expand Up @@ -529,6 +529,15 @@ public class Sheet extends ArrayList<Row> {
private static final long serialVersionUID = -8597151681911998153L;
private String name;
private int order;
private Map<String, Integer> fieldName2ColumnIndex = new HashMap<>();

/**
 * Appends a row to this sheet. A row added while the sheet is still empty is
 * assumed to be the header row: its cells are indexed by name before the row
 * is stored, so that later rows can be queried by header name.
 *
 * @param row the row to append
 * @return the result of {@code super.add(row)}
 */
@Override
public boolean add(Row row) {
    // Assume first row is the header, preprocess it
    if (isEmpty()) {
        createFieldNameIndex(row);
    }
    return super.add(row);
}

public String getName() {
return name;
Expand All @@ -538,10 +547,62 @@ public void setName(String name) {
this.name = name;
}

/**
 * Registers the zero-based column position of every cell of the given header
 * row under that cell's text. When the same header text occurs more than
 * once, the position of the last occurrence is kept.
 *
 * @param row the header row
 */
private void createFieldNameIndex(List<String> row) {
    for (int column = 0; column < row.size(); column++) {
        fieldName2ColumnIndex.put(row.get(column), column);
    }
}

/**
 * Looks up the column position registered for a header name.
 *
 * @param fieldName the header text to look up
 * @return the column index stored for {@code fieldName}, or {@code null} if
 *         no such header has been registered
 */
private Integer getFieldIndex(String fieldName) {
return fieldName2ColumnIndex.get(fieldName);
}
}

/**
 * A single spreadsheet row, stored as the list of its cell values. Every row
 * holds a reference to the sheet it was created for, which lets cell values be
 * retrieved by the name of their column header instead of by position.
 */
public class Row extends ArrayList<String> {
    private static final long serialVersionUID = -6391290892840364766L;
    private final Sheet sheet;

    /**
     * Creates an empty row bound to the given sheet.
     *
     * @param sheet the sheet whose header index is used to resolve header names
     */
    public Row(Sheet sheet) {
        this.sheet = sheet;
    }

    /**
     * Lookup index of the fieldName in first row of the sheet that this row belongs to.
     * Use index to get value of this row.
     *
     * @param fieldName name of the field, as it appears in the header
     * @return the cell value, or null if fieldName is not in the header or this
     *         row has no cell at that column
     */
    public String getByHeaderName(String fieldName) {
        return getStringByHeaderName(fieldName);
    }

    /**
     * Returns the raw cell value found under the given header name.
     *
     * @param fieldName name of the field, as it appears in the header
     * @return the cell value, or null if fieldName is not in the header or this
     *         row has no cell at that column
     */
    public String getStringByHeaderName(String fieldName) {
        // Someone may have manually deleted data, so can't assume fieldName
        // is always there:
        Integer index = sheet.getFieldIndex(fieldName);
        if (index != null && index < this.size()) {
            return this.get(index);
        }
        return null;
    }

    /**
     * Returns the cell value found under the given header name, parsed as a
     * double.
     *
     * @param fieldName name of the field, as it appears in the header
     * @return the parsed value, or null if the cell is missing or blank
     * @throws NumberFormatException if the cell holds non-blank text that is
     *                               not a parsable number
     */
    public Double getDoubleByHeaderName(String fieldName) {
        String value = getStringByHeaderName(fieldName);
        // An emptied cell is treated the same as a missing one; passing it on
        // to Double.parseDouble would throw a NumberFormatException.
        if (value == null || value.trim().isEmpty()) {
            return null;
        }
        return Double.parseDouble(value);
    }

    /**
     * Returns the cell value found under the given header name as an integer.
     * The cell is parsed as a double first and then narrowed to int, so any
     * fractional part is dropped.
     *
     * @param fieldName name of the field, as it appears in the header
     * @return the integer value, or null if the cell is missing or blank
     * @throws NumberFormatException if the cell holds non-blank text that is
     *                               not a parsable number
     */
    public Integer getIntByHeaderName(String fieldName) {
        Double value = getDoubleByHeaderName(fieldName);
        if (value == null) {
            return null;
        }
        return value.intValue();
    }
}
}
Loading

0 comments on commit 3a68fb7

Please sign in to comment.