Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix header detection for tables with sparse numerical data #77

Merged
merged 9 commits into from
Jun 28, 2022
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ Example Float ``DoubleColumn`` ``d``
Example boolean ``BoolColumn`` ``b``
=============== ================= ====================

In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the
``omero metadata populate`` command, missing values in floating-point columns will be detected as ``DoubleColumn`` and the
missing values will be stored as NaN.

However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below):

Expand Down
17 changes: 10 additions & 7 deletions src/omero_metadata/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,13 @@ def _configure(self, parser):
populate.add_argument("--localcfg", help=(
"Local configuration file or a JSON object string"))

populate.add_argument("--allow_nan", action="store_true", help=(
"Allow empty values to become Nan in Long or Double columns"))
populate.add_argument(
"--allow-nan", "--allow_nan", action="store_true", help=(
"Allow empty values to become Nan in Long or Double columns"))

populate.add_argument("--manual_header", action="store_true", help=(
"Disable automatic header detection during population"))
populate.add_argument(
"--manual-header", "--manual_header", action="store_true", help=(
"Disable automatic header detection during population"))

populateroi.add_argument(
"--measurement", type=int, default=None,
Expand Down Expand Up @@ -489,15 +491,15 @@ def testtables(self, args):
self.ctx.die(100, "Failed to initialize Table")

@staticmethod
def detect_headers(csv_path):
def detect_headers(csv_path, keep_default_na=True):
'''
Function to automatically detect headers from a CSV file. This function
loads the table into pandas to detect the column types and match headers
'''

conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi']
headers = []
table = pd.read_csv(csv_path)
table = pd.read_csv(csv_path, keep_default_na=keep_default_na)
col_types = table.dtypes.values.tolist()
cols = list(table.columns)

Expand Down Expand Up @@ -577,7 +579,8 @@ def populate(self, args):
if not args.manual_header and \
not first_row[0].str.contains('# header').bool():
omero_metadata.populate.log.info("Detecting header types")
header_type = MetadataControl.detect_headers(args.file)
header_type = MetadataControl.detect_headers(
args.file, keep_default_na=args.allow_nan)
if args.dry_run:
omero_metadata.populate.log.info(f"Header Types:{header_type}")
else:
Expand Down
27 changes: 16 additions & 11 deletions test/integration/metadata/test_populate.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,13 @@ def assert_columns(self, columns):
col_names = "Well,Well Type,Concentration,Well Name"
assert col_names == ",".join([c.name for c in columns])

def assert_values(self, row_values):
    # Well names may arrive lower-cased somewhere upstream, so accept
    # both forms when matching a row to its expected annotation value.
    if any(well in row_values for well in ("A1", "a1")):
        assert "Control" in row_values
    elif any(well in row_values for well in ("A2", "a2")):
        assert "Treatment" in row_values

def assert_child_annotations(self, oas):
for ma, wid, wr, wc in oas:
assert isinstance(ma, MapAnnotationI)
Expand Down Expand Up @@ -767,6 +774,14 @@ def assert_columns(self, columns):
def assert_row_count(self, rows):
    # One table row is expected per named ROI in the fixture.
    expected = len(self.roi_names)
    assert rows == expected

def assert_values(self, row_values):
    # roi1 rows carry concrete measurements; roi2 rows include a
    # missing value that surfaces as NaN alongside its measurement.
    if "roi1" in row_values:
        for expected in (0.5, 100):
            assert expected in row_values
    elif "roi2" in row_values:
        as_strings = [str(value) for value in row_values]
        assert 'nan' in as_strings
        assert 200 in row_values

def get_target(self):
if not self.image:
image = self.test.make_image()
Expand Down Expand Up @@ -1218,17 +1233,7 @@ def _assert_parsing_context_values(self, t, fixture):
row_values = [col.values[0] for col in t.read(
list(range(len(cols))), hit, hit+1).columns]
assert len(row_values) == fixture.count
# Unsure where the lower-casing is happening
if "A1" in row_values or "a1" in row_values:
assert "Control" in row_values
elif "A2" in row_values or "a2" in row_values:
assert "Treatment" in row_values
elif "roi1" in row_values:
assert 0.5 in row_values
assert 100 in row_values
elif "roi2" in row_values:
assert 'nan' in [str(value) for value in row_values]
assert 200 in row_values
fixture.assert_values(row_values)

def _test_bulk_to_map_annotation_context(self, fixture, batch_size):
# self._testPopulateMetadataPlate()
Expand Down
114 changes: 77 additions & 37 deletions test/unit/test_automatic_header.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,83 @@
StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn


def test_detect_headers():
    '''
    Test of the default automatic column type detection behaviour
    '''
    data = {
        'measurement 1': [11, 22, 33],
        'measurement 2': [0.1, 0.2, 0.3],
        'measurement 3': ['a', 'b', 'c'],
        'measurement 4': [True, True, False],
        'measurement 5': [11, 0.1, True],
    }
    # Add every naming variant for each object type,
    # e.g. plate_name/platename/plate name/plate_id/plateid/plate id
    for prefix in ('project', 'dataset', 'plate', 'well', 'image', 'roi'):
        data[f'{prefix}_name'] = ['a', 'b', 'c']
        data[f'{prefix} name'] = ['a', 'b', 'c']
        data[f'{prefix}name'] = ['a', 'b', 'c']
        data[f'{prefix}_id'] = [1, 2, 3]
        data[f'{prefix} id'] = [1, 2, 3]
        data[f'{prefix}id'] = [1, 2, 3]
        data[prefix] = [1, 2, 3]

    # Round-trip through a CSV file and check the detected column types.
    csv_file = tempfile.NamedTemporaryFile()
    pd.DataFrame(data=data).to_csv(csv_file.name, index=False)
    header = MetadataControl.detect_headers(csv_file.name)
    expected_header = [
        'l', 'd', 's', 'b', 's',
        's', 's', 's', 'l', 'l', 'l', 'l',
        's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset',
        'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate',
        'well', 'well', 'well', 'l', 'l', 'l', 'well',
        's', 's', 's', 'image', 'image', 'image', 'image',
        's', 's', 's', 'roi', 'roi', 'roi', 'roi'
    ]
    assert header == expected_header
class TestDetectHeaders:
"""Test the MetadataControl.detect_headers API"""
def assert_detect_headers(self, **kwargs):
df = pd.DataFrame(data=self.d)
tmp = tempfile.NamedTemporaryFile()
df.to_csv(tmp.name, index=False)
header = MetadataControl.detect_headers(tmp.name, **kwargs)
assert header == self.expected_header

def create_objects_dictionary(self):
# Create a dictionary with every combination of headers
# eg plate_name/platename/plate name/plate_id/plateid/plate id
self.d = {}
prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ]
for prefix in prefix_list:
self.d[f'{prefix}_name'] = ['a', 'b', 'c']
self.d[f'{prefix} name'] = ['a', 'b', 'c']
self.d[f'{prefix}name'] = ['a', 'b', 'c']
self.d[f'{prefix}_id'] = [1, 2, 3]
self.d[f'{prefix} id'] = [1, 2, 3]
self.d[f'{prefix}id'] = [1, 2, 3]
self.d[f'{prefix}'] = [1, 2, 3]
self.expected_header = [
's', 's', 's', 'l', 'l', 'l', 'l',
's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset',
'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate',
'well', 'well', 'well', 'l', 'l', 'l', 'well',
's', 's', 's', 'image', 'image', 'image', 'image',
's', 's', 's', 'roi', 'roi', 'roi', 'roi'
]

def test_objects_columns(self):
self.create_objects_dictionary()
self.assert_detect_headers()

def test_dense_columns(self):
'''
Test of the default automatic column type detection behaviour
'''
self.create_objects_dictionary()
self.d.update({
'measurement 1': [11, 22, 33],
'measurement 2': [0.1, 0.2, 0.3],
'measurement 3': ['a', 'b', 'c'],
'measurement 4': [True, True, False],
'measurement 5': [11, 0.1, True]
})
self.expected_header.extend(['l', 'd', 's', 'b', 's'])
self.assert_detect_headers()

def test_sparse_default_na(self):
'''
Test default handling of missing values
'''
self.create_objects_dictionary()
self.d.update({
'measurement 1': [11, None, 33],
'measurement 2': [0.1, 0.2, None],
'measurement 3': ['a', 'b', None],
'measurement 4': [True, None, False],
})
self.expected_header.extend(['d', 'd', 's', 's'])
self.assert_detect_headers(keep_default_na=True)

def test_sparse_no_default_na(self):
'''
Test handling of missing values as string columns
'''
self.create_objects_dictionary()
self.d.update({
'measurement 1': [11, None, 33],
'measurement 2': [0.1, 0.2, None],
'measurement 3': ['a', 'b', None],
'measurement 4': [True, None, False],
})
self.expected_header.extend(['s', 's', 's', 's'])
self.assert_detect_headers(keep_default_na=False)


class TestColumnTypes:
Expand Down