From 12512a3192bcc515c2da956a6a6704849cdadeee Mon Sep 17 00:00:00 2001 From: Balazs Kegl Date: Mon, 22 Jul 2019 13:03:28 +0200 Subject: [PATCH] [WIP] hyperopt (#177) * hyperopt interface test * Create hyperopt.py module * moving hyperopt * basic hyperparameter class * Fix flake errors * Move hyperopt.py * rename * random engine, file interface working, no stats yet * random engine hooked up, experiment outputs a summary table and the submission with the best hypers * Tests, documentation, fixing the time in the score table, changing --n-iter meaning to count number of hyper combinations times cv folds * adding init to test dir * division by zero * file rename, cleanup * idmax --- rampwf/hyperopt/__init__.py | 7 + rampwf/hyperopt/cli/__init__.py | 0 rampwf/hyperopt/cli/hyperopt.py | 48 ++ rampwf/hyperopt/hyperopt.py | 524 ++++++++++++++++++ rampwf/hyperopt/tests/__init__.py | 0 .../header_in_files/titanic/README.md | 23 + .../header_in_files/titanic/data/test.csv | 90 +++ .../header_in_files/titanic/data/train.csv | 357 ++++++++++++ .../header_in_files/titanic/problem.py | 42 ++ .../header_in_files/titanic/requirements.txt | 5 + .../submissions/starting_kit/classifier.py | 27 + .../starting_kit/feature_extractor.py | 50 ++ rampwf/hyperopt/tests/test_hyperparameter.py | 44 ++ rampwf/utils/__init__.py | 7 +- rampwf/utils/scoring.py | 8 +- rampwf/utils/submission.py | 17 +- rampwf/utils/testing.py | 8 +- setup.py | 1 + 18 files changed, 1241 insertions(+), 17 deletions(-) create mode 100644 rampwf/hyperopt/__init__.py create mode 100644 rampwf/hyperopt/cli/__init__.py create mode 100644 rampwf/hyperopt/cli/hyperopt.py create mode 100644 rampwf/hyperopt/hyperopt.py create mode 100644 rampwf/hyperopt/tests/__init__.py create mode 100644 rampwf/hyperopt/tests/interfaces/header_in_files/titanic/README.md create mode 100644 rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/test.csv create mode 100644 
"""Command line interface of the hyperparameter optimization."""
import click

from ..hyperopt import run_hyperopt

CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('--submission', default='starting_kit', show_default=True,
              help='The kit to hyperopt. It should be located in the '
              '"submissions" folder of the starting kit.')
@click.option('--ramp-kit-dir', default='.', show_default=True,
              help='Root directory of the ramp-kit to hyperopt.')
@click.option('--ramp-data-dir', default='.', show_default=True,
              help='Directory containing the data. This directory should '
              'contain a "data" folder.')
@click.option('--ramp-submission-dir', default='submissions',
              show_default=True,
              help='Directory where the submissions are stored. It is the '
              'directory (typically called "submissions" in the ramp-kit) '
              'that contains the individual submission subdirectories.')
@click.option('--engine', default='random', show_default=True,
              help='The name of the hyperopt engine, e.g., "random".')
@click.option('--n-iter', default=10, show_default=True,
              help='The number of hyperopt iterations, inputted to the '
              'engine. The granularity is per cv fold, so if you want to '
              'fully test 7 hyperparameter combinations for example with the '
              'random engine and you have 8 CV folds, you should enter '
              '--n-iter 56')
# A plain ``is_flag=True, default=True`` option can never be turned off from
# the command line; the on/off pair keeps ``--save-best`` working and adds
# ``--no-save-best``.
@click.option('--save-best/--no-save-best', default=True,
              show_default=True,
              help='Whether to write the best hyperparameters into the '
              '"SUBMISSION_hyperopt" directory created in the "submissions" '
              'dir.')
def main(submission, ramp_kit_dir, ramp_data_dir, ramp_submission_dir,
         engine, n_iter, save_best):
    """Hyperopt a submission."""
    run_hyperopt(
        ramp_kit_dir=ramp_kit_dir, ramp_data_dir=ramp_data_dir,
        ramp_submission_dir=ramp_submission_dir, submission=submission,
        engine_name=engine, n_iter=n_iter, save_best=save_best)


def start():
    """Console-script entry point: run the click command."""
    main()


if __name__ == '__main__':
    start()
class Hyperparameter(object):
    """Discrete grid hyperparameter.

    Represented by a list of values, a default value, the name of the
    hyperparameter (specified by the user in the workflow element), the
    name of the workflow element in which the hyperparameter appears, and an
    optional prior probability vector.

    Attributes:
        name : string
            The name of the hyperparameter variable, used in user interface,
            both for specifying the grid of values and getting the report on
            an experiment. Initialized to '' then set in set_names, to the
            name the user chose for the variable in the workflow element.
        workflow_element_name : string
            The name of the workflow element in which the hyperparameter is
            used. Initialized to '' then set in set_names.
        dtype : string
            The dtype of the hyperparameter.
        default_index: int
            The index in values of the current value of the hyperparameter.
        values: numpy array of any dtype
            The list of hyperparameter values.
        prior: numpy array of float
            A list of reals that the hyperopt can use as a prior probability
            over values. Positivity and summing to one are not checked,
            hyperparameter optimizers should do that when using the list
    """

    def __init__(self, dtype, default=None, values=None, prior=None):
        """Build the grid of values and select the default.

        Parameters:
            dtype : string or numpy dtype
                The dtype used to store the values.
            default : any dtype, optional
                The default value. If None, the first element of values.
            values : list, optional
                The grid of values. If None, the grid is [default].
            prior : list of float, optional
                One weight per value. If None, a uniform prior is used.

        Raises:
            ValueError : if neither default nor values is given, if values
                is empty, if default is not among values, or if prior and
                values have different lengths.
        """
        self.name = ''
        self.workflow_element_name = ''
        self.dtype = dtype
        if default is None and values is None:
            raise ValueError('Either default or values must be defined.')
        if values is None:
            self.values = np.array([default], dtype=self.dtype)
        else:
            if len(values) < 1:
                raise ValueError(
                    'Values needs to contain at least one element.')
            self.values = np.array(values, dtype=self.dtype)
        if default is None:
            self.default_index = 0
        else:
            if default not in self.values:
                raise ValueError('Default must be among values.')
            self.set_default(default)

        if prior is None:
            self.prior = np.full(self.n_values, 1. / self.n_values)
        else:
            # Compare against the stored grid: the values argument may be
            # None when only a default was given.
            if len(prior) != self.n_values:
                raise ValueError(
                    'len(values) == {} != {} == len(prior)'.format(
                        self.n_values, len(prior)))
            self.prior = np.array(prior, dtype=float)

    @property
    def n_values(self):
        """The number of hyperparameter values.

        Return:
            n_values : int
                The number of hyperparameter values len(values)
        """
        return len(self.values)

    @property
    def default(self):
        """The current value of the hyperparameter.

        Return:
            default : any dtype
                The current value of the hyperparameter values[default_index].
        """
        return self.values[self.default_index]

    @property
    def _dtype_name(self):
        """The dtype as a string that can be written into a python file."""
        if isinstance(self.dtype, str):
            return self.dtype
        # e.g. the builtin ``int`` read back from a generated file
        return np.dtype(self.dtype).name

    @property
    def _is_text(self):
        """Whether the values must be quoted when written to a python file."""
        return self.values.dtype.kind in ('O', 'U')

    def _value_repr(self, value):
        """The python source representation of a single value."""
        if self._is_text:
            # repr takes care of quoting and escaping
            return repr(str(value))
        return str(value)

    @property
    def default_repr(self):
        """The string representation of the default value.

        It can be used to output the default value into a python file. For
        object and string types it is the quoted value, otherwise it's the
        string representation of the default value.

        Return:
            default_repr : str
                The string representation of the default value.
        """
        return self._value_repr(self.default)

    @property
    def values_repr(self):
        """The string representation of the list of values.

        It can be used to output the list of values into a python file. For
        object and string types the values are quoted, otherwise it's the
        list of string representations of the values in brackets.

        Return:
            values_repr : str
                The string representation of the list of values.
        """
        return '[{}]'.format(
            ', '.join(self._value_repr(v) for v in self.values))

    @property
    def python_repr(self):
        """The string representation of the hyperparameter.

        It can be used to output the hyperparameter definition into a python
        file:
        NAME = Hyperparameter(
            dtype='DTYPE', default=DEFAULT, values=[VALUES])

        The dtype is written as a quoted string so that a generated file
        can be parsed and written again without changing.

        Return:
            python_repr : str
                The string representation of the hyperparameter.
        """
        return (
            '{} = Hyperparameter(\n'
            '    dtype=\'{}\', default={}, values={})\n'
        ).format(
            self.name, self._dtype_name, self.default_repr,
            self.values_repr)

    def set_names(self, name, workflow_element_name):
        """Set the name and workflow element name.

        Used when a hyperparameter object is loaded from a workflow element.

        Parameters:
            name : str
                The name of the hyperparameter, declared by the user in the
                workflow element.
            workflow_element_name : str
                The name of the workflow element in which the hyperparameter
                is defined.

        """
        self.name = name
        self.workflow_element_name = workflow_element_name

    def get_index(self, value):
        """Get the index of a value.

        Parameters:
            value : any dtype
                The value to look for.
        Return:
            index : int
                The position of value in values.
        Raises:
            ValueError : if value is not among values.
        """
        return list(self.values).index(value)

    def set_default(self, default):
        """Set the default value.

        Parameters:
            default : any dtype
                The new default value.
        """
        self.default_index = self.get_index(default)

    def __int__(self):
        """Cast the default value into an integer.

        It can be used in the workflow element for an integer hyperparameter.

        Return:
            int(default) : int
                The integer representation of the default value.
        """
        return int(self.default)

    def __float__(self):
        """Cast the default value into a float.

        It can be used in the workflow element for a float hyperparameter.

        Return:
            float(default) : float
                The float representation of the default value.
        """
        return float(self.default)

    def __str__(self):
        """Cast the default value into a string.

        It can be used in the workflow element for a string hyperparameter.

        Return:
            str(default) : str
                The string representation of the default value.
        """
        return str(self.default)
def parse_hyperparameters(module_path, workflow_element_name):
    """Parse hyperparameters in a workflow element.

    Load the module, take all Hyperparameter objects, and set the name of each
    to the name of the hyperparameter the user chose and the workflow element
    name of each to workflow_element_name.

    Parameters:
        module_path : str
            The path to the submission directory.
        workflow_element_name : string
            The name of the workflow element.
    Return:
        hyperparameters : list of instances of Hyperparameter
    """
    hyperparameters = []
    workflow_element = import_file(module_path, workflow_element_name)
    for object_name in dir(workflow_element):
        o = getattr(workflow_element, object_name)
        # isinstance also accepts subclasses of Hyperparameter
        if isinstance(o, Hyperparameter):
            o.set_names(object_name, workflow_element_name)
            hyperparameters.append(o)
    return hyperparameters


def parse_all_hyperparameters(module_path, workflow):
    """Parse hyperparameters in a submission.

    Load all the modules, take all Hyperparameter objects, and set the name
    of each to the name of the hyperparameter the user chose and the workflow
    element name of each to the corresponding workflow element name.

    Parameters:
        module_path : str
            The path to the submission directory.
        workflow : workflow object
            Only its element_names attribute is read: one module is parsed
            per workflow element name.
    Return:
        hyperparameters : list of instances of Hyperparameter
    """
    hyperparameters = []
    for wen in workflow.element_names:
        hyperparameters += parse_hyperparameters(module_path, wen)
    return hyperparameters


def write_hyperparameters(submission_dir, output_submission_dir,
                          hypers_per_workflow_element):
    """Write hyperparameters in a submission.

    Read workflow elements from submission_dir, replace the hyperparameter
    section with the hyperparameters in the hypers_per_workflow_element
    dictionary (with new hyperparameter values set by, e.g, a hyperopt
    engine), then write the new workflow elements into output_submission_dir
    (which can be a temporary directory or submission_dir itself when the
    function is called to replace the hyperparameters in the input submission
    with the best hyperparameters.)

    Parameters:
        submission_dir : str
            The path to the submission directory from which the submission is
            read.
        output_submission_dir : str
            The path to the output submission directory into which the
            submission with the new hyperparameter values is written.
        hypers_per_workflow_element : dictionary
            Each key is a workflow element name and each value is a list of
            Hyperparameter instances, representing the hyperparameters in
            the workflow element.
    """
    for wen, hs in hypers_per_workflow_element.items():
        hyper_section = '{}\n{}{}'.format(
            HYPERPARAMS_SECTION_START,
            ''.join(h.python_repr for h in hs),
            HYPERPARAMS_SECTION_END)
        f_name = os.path.join(submission_dir, wen + '.py')
        with open(f_name) as f:
            content = f.read()
        # A callable replacement is inserted verbatim; a replacement string
        # would have its backslashes interpreted as regex escapes, which
        # breaks on hyperparameter values containing a backslash.
        content = HYPERPARAMS_REPL_REGEX.sub(
            lambda match: hyper_section, content)
        output_f_name = os.path.join(output_submission_dir, wen + '.py')
        with open(output_f_name, 'w') as f:
            f.write(content)
class RandomEngine(object):
    """Random search hyperopt engine.

    Attributes:
        hyperparameters: a list of Hyperparameters
    """

    def __init__(self, hyperparameters):
        self.hyperparameters = hyperparameters

    def next_hyperparameter_indices(self, df_scores, n_folds):
        """Return the next hyperparameter indices to try.

        Parameters:
            df_scores : pandas DataFrame
                It represents the results of the experiments that have been
                run so far. It needs a 'fold_i' column and one column per
                hyperparameter name.
            n_folds : int
                The number of cv folds of a full experiment.
        Return:
            fold_i : int
                The index of the cv fold to run next.
            next_value_indices : list of int
                The indices in corresponding to the values to try in
                hyperparameters.
        """
        # First finish incomplete cv's.
        hyperparameter_names = [h.name for h in self.hyperparameters]
        df_n_folds = df_scores.groupby(hyperparameter_names).count()
        incomplete_folds = df_n_folds[df_n_folds['fold_i'] % n_folds > 0]
        if len(incomplete_folds) > 0:
            incomplete_folds = incomplete_folds.reset_index()
            # Read column by column: taking the whole first row would upcast
            # mixed numeric columns to float, turning fold_i into e.g. 1.0,
            # which cannot be used to index the list of folds.
            next_value_indices = [
                h.get_index(incomplete_folds[h.name].iloc[0])
                for h in self.hyperparameters]
            # The number of folds already run is the next fold index.
            fold_i = int(incomplete_folds['fold_i'].iloc[0]) % n_folds
        # Otherwise select hyperparameter values from those that haven't
        # been selected yet, using also prior
        else:
            fold_i = 0
            next_value_indices = []
            df_scores_local = df_scores.copy()
            for h in self.hyperparameters:
                # unnormalized but positive prior
                prior = np.clip(
                    np.asarray(h.prior, dtype=float), 1e-15, None)
                # How many times each hyperparameter value was tried, given
                # the selected values next_value_indices so far
                frequencies = np.zeros(len(prior))
                if len(df_scores_local) > 0:
                    for value_i, v in enumerate(h.values):
                        frequencies[value_i] = len(
                            df_scores_local[df_scores_local[h.name] == v])
                # How many times each hyperparameter value was not tried,
                # given the selected values next_value_indices so far, in
                # this round of full grid search
                frequencies = frequencies.max() - frequencies
                prior *= frequencies
                if prior.sum() <= 0:
                    # every value was tried equally often: start a new round
                    prior = np.ones(len(prior))
                prior /= prior.sum()
                selected_index = np.random.choice(
                    range(len(h.values)), p=prior)
                # keep only experiments that used the selected values so far
                df_scores_local = df_scores_local[(
                    df_scores_local[h.name] == h.values[selected_index])]
                next_value_indices.append(selected_index)
        return fold_i, next_value_indices
class HyperparameterOptimization(object):
    """A hyperparameter optimization.

    Attributes:
        hyperparameters: a list of Hyperparameters
        engine: a hyperopt engine
        problem: the problem module read from the ramp kit directory
        submission_dir: the directory where the submission to be optimized
            is found
        df_scores_: pandas DataFrame with one row per experiment
        df_summary_: pandas DataFrame with one row per hyperparameter
            combination, None until experiments have been summarized
    """

    def __init__(self, hyperparameters, engine, ramp_kit_dir, submission_dir,
                 ramp_data_dir=None):
        """Read the problem and the training data and set up the score table.

        Parameters:
            hyperparameters : list of instances of Hyperparameter
            engine : a hyperopt engine
            ramp_kit_dir : str
                The directory where the ramp kit is found.
            submission_dir : str
                The directory of the submission to be optimized.
            ramp_data_dir : str, optional
                The directory passed to get_train_data. If None,
                ramp_kit_dir is used.
        """
        self.hyperparameters = hyperparameters
        self.engine = engine
        self.problem = assert_read_problem(ramp_kit_dir)
        if ramp_data_dir is None:
            ramp_data_dir = ramp_kit_dir
        self.X_train, self.y_train = self.problem.get_train_data(
            path=ramp_data_dir)
        self.cv = list(self.problem.get_cv(self.X_train, self.y_train))
        self.submission_dir = submission_dir
        self.hyperparameter_names = [h.name for h in hyperparameters]
        self.score_names = [s.name for s in self.problem.score_types]
        self.df_summary_ = None

        # Set up hypers_per_workflow_element dictionary: keys are
        # workflow element names, values are lists of hypers belonging
        # to the workflow element
        self.hypers_per_workflow_element = {
            wen: [] for wen in self.problem.workflow.element_names}
        for h in self.hyperparameters:
            self.hypers_per_workflow_element[h.workflow_element_name].append(h)

        # Set up df_scores_ which will contain one row per experiment
        scores_columns = ['fold_i']
        scores_columns += self.hyperparameter_names
        scores_columns += ['train_' + name for name in self.score_names]
        scores_columns += ['valid_' + name for name in self.score_names]
        scores_columns += ['train_time', 'valid_time', 'n_train', 'n_valid']
        dtypes = ['int'] + [h.dtype for h in self.hyperparameters] +\
            ['float'] * 2 * len(self.score_names) + ['float'] * 2 + ['int'] * 2
        self.df_scores_ = pd.DataFrame(columns=scores_columns)
        for column, dtype in zip(scores_columns, dtypes):
            self.df_scores_[column] = self.df_scores_[column].astype(dtype)

    def _run_next_experiment(self, module_path, fold_i):
        """Train and score the submission in module_path on one cv fold."""
        _, _, df_scores = run_submission_on_cv_fold(
            self.problem, module_path=module_path, fold=self.cv[fold_i],
            X_train=self.X_train, y_train=self.y_train)
        return df_scores

    def _update_df_scores(self, df_scores, fold_i):
        """Append the result of one experiment to df_scores_.

        Parameters:
            df_scores : pandas DataFrame
                The scores of the fold, indexed by 'train' and 'valid', with
                one column per score name and a 'time' column.
            fold_i : int
                The index of the cv fold that was run.
        """
        row = {'fold_i': fold_i}
        for h in self.hyperparameters:
            row[h.name] = h.default
        for name in self.score_names:
            row['train_' + name] = df_scores.loc['train', name]
            row['valid_' + name] = df_scores.loc['valid', name]
        row['train_time'] = float(df_scores.loc['train', 'time'])
        row['valid_time'] = float(df_scores.loc['valid', 'time'])
        row['n_train'] = len(self.cv[fold_i][0])
        row['n_valid'] = len(self.cv[fold_i][1])
        new_row = pd.DataFrame([row], columns=self.df_scores_.columns)
        # DataFrame.append was removed in pandas 2.0, so concatenate.
        if len(self.df_scores_) == 0:
            # First row: keep the column dtypes declared on the empty table.
            self.df_scores_ = new_row.astype(
                self.df_scores_.dtypes.to_dict())
        else:
            self.df_scores_ = pd.concat(
                [self.df_scores_, new_row], ignore_index=True)

    def _make_and_save_summary(self, hyperopt_output_path):
        """Summarize the scores per hyperparameter combination.

        Compute the mean (suffix _m) and standard deviation (suffix _s) of
        every column and the number of folds run, print the table, and save
        it as summary.csv in hyperopt_output_path.
        """
        summary_groupby = self.df_scores_.groupby(
            self.hyperparameter_names)
        means = summary_groupby.mean().drop(columns=['fold_i'])
        stds = summary_groupby.std().drop(columns=['fold_i'])
        counts = summary_groupby.count()[['n_train']].rename(
            columns={'n_train': 'n_folds'})
        self.df_summary_ = pd.merge(
            means, stds, left_index=True, right_index=True,
            suffixes=('_m', '_s'))
        self.df_summary_ = pd.merge(
            counts, self.df_summary_, left_index=True, right_index=True)
        print(self.df_summary_)
        summary_fname = os.path.join(hyperopt_output_path, 'summary.csv')
        self.df_summary_.to_csv(summary_fname)

    def _save_best_model(self):
        """Write the best hyperparameter values into the submission.

        The best combination is the one with the best mean validation score
        of the first score type of the problem.
        """
        official_scores = self.df_summary_[
            'valid_' + self.problem.score_types[0].name + '_m']
        if self.problem.score_types[0].is_lower_the_better:
            best_defaults = official_scores.idxmin()
        else:
            best_defaults = official_scores.idxmax()
        print('Best hyperparameters: ', best_defaults)
        # With a single hyperparameter the summary index is not a
        # MultiIndex, so the best label is a scalar, not a tuple.
        if not isinstance(best_defaults, tuple):
            best_defaults = (best_defaults,)
        for bd, h in zip(best_defaults, self.hyperparameters):
            h.set_default(bd)
        # Overwrite the submission with the best hyperparameter values
        write_hyperparameters(
            self.submission_dir, self.submission_dir,
            self.hypers_per_workflow_element)

    def run(self, n_iter, save_best=True):
        """Run the experiments, then summarize them.

        Parameters:
            n_iter : int
                The number of experiments to run; one experiment is one
                hyperparameter combination on one cv fold.
            save_best : bool
                Whether to overwrite the submission with the best
                hyperparameter values.
        """
        # Create hyperopt output directory
        hyperopt_output_path = os.path.join(
            self.submission_dir, 'hyperopt_output')
        if not os.path.exists(hyperopt_output_path):
            os.makedirs(hyperopt_output_path)
        for _ in range(n_iter):
            # Getting new hyperparameter values from engine
            fold_i, next_value_indices =\
                self.engine.next_hyperparameter_indices(
                    self.df_scores_, len(self.cv))
            # Updating hyperparameters
            for h, value_index in zip(
                    self.hyperparameters, next_value_indices):
                h.default_index = value_index
            # Writing submission files with new hyperparameter values
            output_submission_dir = mkdtemp()
            try:
                write_hyperparameters(
                    self.submission_dir, output_submission_dir,
                    self.hypers_per_workflow_element)
                # Calling the training script.
                df_scores = self._run_next_experiment(
                    output_submission_dir, fold_i)
            finally:
                # Remove the temporary submission even if training fails.
                shutil.rmtree(output_submission_dir)
            self._update_df_scores(df_scores, fold_i)
        if len(self.df_scores_) == 0:
            # Nothing was run, so there is nothing to summarize or save.
            return
        self._make_and_save_summary(hyperopt_output_path)
        if save_best:
            self._save_best_model()


def init_hyperopt(ramp_kit_dir, ramp_submission_dir, submission, engine_name,
                  ramp_data_dir=None):
    """Set up a hyperparameter optimization of a submission.

    Copy the submission into a new SUBMISSION_hyperopt directory (an existing
    one is deleted first), parse its hyperparameters, and create the engine.

    Parameters:
        ramp_kit_dir : str
            The directory where the ramp kit is found.
        ramp_submission_dir : str
            The directory containing the submission subdirectories.
        submission : str
            The name of the submission to optimize.
        engine_name : str
            The name of the hyperopt engine; only 'random' is available.
        ramp_data_dir : str, optional
            The directory of the data. If None, ramp_kit_dir is used.
    Return:
        hyperparameter_experiment : HyperparameterOptimization
    Raises:
        ValueError : if engine_name is not a known engine.
    """
    problem = assert_read_problem(ramp_kit_dir)
    hyperopt_submission = submission + '_hyperopt'
    hyperopt_submission_dir = os.path.join(
        ramp_submission_dir, hyperopt_submission)
    submission_dir = os.path.join(
        ramp_submission_dir, submission)
    if os.path.exists(hyperopt_submission_dir):
        shutil.rmtree(hyperopt_submission_dir)
    shutil.copytree(submission_dir, hyperopt_submission_dir)
    hyperparameters = parse_all_hyperparameters(
        hyperopt_submission_dir, problem.workflow)
    if engine_name == 'random':
        engine = RandomEngine(hyperparameters)
    else:
        raise ValueError('{} is not a valide engine name'.format(engine_name))
    hyperparameter_experiment = HyperparameterOptimization(
        hyperparameters, engine, ramp_kit_dir, hyperopt_submission_dir,
        ramp_data_dir=ramp_data_dir)
    return hyperparameter_experiment


def run_hyperopt(ramp_kit_dir, ramp_data_dir, ramp_submission_dir,
                 submission, engine_name, n_iter, save_best=True,
                 is_cleanup=False):
    """Hyperopt a submission.

    Parameters:
        ramp_kit_dir : str
            The directory where the ramp kit is found.
        ramp_data_dir : str
            The directory of the data.
        ramp_submission_dir : str
            The directory containing the submission subdirectories.
        submission : str
            The name of the submission to optimize.
        engine_name : str
            The name of the hyperopt engine.
        n_iter : int
            The number of experiments (hyperparameter combination on one cv
            fold) to run.
        save_best : bool
            Whether to write the best hyperparameter values into the
            SUBMISSION_hyperopt directory.
        is_cleanup : bool
            Whether to delete the SUBMISSION_hyperopt directory at the end.
    """
    hyperparameter_experiment = init_hyperopt(
        ramp_kit_dir, ramp_submission_dir, submission, engine_name,
        ramp_data_dir=ramp_data_dir)
    hyperparameter_experiment.run(n_iter, save_best=save_best)
    if is_cleanup:
        shutil.rmtree(hyperparameter_experiment.submission_dir)
hyperparameter_experiment.run(n_iter) + if is_cleanup: + shutil.rmtree(hyperparameter_experiment.submission_dir) diff --git a/rampwf/hyperopt/tests/__init__.py b/rampwf/hyperopt/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/README.md b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/README.md new file mode 100644 index 00000000..c63e86cc --- /dev/null +++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/README.md @@ -0,0 +1,23 @@ +# RAMP starting kit on the Titanic dataset + +Authors: Alexandre Gramfort & Balazs Kegl + +[![Build Status](https://travis-ci.org/ramp-kits/titanic.svg?branch=master)](https://travis-ci.org/ramp-kits/titanic) + +Go to [`ramp-worflow`](https://github.com/paris-saclay-cds/ramp-workflow) for more help on the [RAMP](http:www.ramp.studio) ecosystem. + +Install ramp-workflow (rampwf), then execute + +``` +ramp_test_submission +``` + +to test the starting kit submission (`submissions/starting_kit`) and + +``` +ramp_test_submission --submission=random_forest_20_5 +``` + +to test `random_forest_20_5` or any other submission in `submissions`. + +Get started on this RAMP with the [dedicated notebook](titanic_starting_kit.ipynb). diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/test.csv b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/test.csv new file mode 100644 index 00000000..e08b3a0e --- /dev/null +++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/test.csv @@ -0,0 +1,90 @@ +PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +270,1,1,"Bissette, Miss. Amelia",female,35.0,0,0,PC 17760,135.6333,C99,S +23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q +827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S +397,0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S +129,1,3,"Peter, Miss. 
Anna",female,,1,1,2668,22.3583,F E69,C +519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36.0,1,0,226875,26.0,,S +155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S +278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S +738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C +251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S +549,0,3,"Goldsmith, Mr. Frank John",male,33.0,1,1,363291,20.525,,S +173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S +291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0,0,0,19877,78.85,,S +639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41.0,0,5,3101295,39.6875,,S +399,0,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S +326,1,1,"Young, Miss. Marie Grice",female,36.0,0,0,PC 17760,135.6333,C32,C +219,1,1,"Bazzani, Miss. Albina",female,32.0,0,0,11813,76.2917,D15,C +835,0,3,"Allum, Mr. Owen George",male,18.0,0,0,2223,8.3,,S +764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36.0,1,2,113760,120.0,B96 B98,S +313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S +838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S +228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S +271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0,,S +473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,1,2,C.A. 34651,27.75,,S +126,1,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,2651,11.2417,,C +37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C +532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C +638,0,2,"Collyer, Mr. Harvey",male,31.0,1,1,C.A. 31921,26.25,,S +259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C +267,0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,3101295,39.6875,,S +350,0,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,,S +169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S +754,0,3,"Jonkoff, Mr. Lalio",male,23.0,0,0,349204,7.8958,,S +402,0,3,"Adams, Mr. 
John",male,26.0,0,0,341826,8.05,,S +75,1,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S +117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q +746,0,1,"Crosby, Capt. Edward Gifford",male,70.0,1,1,WE/P 5735,71.0,B22,S +843,1,1,"Serepeca, Miss. Augusta",female,30.0,0,0,113798,31.0,,C +77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S +645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +643,0,3,"Skoog, Miss. Margit Elizabeth",female,2.0,3,2,347088,27.9,,S +422,0,3,"Charters, Mr. David",male,21.0,0,0,A/5. 13032,7.7333,,Q +295,0,3,"Mineff, Mr. Ivan",male,24.0,0,0,349233,7.8958,,S +2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C +181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S +823,0,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S +206,0,3,"Strom, Miss. Telma Matilda",female,2.0,0,1,347054,10.4625,G6,S +672,0,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S +174,0,3,"Sivola, Mr. Antti Wilhelm",male,21.0,0,0,STON/O 2. 3101280,7.925,,S +571,1,2,"Harris, Mr. George",male,62.0,0,0,S.W./PP 752,10.5,,S +531,1,2,"Quick, Miss. Phyllis May",female,2.0,1,1,26360,26.0,,S +453,0,1,"Foreman, Mr. Benjamin Laventall",male,30.0,0,0,113051,27.75,C111,C +88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S +528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S +71,0,2,"Jenkin, Mr. Stephen Curnow",male,32.0,0,0,C.A. 33111,10.5,,S +367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60.0,1,0,110813,75.25,D37,C +884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S +834,0,3,"Augustsson, Mr. Albert",male,23.0,0,0,347468,7.8542,,S +208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C +79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S +865,0,2,"Gill, Mr. 
John William",male,24.0,0,0,233866,13.0,,S +379,0,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,,C +5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S +613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +386,0,2,"Davies, Mr. Charles Henry",male,18.0,0,0,S.O.C. 14879,73.5,,S +517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34.0,0,0,C.A. 34260,10.5,F33,S +533,0,3,"Elias, Mr. Joseph Jr",male,17.0,1,1,2690,7.2292,,C +765,0,3,"Eklund, Mr. Hans Linus",male,16.0,0,0,347074,7.775,,S +523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C +325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S +187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q +618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26.0,1,0,A/5. 3336,16.1,,S +619,1,2,"Becker, Miss. Marion Louise",female,4.0,2,1,230136,39.0,F4,S +305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S +633,1,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,13214,30.5,B50,C +612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S +721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S +526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q +655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18.0,0,0,365226,6.75,,Q +392,1,3,"Jansson, Mr. Carl Olof",male,21.0,0,0,350034,7.7958,,S +730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S +856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S +32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C +207,0,3,"Backstrom, Mr. Karl Alfred",male,32.0,1,0,3101278,15.85,,S +671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40.0,1,1,29750,39.0,,S +384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35.0,1,0,113789,52.0,,S +496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C +76,0,3,"Moen, Mr. 
Sigurd Hansen",male,25.0,0,0,348123,7.65,F G73,S diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/train.csv b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/train.csv new file mode 100644 index 00000000..14b79922 --- /dev/null +++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/data/train.csv @@ -0,0 +1,357 @@ +PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S +544,1,2,"Beane, Mr. Edward",male,32.0,1,0,2908,26.0,,S +375,0,3,"Palsson, Miss. Stina Viola",female,3.0,3,1,349909,21.075,,S +604,0,3,"Torber, Mr. Ernst William",male,44.0,0,0,364511,8.05,,S +866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0,,S +780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43.0,0,1,24160,211.3375,B3,S +789,1,3,"Dean, Master. Bertram Vere",male,1.0,1,2,C.A. 2315,20.575,,S +279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.125,,Q +842,0,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.5,,S +734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S +761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S +813,0,2,"Slemen, Mr. Richard James",male,35.0,0,0,28206,10.5,,S +368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C +757,0,3,"Carlsson, Mr. August Sigfrid",male,28.0,0,0,350042,7.7958,,S +616,1,2,"Herman, Miss. Alice",female,24.0,1,2,220845,65.0,,S +710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +221,1,3,"Sunderland, Mr. Victor Francis",male,16.0,0,0,SOTON/OQ 392089,8.05,,S +311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C +333,0,1,"Graham, Mr. George Edward",male,38.0,0,1,PC 17582,153.4625,C91,S +344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25.0,0,0,244361,13.0,,S +762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S +121,0,2,"Hickman, Mr. 
Stanley George",male,21.0,2,0,S.O.C. 14879,73.5,,S +269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58.0,0,1,PC 17582,153.4625,C125,S +146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S +658,0,3,"Bourke, Mrs. John (Catherine)",female,32.0,1,1,364849,15.5,,Q +25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S +788,0,3,"Rice, Master. George Hugh",male,8.0,4,1,382652,29.125,,Q +878,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S +178,0,1,"Isham, Miss. Ann Elizabeth",female,50.0,0,0,PC 17595,28.7125,C49,C +19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31.0,1,0,345763,18.0,,S +274,0,1,"Natsch, Mr. Charles H",male,37.0,0,1,PC 17596,29.7,C118,C +255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41.0,0,2,370129,20.2125,,S +501,0,3,"Calic, Mr. Petar",male,17.0,0,0,315086,8.6625,,S +182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C +436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S +490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9.0,1,1,C.A. 37671,15.9,,S +745,1,3,"Stranden, Mr. Juho",male,31.0,0,0,STON/O 2. 3101288,7.925,,S +441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45.0,1,1,F.C.C. 13529,26.25,,S +413,1,1,"Minahan, Miss. Daisy E",female,33.0,1,0,19928,90.0,C78,Q +434,0,3,"Kallio, Mr. Nikolai Erland",male,17.0,0,0,STON/O 2. 3101274,7.125,,S +547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19.0,1,0,2908,26.0,,S +380,0,3,"Gustafsson, Mr. Karl Gideon",male,19.0,0,0,347069,7.775,,S +595,0,2,"Chapman, Mr. John Henry",male,37.0,1,0,SC/AH 29037,26.0,,S +889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +627,0,2,"Kirkland, Rev. Charles Leonard",male,57.0,0,0,219533,12.35,,Q +753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33.0,0,0,345780,9.5,,S +320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40.0,1,1,16966,134.5,E34,C +509,0,3,"Olsen, Mr. 
Henry Margido",male,28.0,0,0,C 4001,22.525,,S +679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S +216,1,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C +782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17.0,1,0,17474,57.0,B20,S +890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C +414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,S +17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q +369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +131,0,3,"Drazenoic, Mr. Jozef",male,33.0,0,0,349241,7.8958,,C +122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S +480,1,3,"Hirvonen, Miss. Hildur E",female,2.0,0,1,3101298,12.2875,,S +690,1,1,"Madill, Miss. Georgette Alexandra",female,15.0,0,1,24160,211.3375,B5,S +263,0,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.65,E67,S +82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29.0,0,0,345779,9.5,,S +548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C +256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29.0,0,2,2650,15.2458,,C +565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +114,0,3,"Jussila, Miss. Katriina",female,20.0,1,0,4136,9.825,,S +608,1,1,"Daniel, Mr. Robert Williams",male,27.0,0,0,113804,30.5,,S +485,1,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,B49,C +91,0,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,,S +373,0,3,"Beavan, Mr. William Thomas",male,19.0,0,0,323951,8.05,,S +289,1,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,,S +590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S +135,0,2,"Sobey, Mr. Samuel James Hayden",male,25.0,0,0,C.A. 29178,13.0,,S +236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +95,0,3,"Coxon, Mr. Daniel",male,59.0,0,0,364500,7.25,,S +317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24.0,1,0,244367,26.0,,S +686,0,2,"Laroche, Mr. 
Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C +157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16.0,0,0,35851,7.7333,,Q +201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28.0,0,0,345770,9.5,,S +258,1,1,"Cherry, Miss. Gladys",female,30.0,0,0,110152,86.5,B77,S +202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S +429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q +818,0,2,"Mallet, Mr. Albert",male,31.0,1,1,S.C./PARIS 2079,37.0042,,C +51,0,3,"Panula, Master. Juha Niilo",male,7.0,4,1,3101295,39.6875,,S +579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C +493,0,1,"Molson, Mr. Harry Markland",male,55.0,0,0,113787,30.5,C30,S +850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C +507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33.0,0,2,26360,26.0,,S +475,0,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S +811,0,3,"Alexander, Mr. William",male,26.0,0,0,3474,7.8875,,S +162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40.0,0,0,C.A. 33595,15.75,,S +720,0,3,"Johnson, Mr. Malkolm Joackim",male,33.0,0,0,347062,7.775,,S +22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S +304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S +597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33.0,,S +598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S +634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S +741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30.0,D45,S +541,1,1,"Crosby, Miss. Harriet R",female,36.0,0,2,WE/P 5735,71.0,B22,S +644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S +781,1,3,"Ayoub, Miss. Banoura",female,13.0,0,0,2687,7.2292,,C +254,0,3,"Lobb, Mr. William Arthur",male,30.0,1,0,A/5. 3336,16.1,,S +21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S +563,0,2,"Norman, Mr. 
Robert Douglas",male,28.0,0,0,218629,13.5,,S +790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C +868,0,1,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,PC 17590,50.4958,A24,S +60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S +30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S +488,0,1,"Kent, Mr. Edward Austin",male,58.0,0,0,11771,29.7,B37,C +657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S +87,0,3,"Ford, Mr. William Neal",male,16.0,1,3,W./C. 6608,34.375,,S +670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52.0,C126,S +440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S +599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C +474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23.0,0,0,SC/AH Basle 541,13.7917,D,C +176,0,3,"Klasen, Mr. Klas Albin",male,18.0,1,1,350404,7.8542,,S +573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36.0,0,0,PC 17474,26.3875,E25,S +24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S +463,0,1,"Gee, Mr. Arthur H",male,47.0,0,0,111320,38.5,E63,S +372,0,3,"Wiklund, Mr. Jakob Alfred",male,18.0,1,0,3101267,6.4958,,S +729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25.0,1,0,236853,26.0,,S +876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.225,,C +689,0,3,"Fischer, Mr. Eberhard Thelander",male,18.0,0,0,350036,7.7958,,S +693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S +776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18.0,0,0,347078,7.75,,S +454,1,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,17453,89.1042,C92,C +529,0,3,"Salonen, Mr. Johan Werner",male,39.0,0,0,3101296,7.925,,S +355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C +663,0,1,"Colley, Mr. Edward Pomeroy",male,47.0,0,0,5727,25.5875,E58,S +669,0,3,"Cook, Mr. Jacob",male,43.0,0,0,A/5 3536,8.05,,S +407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51.0,0,0,347064,7.75,,S +360,1,3,"Mockler, Miss. 
Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S +12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S +70,0,3,"Kink, Mr. Vincenz",male,26.0,2,0,315151,8.6625,,S +767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C +536,1,2,"Hart, Miss. Eva Miriam",female,7.0,0,2,F.C.C. 13529,26.25,,S +870,1,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,,S +646,1,1,"Harper, Mr. Henry Sleeper",male,48.0,1,0,PC 17572,76.7292,D33,C +180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S +252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29.0,1,1,347054,10.4625,G6,S +628,1,1,"Longley, Miss. Gretchen Fiske",female,21.0,0,0,13502,77.9583,D9,S +334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16.0,2,0,345764,18.0,,S +232,0,3,"Larsson, Mr. Bengt Edvin",male,29.0,0,0,347067,7.775,,S +179,0,2,"Hale, Mr. Reginald",male,30.0,0,0,250653,13.0,,S +853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C +306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +797,1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S +839,1,3,"Chip, Mr. Chang",male,32.0,0,0,1601,56.4958,,S +416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S +607,0,3,"Karaic, Mr. Milan",male,30.0,0,0,349246,7.8958,,S +277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45.0,0,0,347073,7.75,,S +589,0,3,"Gilinski, Mr. Eliezer",male,22.0,0,0,14973,8.05,,S +736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S +879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S +883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S +112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +318,0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,29011,14.0,,S +552,0,2,"Sharp, Mr. Percival James R",male,27.0,0,0,244358,26.0,,S +515,0,3,"Coleff, Mr. 
Satio",male,24.0,0,0,349209,7.4958,,S +245,0,3,"Attalah, Mr. Sleiman",male,30.0,0,0,2694,7.225,,C +795,0,3,"Dantcheff, Mr. Ristiu",male,25.0,0,0,349203,7.8958,,S +654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +871,0,3,"Balkic, Mr. Cerin",male,26.0,0,0,349248,7.8958,,S +491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S +713,1,1,"Taylor, Mr. Elmer Zebley",male,48.0,1,0,19996,52.0,C126,S +194,1,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0,F2,S +218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,243847,27.0,,S +149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S +722,0,3,"Jensen, Mr. Svend Lauritz",male,17.0,1,0,350048,7.0542,,S +139,0,3,"Osen, Mr. Olaf Elon",male,16.0,0,0,7534,9.2167,,S +47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q +800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30.0,1,1,345773,24.15,,S +825,0,3,"Panula, Master. Urho Abraham",male,2.0,4,1,3101295,39.6875,,S +702,1,1,"Silverthorne, Mr. Spencer Victor",male,35.0,0,0,PC 17475,26.2875,E24,S +356,0,3,"Vanden Steen, Mr. Leo Peter",male,28.0,0,0,345783,9.5,,S +420,0,3,"Van Impe, Miss. Catharina",female,10.0,0,2,345773,24.15,,S +882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S +249,1,1,"Beckwith, Mr. Richard Leonard",male,37.0,1,1,11751,52.5542,D35,S +844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C +50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S +696,0,2,"Chapman, Mr. Charles Henry",male,52.0,0,0,248731,13.5,,S +293,0,2,"Levy, Mr. Rene Jacques",male,36.0,0,0,SC/Paris 2163,12.875,D,C +543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11.0,4,2,347082,31.275,,S +769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q +152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6,C2,S +347,1,2,"Smith, Miss. Marion Elsie",female,40.0,0,0,31418,13.0,,S +89,1,1,"Fortune, Miss. 
Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S +447,1,2,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S +537,0,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.55,B38,S +98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C +581,1,2,"Christy, Miss. Julie Rachel",female,25.0,1,1,237789,30.0,,S +376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C +684,0,3,"Goodwin, Mr. Charles Edward",male,14.0,5,2,CA 2144,46.9,,S +339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S +707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S +685,0,2,"Brown, Mr. Thomas William Solomon",male,60.0,1,1,29750,39.0,,S +388,1,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,,S +748,1,2,"Sinkkonen, Miss. Anna",female,30.0,0,0,250648,13.0,,S +409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21.0,0,0,312992,7.775,,S +281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q +857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S +786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25.0,0,0,374887,7.25,,S +243,0,2,"Coleridge, Mr. Reginald Charles",male,29.0,0,0,W./C. 14263,10.5,,S +394,1,1,"Newell, Miss. Marjorie",female,23.0,1,0,35273,113.275,D36,C +881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0,,S +415,1,3,"Sundman, Mr. Johan Julian",male,44.0,0,0,STON/O 2. 3101269,7.925,,S +477,0,2,"Renouf, Mr. Peter Henry",male,34.0,1,0,31027,21.0,,S +691,1,1,"Dick, Mr. Albert Adrian",male,31.0,1,0,17474,57.0,B20,S +481,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S +505,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.5,B79,S +873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,B51 B53 B55,S +567,0,3,"Stoytcheff, Mr. Ilia",male,19.0,0,0,349205,7.8958,,S +309,0,2,"Abelson, Mr. Samuel",male,30.0,1,0,P/PP 3381,24.0,,C +329,1,3,"Goldsmith, Mrs. 
Frank John (Emily Alice Brown)",female,31.0,1,1,363291,20.525,,S +566,0,3,"Davies, Mr. Alfred J",male,24.0,2,0,A/4 48871,24.15,,S +44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C +433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42.0,1,0,SC/AH 3085,26.0,,S +106,0,3,"Mionoff, Mr. Stoytcho",male,28.0,0,0,349207,7.8958,,S +391,1,1,"Carter, Mr. William Ernest",male,36.0,1,2,113760,120.0,B96 B98,S +404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28.0,1,0,STON/O2. 3101279,15.85,,S +801,0,2,"Ponesell, Mr. Martin",male,34.0,0,0,250647,13.0,,S +209,1,3,"Carr, Miss. Helen ""Ellen""",female,16.0,0,0,367231,7.75,,Q +262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,31.3875,,S +799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30.0,0,0,2685,7.2292,,C +867,1,2,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,,C +241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C +817,0,3,"Heininen, Miss. Wendla Maria",female,23.0,0,0,STON/O2. 3101290,7.925,,S +418,1,2,"Silven, Miss. Lyyli Karoliina",female,18.0,0,2,250652,13.0,,S +303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S +885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S +301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +504,0,3,"Laitinen, Miss. Kristina Sofia",female,37.0,0,0,4135,9.5875,,S +83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.275,,S +299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S +849,0,2,"Harper, Rev. John",male,28.0,0,1,248727,33.0,,S +750,0,3,"Connaghton, Mr. Michael",male,31.0,0,0,335097,7.75,,Q +396,0,3,"Johansson, Mr. Erik",male,22.0,0,0,350052,7.7958,,S +68,0,3,"Crease, Mr. Ernest James",male,19.0,0,0,S.P. 3464,8.1583,,S +244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22.0,0,0,STON/O 2. 3101275,7.125,,S +183,0,3,"Asplund, Master. 
Clarence Gustaf Hugo",male,9.0,4,2,347077,31.3875,,S +460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q +67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,S +197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q +808,0,3,"Pettersson, Miss. Ellen Natalia",female,18.0,0,0,347087,7.775,,S +806,0,3,"Johansson, Mr. Karl Johan",male,31.0,0,0,347063,7.775,,S +159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S +508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S +829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q +342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S +542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9.0,4,2,347082,31.275,,S +888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S +260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50.0,0,1,230433,26.0,,S +56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S +73,0,2,"Hood, Mr. Ambrose Jr",male,21.0,0,0,S.O.C. 14879,73.5,,S +586,1,1,"Taussig, Miss. Ruth",female,18.0,0,2,110413,79.65,E68,S +494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C +534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C +85,1,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5,,S +357,1,1,"Bowerman, Miss. Elsie Edith",female,22.0,0,1,113505,55.0,E33,S +522,0,3,"Vovk, Mr. Janko",male,22.0,0,0,349252,7.8958,,S +15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S +831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15.0,1,0,2659,14.4542,,C +555,1,3,"Ohman, Miss. Velin",female,22.0,0,0,347085,7.775,,S +189,0,3,"Bourke, Mr. John",male,40.0,1,1,364849,15.5,,Q +439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S +298,0,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S +108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S +276,1,1,"Andrews, Miss. 
Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S +99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34.0,0,1,231919,23.0,,S +783,0,1,"Long, Mr. Milton Clyde",male,29.0,0,0,113501,30.0,D6,S +332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S +468,0,1,"Smart, Mr. John Montgomery",male,56.0,0,0,113792,26.55,,S +796,0,2,"Otter, Mr. Richard",male,39.0,0,0,28213,13.0,,S +294,0,3,"Haas, Miss. Aloisia",female,24.0,0,0,349236,8.85,,S +596,0,3,"Van Impe, Mr. Jean Baptiste",male,36.0,1,1,345773,24.15,,S +115,0,3,"Attalah, Miss. Malake",female,17.0,0,0,2627,14.4583,,C +363,0,3,"Barbara, Mrs. (Catherine David)",female,45.0,0,1,2691,14.4542,,C +109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S +9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S +196,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C +7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S +600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49.0,1,0,PC 17485,56.9292,A20,C +428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19.0,0,0,250655,26.0,,S +760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33.0,0,0,110152,86.5,B77,S +217,1,3,"Honkanen, Miss. Eliina",female,27.0,0,0,STON/O2. 3101283,7.925,,S +280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S +81,0,3,"Waelens, Mr. Achille",male,22.0,0,0,345767,9.0,,S +755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48.0,1,2,220845,65.0,,S +708,1,1,"Calderhead, Mr. Edward Pennington",male,42.0,0,0,PC 17476,26.2875,E24,S +315,0,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.25,,S +220,0,2,"Harris, Mr. Walter",male,30.0,0,0,W/C 14208,10.5,,S +39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,,S +668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S +609,1,2,"Laroche, Mrs. 
Joseph (Juliette Marie Louise Lafargue)",female,22.0,1,2,SC/Paris 2123,41.5792,,C +443,0,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,S +275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q +150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42.0,0,0,244310,13.0,,S +127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q +395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24.0,0,2,PP 9549,16.7,G6,S +111,0,1,"Porter, Mr. Walter Chamberlain",male,47.0,0,0,110465,52.0,C110,S +791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q +558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C +57,1,2,"Rugg, Miss. Emily",female,21.0,0,0,C.A. 31026,10.5,,S +58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C +28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S +860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C +695,0,1,"Weir, Col. John",male,60.0,0,0,113800,26.55,,S +170,0,3,"Ling, Mr. Lee",male,28.0,0,0,1601,56.4958,,S +425,0,3,"Rosblom, Mr. Viktor Richard",male,18.0,1,1,370129,20.2125,,S +476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52.0,A14,S +641,0,3,"Jensen, Mr. Hans Peder",male,20.0,0,0,350050,7.8542,,S +11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S +735,0,2,"Troupiansky, Mr. Moses Aaron",male,23.0,0,0,233639,13.0,,S +147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S +138,0,1,"Futrelle, Mr. Jacques Heath",male,37.0,1,0,113803,53.1,C123,S +133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,A/5. 3337,14.5,,S +330,1,1,"Hippach, Miss. Jean Gertrude",female,16.0,0,1,111361,57.9792,B18,C +132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20.0,0,0,SOTON/O.Q. 3101307,7.05,,S +726,0,3,"Oreskovic, Mr. Luka",male,20.0,0,0,315094,8.6625,,S +500,0,3,"Svensson, Mr. Olof",male,24.0,0,0,350035,7.7958,,S +113,0,3,"Barton, Mr. David John",male,22.0,0,0,324669,8.05,,S +514,1,1,"Rothschild, Mrs. 
Martin (Elizabeth L. Barrett)",female,54.0,1,0,PC 17603,59.4,,C +190,0,3,"Turcin, Mr. Stjepan",male,36.0,0,0,349247,7.8958,,S +841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S +524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44.0,0,1,111361,57.9792,B18,C +771,0,3,"Lievens, Mr. Rene Aime",male,24.0,0,0,345781,9.5,,S +697,0,3,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,,S +302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q +193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19.0,1,0,350046,7.8542,,S +92,0,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,347466,7.8542,,S +605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35.0,0,0,111426,26.55,,C +464,0,2,"Milling, Mr. Jacob Christian",male,48.0,0,0,234360,13.0,,S +793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0,,S +733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0.0,,S +614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q +185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4.0,0,2,315153,22.025,,S +390,1,2,"Lehmann, Miss. Bertha",female,17.0,0,0,SC 1748,12.0,,C +54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29.0,1,0,2926,26.0,,S +69,1,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,S +816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,S +584,0,1,"Ross, Mr. John Hugo",male,36.0,0,0,13049,40.125,A10,C +171,0,1,"Van der hoef, Mr. Wyckoff",male,61.0,0,0,111240,33.5,B19,S +455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S +432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S +343,0,2,"Collander, Mr. Erik Gustaf",male,28.0,0,0,248740,13.0,,S +449,1,3,"Baclini, Miss. Marie Catherine",female,5.0,2,1,2666,19.2583,,C +223,0,3,"Green, Mr. George Henry",male,51.0,0,0,21440,8.05,,S +551,1,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C +715,0,2,"Greenberg, Mr. 
Samuel",male,52.0,0,0,250647,13.0,,S +662,0,3,"Badt, Mr. Mohamed",male,40.0,0,0,2623,7.225,,C +743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C +410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/problem.py b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/problem.py new file mode 100644 index 00000000..9199e18e --- /dev/null +++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/problem.py @@ -0,0 +1,42 @@ +import os +import pandas as pd +import rampwf as rw +from sklearn.model_selection import StratifiedShuffleSplit + +problem_title = 'Titanic survival classification' +_target_column_name = 'Survived' +_ignore_column_names = ['PassengerId'] +_prediction_label_names = [0, 1] +# A type (class) which will be used to create wrapper objects for y_pred +Predictions = rw.prediction_types.make_multiclass( + label_names=_prediction_label_names) +# An object implementing the workflow +workflow = rw.workflows.FeatureExtractorClassifier() + +score_types = [ + rw.score_types.ROCAUC(name='auc'), + rw.score_types.Accuracy(name='acc'), + rw.score_types.NegativeLogLikelihood(name='nll'), +] + + +def get_cv(X, y): + cv = StratifiedShuffleSplit(n_splits=8, test_size=0.2, random_state=57) + return cv.split(X, y) + + +def _read_data(path, f_name): + data = pd.read_csv(os.path.join(path, 'data', f_name)) + y_array = data[_target_column_name].values + X_df = data.drop([_target_column_name] + _ignore_column_names, axis=1) + return X_df, y_array + + +def get_train_data(path='.'): + f_name = 'train.csv' + return _read_data(path, f_name) + + +def get_test_data(path='.'): + f_name = 'test.csv' + return _read_data(path, f_name) diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/requirements.txt b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/requirements.txt new file mode 100644 index 00000000..716fb8d1 --- /dev/null 
+++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/requirements.txt @@ -0,0 +1,5 @@ +numpy +scikit-learn +pandas +matplotlib +seaborn diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/submissions/starting_kit/classifier.py b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/submissions/starting_kit/classifier.py new file mode 100755 index 00000000..bc9a4954 --- /dev/null +++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/submissions/starting_kit/classifier.py @@ -0,0 +1,27 @@ +from sklearn.linear_model import LogisticRegression +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.base import BaseEstimator +from rampwf.hyperopt import Hyperparameter + +# RAMP START HYPERPARAMETERS +logreg_C = Hyperparameter( + dtype='float', default=1.0, values=[0.01, 0.1, 0.9, 1.0]) +imputer_strategy = Hyperparameter( + dtype='object', default='median', values=['mean', 'median']) +# RAMP END HYPERPARAMETERS + + +class Classifier(BaseEstimator): + def __init__(self): + self.clf = Pipeline([ + ('imputer', + SimpleImputer(strategy=str(imputer_strategy))), + ('classifier', LogisticRegression(C=float(logreg_C))) + ]) + + def fit(self, X, y): + self.clf.fit(X, y) + + def predict_proba(self, X): + return self.clf.predict_proba(X) diff --git a/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/submissions/starting_kit/feature_extractor.py b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/submissions/starting_kit/feature_extractor.py new file mode 100755 index 00000000..ade21359 --- /dev/null +++ b/rampwf/hyperopt/tests/interfaces/header_in_files/titanic/submissions/starting_kit/feature_extractor.py @@ -0,0 +1,50 @@ +import pandas as pd +import numpy as np +from rampwf.hyperopt import Hyperparameter + +# RAMP START HYPERPARAMETERS +complex_features = Hyperparameter( + dtype='bool', default=True, values=[True, False]) +# RAMP END HYPERPARAMETERS + + +class FeatureExtractor(): + def 
__init__(self): + pass + + def fit(self, X_df, y): + pass + + def transform(self, X_df): + if int(complex_features): + X_df = X_df.assign(LogFare=lambda x: np.log(x.Fare + 10.)) + X_df = X_df.assign(Cab=lambda x: x.Cabin == x.Cabin) + + X_df_new = pd.concat( + [X_df.get(['Parch']), + X_df.assign( + LogFare=lambda x: 10 * np.log(x.Fare + 1.)**0.5).get( + ['LogFare']), + X_df.assign(SibSp=lambda x: np.exp(x.SibSp) + 0.6 * np.exp( + x.Parch)).get(['SibSp']), + X_df.assign(Age=lambda x: 10 * np.log(x.Age + 0.01)).get( + ['Age']), + pd.get_dummies(X_df.Sex, prefix='Sex', drop_first=False), + pd.get_dummies( + X_df.Pclass, prefix='Pclass', drop_first=False), + pd.get_dummies( + X_df.Embarked, prefix='Embarked', drop_first=True), + pd.get_dummies(X_df.Cab, prefix='Cab', drop_first=True)], + axis=1) + else: + X_df_new = pd.concat( + [X_df.get(['Fare', 'Age', 'SibSp', 'Parch']), + pd.get_dummies(X_df.Sex, prefix='Sex', drop_first=True), + pd.get_dummies(X_df.Pclass, prefix='Pclass', drop_first=True), + pd.get_dummies( + X_df.Embarked, prefix='Embarked', drop_first=True)], + axis=1) + + X_df_new = X_df_new.fillna(-1) + XX = X_df_new.values + return XX diff --git a/rampwf/hyperopt/tests/test_hyperparameter.py b/rampwf/hyperopt/tests/test_hyperparameter.py new file mode 100644 index 00000000..6078fae6 --- /dev/null +++ b/rampwf/hyperopt/tests/test_hyperparameter.py @@ -0,0 +1,44 @@ +import os +import pytest +from rampwf.hyperopt import Hyperparameter, run_hyperopt + +PATH = os.path.dirname(__file__) + + +def test_hyperparameter(): + hp_1 = Hyperparameter(dtype='int', values=[1, 2, 3]) + assert hp_1.default == 1 + assert hp_1.n_values == 3 + assert list(hp_1.values) == [1, 2, 3] + assert list(hp_1.prior) == [1. / 3, 1. / 3, 1. 
/ 3] + hp_2 = Hyperparameter(dtype='int', values=[1, 2, 3], prior=[0.1, 0.7, 0.2]) + assert hp_2.default == 1 + assert hp_2.n_values == 3 + assert list(hp_2.values) == [1, 2, 3] + assert list(hp_2.prior) == [0.1, 0.7, 0.2] + hp_3 = Hyperparameter(dtype='int', default=1) + assert hp_3.default == 1 + assert hp_3.n_values == 1 + assert list(hp_3.values) == [1] + assert list(hp_3.prior) == [1.0] + with pytest.raises(ValueError) as e: + Hyperparameter(dtype='int', values=[1, 2, 3], prior=[0.1, 0.7]) + assert str(e.value) == 'len(values) == 3 != 2 == len(prior)' + with pytest.raises(ValueError) as e: + Hyperparameter(dtype='int') + assert str(e.value) == 'Either default or values must be defined.' + with pytest.raises(ValueError) as e: + Hyperparameter(dtype='int', values=[]) + assert str(e.value) == 'Values needs to contain at least one element.' + with pytest.raises(ValueError) as e: + Hyperparameter(dtype='int', default=2, values=[1]) + assert str(e.value) == 'Default must be among values.' 
+ + +def test_hyperopt(): + ramp_kit_dir = os.path.join( + PATH, 'interfaces', 'header_in_files', 'titanic') + submission = 'starting_kit' + run_hyperopt( + ramp_kit_dir, ramp_kit_dir, os.path.join(ramp_kit_dir, 'submissions'), + submission, 'random', 64, is_cleanup=True) diff --git a/rampwf/utils/__init__.py b/rampwf/utils/__init__.py index c3aa40bf..bb4a8d90 100644 --- a/rampwf/utils/__init__.py +++ b/rampwf/utils/__init__.py @@ -2,14 +2,16 @@ ramp_test_submission, ramp_test_notebook, ramp_convert_notebook, ramp_blend_submissions) from .testing import ( - assert_title, assert_data, assert_cv, assert_submission, assert_notebook, - blend_submissions) + assert_title, assert_data, assert_cv, assert_read_problem, + assert_submission, assert_notebook, blend_submissions) +from .submission import run_submission_on_cv_fold from .combine import get_score_cv_bags from .importing import import_file __all__ = ['assert_cv', 'assert_data', 'assert_notebook', + 'assert_read_problem', 'assert_submission', 'assert_title', 'blend_submissions', @@ -19,5 +21,6 @@ 'ramp_convert_notebook', 'ramp_test_notebook', 'ramp_test_submission', + 'run_submission_on_cv_fold', 'import_file' ] diff --git a/rampwf/utils/scoring.py b/rampwf/utils/scoring.py index 7cb5cf72..3c3f148b 100644 --- a/rampwf/utils/scoring.py +++ b/rampwf/utils/scoring.py @@ -50,12 +50,14 @@ def mean_score_matrix(df_scores_list, score_types): scores = np.array([df_scores.values for df_scores in df_scores_list]) meanss = scores.mean(axis=0) stdss = scores.std(axis=0) + precisions = [st.precision for st in score_types] + precisions.append(1) # for time # we use unicode no break space so split in print_df_scores works strs = np.array([[ u'{val}\u00A0±\u00A0{std}'.format( - val=round(mean, score_type.precision), - std=round(std, score_type.precision + 1)) - for mean, std, score_type in zip(means, stds, score_types)] + val=round(mean, prec), + std=round(std, prec + 1)) + for mean, std, prec in zip(means, stds, precisions)] for 
means, stds in zip(meanss, stdss)]) df_scores = pd.DataFrame( strs, columns=df_scores_list[0].columns, index=df_scores_list[0].index) diff --git a/rampwf/utils/submission.py b/rampwf/utils/submission.py index ad70b21d..36301080 100644 --- a/rampwf/utils/submission.py +++ b/rampwf/utils/submission.py @@ -48,8 +48,8 @@ def save_submissions(problem, y_pred, data_path='.', output_path='.', pass -def train_test_submission(problem, module_path, X_train, y_train, X_test, - is_pickle, save_output, output_path, +def train_test_submission(problem, module_path, X_train, y_train, X_test=None, + is_pickle=False, save_output=False, output_path='.', model_name='model.pkl', train_is=None): """Train and test submission, on cv fold if train_is not none. @@ -134,10 +134,10 @@ def train_test_submission(problem, module_path, X_train, y_train, X_test, return (y_pred_train, y_pred_test), (train_time, valid_time, test_time) -def run_submission_on_cv_fold(problem, module_path, X_train, y_train, - X_test, y_test, score_types, - is_pickle, save_output, fold_output_path, - fold, ramp_data_dir): +def run_submission_on_cv_fold(problem, module_path, fold, X_train, + y_train, X_test=None, y_test=None, + is_pickle=False, save_output=False, + fold_output_path='.', ramp_data_dir='.'): """Run submission, compute and return predictions and scores on cv. 
Parameters @@ -154,8 +154,6 @@ def run_submission_on_cv_fold(problem, module_path, X_train, y_train, returned by problem.get_test_data y_test : a list of testing ground truth or None returned by problem.get_test_data - score_types : a list of score types - problem.score_types is_pickle : boolean True if the model should be pickled save_output : boolean @@ -175,6 +173,7 @@ def run_submission_on_cv_fold(problem, module_path, X_train, y_train, df_scores : pd.DataFrame table of scores (rows = train/valid/test steps, columns = scores) """ + score_types = problem.score_types train_is, valid_is = fold pred, timing = train_test_submission( problem, module_path, X_train, y_train, X_test, is_pickle, @@ -216,6 +215,7 @@ def run_submission_on_cv_fold(problem, module_path, X_train, y_train, ('valid', predictions_train_valid), ('test', predictions_test)]), ) + df_scores['time'] = [train_time, valid_time, test_time] set_state('scored', save_output, fold_output_path) return predictions_train_valid, predictions_test, df_scores @@ -235,6 +235,7 @@ def run_submission_on_cv_fold(problem, module_path, X_train, y_train, predictions=OrderedDict([('train', predictions_train_train), ('valid', predictions_train_valid)]), ) + df_scores['time'] = [train_time, valid_time] set_state('scored', save_output, fold_output_path) return predictions_train_valid, None, df_scores diff --git a/rampwf/utils/testing.py b/rampwf/utils/testing.py index 919165e0..ed304e9a 100644 --- a/rampwf/utils/testing.py +++ b/rampwf/utils/testing.py @@ -10,7 +10,7 @@ import pandas as pd from .combine import blend_on_fold -from .io import load_y_pred, set_state +from .io import load_y_pred from .pretty_print import print_title, print_df_scores from .notebook import execute_notebook, convert_notebook from .scoring import round_df_scores, mean_score_matrix @@ -118,9 +118,9 @@ def assert_submission(ramp_kit_dir='.', ramp_data_dir='.', predictions_valid, predictions_test, df_scores = \ run_submission_on_cv_fold( - problem, 
submission_path, X_train, y_train, X_test, y_test, - score_types, is_pickle, save_output, fold_output_path, - fold, ramp_data_dir) + problem, submission_path, fold, X_train, y_train, + X_test, y_test, is_pickle, save_output, fold_output_path, + ramp_data_dir) if save_output: filename = os.path.join(fold_output_path, 'scores.csv') df_scores.to_csv(filename) diff --git a/setup.py b/setup.py index 6095030a..954ea66f 100755 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ 'console_scripts': [ 'ramp-test = rampwf.utils.cli.testing:start', 'ramp-show = rampwf.utils.cli.show:start', + 'ramp-hyperopt = rampwf.hyperopt.cli.hyperopt:start', 'ramp_test_submission=' 'rampwf.utils.command_line:ramp_test_submission', 'ramp_test_notebook='