Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed missing datetime for timebased features and allowed warnings to be silenced #874

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
34 changes: 27 additions & 7 deletions tsfresh/convenience/bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def _feature_extraction_on_chunk_helper(
column_value,
default_fc_parameters,
kind_to_fc_parameters,
show_warnings=True,
):
"""
Helper function wrapped around _do_extraction_on_chunk to use the correct format
Expand All @@ -28,18 +29,23 @@ def _feature_extraction_on_chunk_helper(
default_fc_parameters = {}

if column_sort is not None:
df = df.sort_values(column_sort)

chunk = df[column_id].iloc[0], df[column_kind].iloc[0], df[column_value]
data = df[[column_sort, column_value]].set_index(column_sort)
data = data.sort_index(level=column_sort)
else:
data = df[column_value]

chunk = df[column_id].iloc[0], df[column_kind].iloc[0], data
features = _do_extraction_on_chunk(
chunk,
default_fc_parameters=default_fc_parameters,
kind_to_fc_parameters=kind_to_fc_parameters,
show_warnings=show_warnings,
)
features = pd.DataFrame(features, columns=[column_id, "variable", "value"])
features["value"] = features["value"].astype("double")

return features[[column_id, "variable", "value"]]
return features


def dask_feature_extraction_on_chunk(
Expand All @@ -50,6 +56,8 @@ def dask_feature_extraction_on_chunk(
column_sort=None,
default_fc_parameters=None,
kind_to_fc_parameters=None,
taste_of_pandas_df=False,
show_warnings=True,
):
"""
Extract features on a grouped dask dataframe given the column names and the extraction settings.
Expand Down Expand Up @@ -125,6 +133,14 @@ def dask_feature_extraction_on_chunk(
:param column_value: The name for the column keeping the value itself.
:type column_value: str

:param taste_of_pandas_df: A pandas DataFrame that has undergone the same preprocessing steps as the Dask DataFrame (e.g. ``df.groupby()``, etc.); if given, it is used to infer the output metadata.
:type taste_of_pandas_df: pandas.DataFrame

:param show_warnings: Whether to show warnings in tsfresh.
:type show_warnings: bool



:return: A dask dataframe with the columns ``column_id``, "variable" and "value". The index is taken
from the grouped dataframe.
:rtype: dask.dataframe.DataFrame (id int64, variable object, value float64)
Expand All @@ -138,11 +154,14 @@ def dask_feature_extraction_on_chunk(
column_value=column_value,
default_fc_parameters=default_fc_parameters,
kind_to_fc_parameters=kind_to_fc_parameters,
show_warnings=show_warnings,
)
return df.apply(
feature_extraction,
meta=[(column_id, "int64"), ("variable", "object"), ("value", "float64")],
)
if taste_of_pandas_df:
meta = taste_of_pandas_df.apply(feature_extraction)
else:
meta = [(column_id, "int64"), ("variable", "object"), ("value", "float64")]

return df.apply(feature_extraction, meta=meta)


def spark_feature_extraction_on_chunk(
Expand All @@ -154,6 +173,7 @@ def spark_feature_extraction_on_chunk(
default_fc_parameters=None,
kind_to_fc_parameters=None,
):

"""
Extract features on a grouped spark dataframe given the column names and the extraction settings.
This wrapper function should only be used if you have a spark dataframe as input.
Expand Down
2 changes: 1 addition & 1 deletion tsfresh/feature_extraction/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def _f():
continue
x = data
else:
x = data.values
x = data.values.flatten()

if getattr(func, "fctype", None) == "combiner":
result = func(x, param=parameter_list)
Expand Down
2 changes: 1 addition & 1 deletion tsfresh/feature_extraction/feature_calculators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2282,7 +2282,7 @@ def linear_trend_timewise(x, param):
times_seconds = (ix - ix[0]).total_seconds()
times_hours = np.asarray(times_seconds / float(3600))

linReg = linregress(times_hours, x.values)
linReg = linregress(times_hours, x.values.flatten())

return [
('attr_"{}"'.format(config["attr"]), getattr(linReg, config["attr"]))
Expand Down