Skip to content

Commit

Permalink
refactor(eda): ran just ci
Browse files (browse the repository at this point in the history)
  • Loading branch information
Devin Lu committed Apr 21, 2022
1 parent c801377 commit 5f5ba73
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 31 deletions.
4 changes: 2 additions & 2 deletions dataprep/clean/clean_country.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def _format_country(
return result, 2 if val != result else 3


@lru_cache(maxsize=2**20)
@lru_cache(maxsize=2 ** 20)
def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, clean: bool) -> Any:
"""
Finds the index of the given country in the DATA dataframe.
Expand Down Expand Up @@ -322,7 +322,7 @@ def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, c
return (None, "unknown") if clean else False


@lru_cache(maxsize=2**20)
@lru_cache(maxsize=2 ** 20)
def _check_fuzzy_dist(country: str, fuzzy_dist: int) -> Any:
"""
A match is found if a country has an edit distance <= fuzzy_dist
Expand Down
10 changes: 7 additions & 3 deletions dataprep/eda/create_diff_report/diff_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def format_diff_report(
cfg: Config,
mode: Optional[str],
progress: bool = True,
target: Optional[str] = None
target: Optional[str] = None,
) -> Dict[str, Any]:
"""
Format the data and figures needed by create_diff_report
Expand Down Expand Up @@ -118,6 +118,7 @@ def format_diff_report(
raise ValueError(f"Unknown mode: {mode}")
return report


def validate_target(target: str, df_list: List[pd.DataFrame]):
"""
Helper function, verify that target column exists
Expand All @@ -128,7 +129,8 @@ def validate_target(target: str, df_list: List[pd.DataFrame]):
exists = True
break
if not exists:
raise ValueError(f'Sorry, {target} is not a valid column')
raise ValueError(f"Sorry, {target} is not a valid column")


def format_basic(df_list: List[pd.DataFrame], target: Optional[str], cfg: Config) -> Dict[str, Any]:
"""
Expand Down Expand Up @@ -295,7 +297,9 @@ def compute_plot_data(
elif is_dtype(dtp, DateTime_v1()):
plot_data.append((col, dtp, dask.compute(*datum), orig)) # workaround

return Intermediate(data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list)
return Intermediate(
data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list
)


def _compute_variables(df: EDAFrame, cfg: Config) -> Dict[str, Any]:
Expand Down
62 changes: 41 additions & 21 deletions dataprep/eda/diff/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def bar_viz(
df_labels: List[str],
baseline: int,
target: Optional[str] = None,
df_list: Optional[List[pd.DataFrame]] = None
df_list: Optional[List[pd.DataFrame]] = None,
) -> Figure:
"""
Render a bar chart
Expand Down Expand Up @@ -119,7 +119,7 @@ def bar_viz(
tools="hover",
x_range=list(df[baseline].index),
y_axis_type=yscale,
y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc))
y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc)),
)
row_names = None
offset = np.linspace(-0.08 * len(df), 0.08 * len(df), len(df)) if len(df) > 1 else [0]
Expand Down Expand Up @@ -157,7 +157,7 @@ def bar_viz(

if show_yticks and yscale == "linear":
_format_axis(fig, 0, df[baseline].max(), "y")

df1, df2 = df_list[0], df_list[1]
if target != col and target and col in df1.columns and col in df2.columns:
col1, col2 = df_list[0][col], df_list[1][col]
Expand All @@ -166,12 +166,23 @@ def bar_viz(
for names in row_names:
row_avgs_1.append(df_list[0][target][col1 == names].mean())
row_avgs_2.append(df_list[1][target][col2 == names].mean())

row_avgs_1 = [0 if math.isnan(x) else x for x in row_avgs_1]
row_avgs_2 = [0 if math.isnan(x) else x for x in row_avgs_2]
fig.extra_y_ranges = {"Averages": Range1d(start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc), end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc))}
fig.multi_line([row_names, row_names], [row_avgs_1, row_avgs_2], color=['navy', 'firebrick'], y_range_name="Averages", line_width=4)
fig.add_layout(LinearAxis(y_range_name="Averages"), 'right')
fig.extra_y_ranges = {
"Averages": Range1d(
start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc),
end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc),
)
}
fig.multi_line(
[row_names, row_names],
[row_avgs_1, row_avgs_2],
color=["navy", "firebrick"],
y_range_name="Averages",
line_width=4,
)
fig.add_layout(LinearAxis(y_range_name="Averages"), "right")
return fig


Expand All @@ -186,7 +197,7 @@ def hist_viz(
df_labels: List[str],
orig: Optional[List[str]] = None,
target: Optional[str] = None,
df_list: Optional[List[pd.DataFrame]] = None
df_list: Optional[List[pd.DataFrame]] = None,
) -> Figure:
"""
Render a histogram
Expand Down Expand Up @@ -222,14 +233,13 @@ def hist_viz(
counts_max_2 = max(counts_list[1])

y_start, y_end = min(counts_min_1, counts_min_2), max(counts_max_1, counts_max_2)


fig = Figure(
plot_height=plot_height,
plot_width=plot_width,
title=col,
toolbar_location=None,
y_axis_type=yscale
y_axis_type=yscale,
)
bins_list = []
for i, hst in enumerate(hist):
Expand All @@ -252,7 +262,9 @@ def hist_viz(
bottom = 0 if yscale == "linear" or df.empty else counts.min() / 2
if y_start is not None and y_end is not None:
# fig.y_range = (y_start * (1 - y_inc), y_end * (1 + y_inc))
fig.extra_y_ranges = {"Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc))}
fig.extra_y_ranges = {
"Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc))
}
fig.quad(
source=df,
left="left",
Expand All @@ -262,7 +274,7 @@ def hist_viz(
top="freq",
fill_color=CATEGORY10[i],
line_color=CATEGORY10[i],
y_range_name="Counts"
y_range_name="Counts",
)
else:
fig.quad(
Expand All @@ -273,11 +285,11 @@ def hist_viz(
alpha=0.5,
top="freq",
fill_color=CATEGORY10[i],
line_color=CATEGORY10[i]
line_color=CATEGORY10[i],
)
# if col == 'LotFrontage':
# breakpoint()
# breakpoint()

hover = HoverTool(tooltips=tooltips, attachment="vertical", mode="vline")
fig.add_tools(hover)

Expand Down Expand Up @@ -325,9 +337,17 @@ def hist_viz(
max_range = max(df1_bin_averages + df2_bin_averages)
min_range = min(df1_bin_averages + df2_bin_averages)

fig.extra_y_ranges['Averages'] = Range1d(start=min_range * (1 - y_inc), end=max_range * (1 + y_inc))
fig.multi_line([bins_1, bins_2], [df1_bin_averages, df2_bin_averages], color=['navy', 'firebrick'], y_range_name="Averages", line_width=4)
fig.add_layout(LinearAxis(y_range_name="Averages", axis_label='Bin Averages'), 'right')
fig.extra_y_ranges["Averages"] = Range1d(
start=min_range * (1 - y_inc), end=max_range * (1 + y_inc)
)
fig.multi_line(
[bins_1, bins_2],
[df1_bin_averages, df2_bin_averages],
color=["navy", "firebrick"],
y_range_name="Averages",
line_width=4,
)
fig.add_layout(LinearAxis(y_range_name="Averages", axis_label="Bin Averages"), "right")
return fig


Expand Down Expand Up @@ -678,7 +698,7 @@ def format_num_stats(data: Dict[str, List[Any]]) -> Dict[str, Dict[str, List[Any
descriptive = {
"Mean": data["mean"],
"Standard Deviation": data["std"],
"Variance": [std**2 for std in data["std"]],
"Variance": [std ** 2 for std in data["std"]],
"Sum": [mean * npres for mean, npres in zip(data["mean"], data["npres"])],
"Skewness": [float(skew) for skew in data["skew"]],
"Kurtosis": [float(kurt) for kurt in data["kurt"]],
Expand Down Expand Up @@ -734,7 +754,7 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
df_labels,
baseline if len(df) > 1 else 0,
target,
df_list
df_list,
)
elif is_dtype(dtp, Continuous()):
if cfg.diff.density:
Expand All @@ -753,7 +773,7 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
df_labels,
orig,
target,
df_list
df_list,
)
elif is_dtype(dtp, DateTime()):
df, timeunit = data
Expand Down
10 changes: 5 additions & 5 deletions dataprep/eda/intermediate.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
visual_type = kwargs.pop("visual_type")
super().__init__(**kwargs)
self.visual_type = visual_type
if 'target' in kwargs:
self.target = kwargs.pop('target')
if 'df_list' in kwargs:
self.df_list = kwargs.pop('df_list')
if "target" in kwargs:
self.target = kwargs.pop("target")

if "df_list" in kwargs:
self.df_list = kwargs.pop("df_list")
else:
raise ValueError("Unsupported initialization")

Expand Down

0 comments on commit 5f5ba73

Please sign in to comment.