Pretty printing dataset #3987

Merged: 34 commits merged into main from feature/3980-pretty-printing-dataset on Jul 18, 2024. The diff below reflects changes from 23 of the 34 commits, not the final merged state.

Commits
502f982  Implemented basic __repr__ (ElenaKhaustova, Jul 3, 2024)
9aa704f  Updated __repr__ (ElenaKhaustova, Jul 3, 2024)
45b51e8  Removed __str__ (ElenaKhaustova, Jul 3, 2024)
94e6465  Updated _describe() for CachedDataset (ElenaKhaustova, Jul 3, 2024)
e2884e7  Made pretty_repr protected (ElenaKhaustova, Jul 3, 2024)
248a2c0  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 4, 2024)
aea8ec0  Reverted width parameter to default (ElenaKhaustova, Jul 4, 2024)
659de36  Updated printing params (ElenaKhaustova, Jul 4, 2024)
d1cb03f  Updated printing width (ElenaKhaustova, Jul 4, 2024)
4de32c4  Disable sorting (ElenaKhaustova, Jul 4, 2024)
9e6bae9  Disable sorting (ElenaKhaustova, Jul 4, 2024)
57f9a3f  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 8, 2024)
e61a775  Updated test_str_representation (ElenaKhaustova, Jul 8, 2024)
0f84735  Updated cached dataset tests (ElenaKhaustova, Jul 8, 2024)
3a7e748  Updated data catalog tests (ElenaKhaustova, Jul 8, 2024)
216cb42  Updated core tests (ElenaKhaustova, Jul 8, 2024)
ca42da1  Updated versioned dataset tests (ElenaKhaustova, Jul 8, 2024)
924b53e  Updated tests for lambda dataset (ElenaKhaustova, Jul 8, 2024)
f529c4b  Updated tests for memory dataset (ElenaKhaustova, Jul 8, 2024)
7754078  Set width to maxsize (ElenaKhaustova, Jul 9, 2024)
14f237f  Removed top-level keys sorting (ElenaKhaustova, Jul 9, 2024)
cfac9d6  Updated tests (ElenaKhaustova, Jul 9, 2024)
bfc0841  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 9, 2024)
7190b3c  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 9, 2024)
54964f8  Updated release notes (ElenaKhaustova, Jul 9, 2024)
1b0ae7a  Decoupled describe from pretty printing (ElenaKhaustova, Jul 12, 2024)
c6a8a31  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 12, 2024)
22aec50  Returned old __str__ to avoid a breaking change (ElenaKhaustova, Jul 12, 2024)
28a961d  Updated tests (ElenaKhaustova, Jul 12, 2024)
d5cd26b  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 16, 2024)
d7f50aa  Replaced deprecation comment with TODO (ElenaKhaustova, Jul 16, 2024)
52945dd  Merge branch 'main' into feature/3980-pretty-printing-dataset (noklam, Jul 17, 2024)
2073d74  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 17, 2024)
001e7f7  Merge branch 'main' into feature/3980-pretty-printing-dataset (ElenaKhaustova, Jul 18, 2024)
4 changes: 2 additions & 2 deletions kedro/io/cached_dataset.py
```diff
@@ -94,8 +94,8 @@ def _from_config(config: dict, version: Version | None) -> AbstractDataset:

     def _describe(self) -> dict[str, Any]:
         return {
-            "dataset": self._dataset._describe(),
-            "cache": self._cache._describe(),
+            "dataset": self._dataset._pretty_repr(self._dataset._describe()),
+            "cache": self._cache._pretty_repr(self._cache._describe()),
         }

     def _load(self) -> Any:
```
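Presumably the point of pre-rendering the child datasets is to embed their fully qualified pretty reprs in the parent's description rather than their raw `_describe()` dicts. A minimal sketch under that assumption; the `render` helper and the description values below are illustrative, not the PR's code:

```python
import pprint

# Illustrative child descriptions (not taken from the PR):
raw = {"dataset": {"data": "<int>"}, "cache": {}}
pre_rendered = {
    "dataset": "kedro.io.memory_dataset.MemoryDataset(data='<int>')",
    "cache": "kedro.io.memory_dataset.MemoryDataset()",
}

def render(description: dict) -> str:
    # Mirrors _pretty_repr's per-argument formatting (simplified).
    parts = [
        f"{k}={pprint.pformat(v, sort_dicts=False)}"
        for k, v in description.items()
        if v is not None
    ]
    return f"CachedDataset({', '.join(parts)})"

print(render(raw))
# CachedDataset(dataset={'data': '<int>'}, cache={})
print(render(pre_rendered))
# CachedDataset(dataset="kedro.io.memory_dataset.MemoryDataset(data='<int>')", cache='kedro.io.memory_dataset.MemoryDataset()')
```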
39 changes: 16 additions & 23 deletions kedro/io/core.py
```diff
@@ -7,7 +7,9 @@
 import copy
 import logging
 import os
+import pprint
 import re
+import sys
 import warnings
 from collections import namedtuple
 from datetime import datetime, timezone
@@ -227,32 +229,23 @@ def save(self, data: _DI) -> None:
             message = f"Failed while saving data to data set {str(self)}.\n{str(exc)}"
             raise DatasetError(message) from exc

-    def __str__(self) -> str:
-        def _to_str(obj: Any, is_root: bool = False) -> str:
-            """Returns a string representation where
-            1. The root level (i.e. the Dataset.__init__ arguments) are
-               formatted like Dataset(key=value).
-            2. Dictionaries have the keys alphabetically sorted recursively.
-            3. None values are not shown.
-            """
-
-            fmt = "{}={}" if is_root else "'{}': {}"  # 1
-
-            if isinstance(obj, dict):
-                sorted_dict = sorted(obj.items(), key=lambda pair: str(pair[0]))  # 2
-
-                text = ", ".join(
-                    fmt.format(key, _to_str(value))  # 2
-                    for key, value in sorted_dict
-                    if value is not None  # 3
-                )
-
-                return text if is_root else "{" + text + "}"  # 1
-
-            # not a dictionary
-            return str(obj)
-
-        return f"{type(self).__name__}({_to_str(self._describe(), True)})"
+    def _pretty_repr(self, object_description: dict[str, Any]) -> str:
+        str_keys = []
+        for arg_name, arg_descr in object_description.items():
+            if arg_descr is not None:
+                descr = pprint.pformat(
+                    arg_descr,
+                    sort_dicts=False,
+                    compact=True,
+                    depth=2,
+                    width=sys.maxsize,
+                )
+                str_keys.append(f"{arg_name}={descr}")
+
+        return f"{type(self).__module__}.{type(self).__name__}({', '.join(str_keys)})"
+
+    def __repr__(self) -> str:
+        return self._pretty_repr(self._describe())

     @abc.abstractmethod
     def _load(self) -> _DO:
```
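To see what these `pprint.pformat` arguments do in practice, here is a minimal standalone sketch; the description dict and the dataset name are made up for illustration:

```python
import pprint
import sys

# Made-up description, mimicking what a dataset's _describe() might return.
description = {
    "filepath": "data/01_raw/iris.csv",
    "load_args": {"sep": ",", "dtype": {"a": "int64", "b": "float64"}},
    "save_args": None,  # skipped: None values are not rendered
}

str_keys = []
for arg_name, arg_descr in description.items():
    if arg_descr is not None:
        descr = pprint.pformat(
            arg_descr,
            sort_dicts=False,   # preserve insertion order rather than sorting keys
            compact=True,       # pack short items onto as few lines as possible
            depth=2,            # containers nested deeper than two levels print as '...'
            width=sys.maxsize,  # effectively never wrap lines
        )
        str_keys.append(f"{arg_name}={descr}")

print(f"example.module.ExampleDataset({', '.join(str_keys)})")
# example.module.ExampleDataset(filepath='data/01_raw/iris.csv', load_args={'sep': ',', 'dtype': {'a': 'int64', 'b': 'float64'}})
```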
Review thread on `_pretty_repr` (resolved):

Member: If it comes from kedro_datasets, I feel like it would be much cleaner to have the short name users can provide (i.e. pandas.CSVDataset instead of kedro_datasets.pandas.csv_dataset.CSVDataset). Similarly, from Kedro core, MemoryDataset, etc. seems much easier to read than the full path.

ElenaKhaustova (Contributor, Author), Jul 9, 2024: I agree with your points, but I decided to keep the full path so that __repr__ is as unambiguous as possible and one can easily see where to find the implementation of the printed dataset. Ideally it would be good to have some "short=True" flag for the representation that users could set, but by default I prefer to keep the full module name. Curious what others think.

Member: Happy to hear what others think, but IMO we don't actually use the module name in the docs and examples. The standard way people write entries in the catalog is the "short" version, and I think that should be reflected.

ElenaKhaustova: We could also consider adding __str__ with a short representation, so that __repr__ is the actual representation while __str__ is adapted for printing.

Member: On "the standard way people write entries in the catalog is the 'short' version": most users we've spoken to create their own datasets sooner or later, so people have a mix of short and long names in the catalog. Also, the long name would work even for kedro-datasets, right? The fact that we use the short one is a matter of convention and convenience, but arguably it obscures where the datasets come from, and it may play a role in people not understanding their tracebacks (especially with s vs S). So I'm slightly in favour of keeping the long names for all datasets to avoid introducing an exception to the logic written here, but I don't have a strong opinion. (In either case, no matter what we do, I don't think we should complicate the implementation with a flag.)
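For context, a minimal sketch of the alternative floated in the thread: a short `__str__` for display alongside the fully qualified `__repr__`. This is hypothetical, was not adopted in this PR, and uses plain `repr()` in place of the PR's `pprint.pformat` call:

```python
# Hypothetical sketch, NOT part of this PR.
class ExampleDataset:
    def _describe(self) -> dict:
        return {"data": "<int>", "copy_mode": None}

    def __repr__(self) -> str:  # long form: module path + class name
        parts = [f"{k}={v!r}" for k, v in self._describe().items() if v is not None]
        return f"{type(self).__module__}.{type(self).__name__}({', '.join(parts)})"

    def __str__(self) -> str:  # short form: class name only
        parts = [f"{k}={v!r}" for k, v in self._describe().items() if v is not None]
        return f"{type(self).__name__}({', '.join(parts)})"

print(repr(ExampleDataset()))  # __main__.ExampleDataset(data='<int>')
print(str(ExampleDataset()))   # ExampleDataset(data='<int>')
```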
6 changes: 4 additions & 2 deletions tests/io/test_cached_dataset.py
```diff
@@ -127,8 +127,10 @@ def test_pickle(self, cached_ds, caplog):

     def test_str(self):
         assert (
-            str(CachedDataset(MemoryDataset(42))) == "CachedDataset(cache={}, "
-            "dataset={'data': <int>})"
+            str(CachedDataset(MemoryDataset(42)))
+            == """kedro.io.cached_dataset.CachedDataset("""
+            """dataset="kedro.io.memory_dataset.MemoryDataset(data='<int>')", """
+            """cache='kedro.io.memory_dataset.MemoryDataset()')"""
         )

     def test_release(self, cached_ds):
```
22 changes: 16 additions & 6 deletions tests/io/test_core.py
```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import pprint
 import shutil
 from decimal import Decimal
 from fractions import Fraction
@@ -206,11 +207,18 @@ def dummy_data():
 class TestCoreFunctions:
     @pytest.mark.parametrize("var", [1, True] + FALSE_BUILTINS)
     def test_str_representation(self, var):
-        filepath = "."
-        assert str(MyDataset(var=var)) == f"MyDataset(filepath={filepath}, var={var})"
+        var_str = pprint.pformat(var)
+        filepath_str = pprint.pformat(PurePosixPath("."))
+        assert (
+            str(MyDataset(var=var))
+            == f"tests.io.test_core.MyDataset(filepath={filepath_str}, var={var_str})"
+        )

     def test_str_representation_none(self):
-        assert str(MyDataset()) == "MyDataset(filepath=.)"
+        filepath_str = pprint.pformat(PurePosixPath("."))
+        assert (
+            str(MyDataset()) == f"tests.io.test_core.MyDataset(filepath={filepath_str})"
+        )

     def test_get_filepath_str(self):
         path = get_filepath_str(PurePosixPath("example.com/test.csv"), "http")
@@ -334,7 +342,9 @@ def test_resolve_save_version(self, dummy_data):

     def test_no_versions(self, my_versioned_dataset):
         """Check the error if no versions are available for load."""
-        pattern = r"Did not find any versions for MyVersionedDataset\(.+\)"
+        pattern = (
+            r"Did not find any versions for tests.io.test_core.MyVersionedDataset\(.+\)"
+        )
         with pytest.raises(DatasetError, match=pattern):
             my_versioned_dataset.load()
@@ -369,7 +379,7 @@ def test_prevent_overwrite(self, my_versioned_dataset, dummy_data):
         corresponding json file for a given save version already exists."""
         my_versioned_dataset.save(dummy_data)
         pattern = (
-            r"Save path \'.+\' for MyVersionedDataset\(.+\) must "
+            r"Save path \'.+\' for tests.io.test_core.MyVersionedDataset\(.+\) must "
             r"not exist if versioning is enabled\."
         )
         with pytest.raises(DatasetError, match=pattern):
@@ -389,7 +399,7 @@ def test_save_version_warning(
         pattern = (
             f"Save version '{save_version}' did not match "
             f"load version '{load_version}' for "
-            r"MyVersionedDataset\(.+\)"
+            r"tests.io.test_core.MyVersionedDataset\(.+\)"
         )
         with pytest.warns(UserWarning, match=pattern):
             my_versioned_dataset.save(dummy_data)
```
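As a quick sanity check on the expected strings above: `pprint.pformat` falls back to `repr` for scalars and path objects, which is standard library behavior:

```python
import pprint
from pathlib import PurePosixPath

print(pprint.pformat(PurePosixPath(".")))  # PurePosixPath('.')
print(pprint.pformat(1))                   # 1
print(pprint.pformat(True))                # True
# So the expected value reads e.g.:
# tests.io.test_core.MyDataset(filepath=PurePosixPath('.'), var=1)
```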
2 changes: 1 addition & 1 deletion tests/io/test_data_catalog.py
```diff
@@ -267,7 +267,7 @@ def test_add_all_save_and_load(self, dataset, dummy_dataframe):
     def test_load_error(self, data_catalog):
         """Check the error when attempting to load a data set
         from nonexistent source"""
-        pattern = r"Failed while loading data from data set CSVDataset"
+        pattern = r"Failed while loading data from data set kedro_datasets.pandas.csv_dataset.CSVDataset"
         with pytest.raises(DatasetError, match=pattern):
             data_catalog.load("test")
```
27 changes: 15 additions & 12 deletions tests/io/test_lambda_dataset.py
```diff
@@ -28,25 +28,28 @@ def _dummy_exists():
     def _dummy_release():
         pass  # pragma: no cover

-    assert "LambdaDataset(load=<tests.io.test_lambda_dataset._dummy_load>)" in str(
-        LambdaDataset(_dummy_load, None)
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(load='<tests.io.test_lambda_dataset._dummy_load>')"
+        in str(LambdaDataset(_dummy_load, None))
     )
-    assert "LambdaDataset(save=<tests.io.test_lambda_dataset._dummy_save>)" in str(
-        LambdaDataset(None, _dummy_save)
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(save='<tests.io.test_lambda_dataset._dummy_save>')"
+        in str(LambdaDataset(None, _dummy_save))
     )
-    assert "LambdaDataset(exists=<tests.io.test_lambda_dataset._dummy_exists>)" in str(
-        LambdaDataset(None, None, _dummy_exists)
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(exists='<tests.io.test_lambda_dataset._dummy_exists>')"
+        in str(LambdaDataset(None, None, _dummy_exists))
     )
     assert (
-        "LambdaDataset(release=<tests.io.test_lambda_dataset._dummy_release>)"
+        "kedro.io.lambda_dataset.LambdaDataset(release='<tests.io.test_lambda_dataset._dummy_release>')"
         in str(LambdaDataset(None, None, None, _dummy_release))
     )

-    # __init__ keys alphabetically sorted, None values not shown
+    # __init__ keys remain in the provided order, None values not shown
     expected = (
-        "LambdaDataset(exists=<tests.io.test_lambda_dataset._dummy_exists>, "
-        "load=<tests.io.test_lambda_dataset._dummy_load>, "
-        "save=<tests.io.test_lambda_dataset._dummy_save>)"
+        "kedro.io.lambda_dataset.LambdaDataset(load='<tests.io.test_lambda_dataset._dummy_load>', "
+        "save='<tests.io.test_lambda_dataset._dummy_save>', "
+        "exists='<tests.io.test_lambda_dataset._dummy_exists>')"
     )
     actual = str(LambdaDataset(_dummy_load, _dummy_save, _dummy_exists, None))
     assert actual == expected
@@ -103,7 +106,7 @@ def test_save_raises_error(self, mocked_save, mocked_dataset):
         mocked_save.side_effect = FileExistsError(error_message)

         pattern = (
-            r"Failed while saving data to data set LambdaDataset\(.+\)\.\n"
+            r"Failed while saving data to data set kedro.io.lambda_dataset.LambdaDataset\(.+\)\.\n"
             + error_message
         )
         with pytest.raises(DatasetError, match=pattern):
```
10 changes: 8 additions & 2 deletions tests/io/test_memory_dataset.py
```diff
@@ -141,8 +141,14 @@ def test_saving_none(self):
     @pytest.mark.parametrize(
         "input_data,expected",
         [
-            ("dummy_dataframe", "MemoryDataset(data=<DataFrame>)"),
-            ("dummy_numpy_array", "MemoryDataset(data=<ndarray>)"),
+            (
+                "dummy_dataframe",
+                "kedro.io.memory_dataset.MemoryDataset(data='<DataFrame>')",
+            ),
+            (
+                "dummy_numpy_array",
+                "kedro.io.memory_dataset.MemoryDataset(data='<ndarray>')",
+            ),
         ],
         indirect=["input_data"],
     )
```