From dcac4c5306ad1f6d7e6ff629f313b268cb5152dc Mon Sep 17 00:00:00 2001 From: Samhita Alla Date: Tue, 7 Nov 2023 15:22:57 +0530 Subject: [PATCH] second iteration: docs restructure (#1214) * update examples in the data types and io section Signed-off-by: Samhita Alla * image spec Signed-off-by: Samhita Alla * lint; move files and folders Signed-off-by: Samhita Alla * isort Signed-off-by: Samhita Alla * add torch dep Signed-off-by: Samhita Alla * mnist fix Signed-off-by: Samhita Alla * incorporate suggestions by Ketan Signed-off-by: Samhita Alla --------- Signed-off-by: Samhita Alla --- .github/workflows/checks.yml | 11 +- dev-requirements.in | 1 + dev-requirements.txt | 147 +++-- docs-requirements.in | 4 +- docs-requirements.txt | 291 ++++----- docs/getting_started/data_engineering.md | 2 - docs/getting_started/extending_flyte.md | 9 +- docs/getting_started/machine_learning.md | 3 +- examples/advanced_composition/README.md | 2 - examples/basics/Dockerfile | 4 +- examples/basics/basics/task.py | 2 +- .../customizing_dependencies/image_spec.py | 5 + examples/data_types_and_io/Dockerfile | 15 +- examples/data_types_and_io/README.md | 147 ++++- .../data_types_and_io/custom_objects.py | 159 ----- .../data_types_and_io/dataclass.py | 143 +++++ .../data_types_and_io/dataclass_input.json | 1 + .../data_types_and_io/enum_type.py | 74 +++ .../data_types_and_io/enums.py | 66 -- .../data_types_and_io/file.py} | 43 +- .../data_types_and_io/flyte_pickle.py | 94 --- .../data_types_and_io/flyte_python_types.py | 101 --- .../data_types_and_io/folder.py} | 56 +- .../data_types_and_io/pickle_type.py | 105 +++ .../{pytorch_types.py => pytorch_type.py} | 66 +- .../data_types_and_io/schema.py | 70 -- .../data_types_and_io/structured_dataset.py | 328 ++++++---- .../data_types_and_io/typed_schema.py | 69 -- examples/data_types_and_io/requirements.in | 6 - examples/data_types_and_io/requirements.txt | 607 ------------------ .../development_lifecycle/decks.py | 3 +- examples/extending/README.md | 2 +- examples/k8s_dask_plugin/README.md | 5 +- .../k8s_dask_plugin/dask_example.py | 3 +- examples/k8s_spark_plugin/README.md | 7 +- .../k8s_spark_plugin/dataframe_passing.py | 7 +- .../k8s_spark_plugin/pyspark_pi.py | 3 +- examples/kfpytorch_plugin/README.md | 3 +- .../kfpytorch_plugin/pytorch_mnist.py | 3 +- examples/kftensorflow_plugin/README.md | 3 +- .../kftensorflow_plugin/tf_mnist.py | 3 +- examples/mnist_classifier/Dockerfile | 6 - examples/pandera_plugin/README.md | 4 +- examples/ray_plugin/README.md | 5 +- examples/ray_plugin/ray_plugin/ray_example.py | 3 +- 45 files changed, 992 insertions(+), 1699 deletions(-) delete mode 100644 examples/data_types_and_io/data_types_and_io/custom_objects.py create mode 100644 examples/data_types_and_io/data_types_and_io/dataclass.py create mode 100644 examples/data_types_and_io/data_types_and_io/dataclass_input.json create mode 100644 examples/data_types_and_io/data_types_and_io/enum_type.py delete mode 100644 examples/data_types_and_io/data_types_and_io/enums.py rename examples/{advanced_composition/advanced_composition/files.py => data_types_and_io/data_types_and_io/file.py} (76%) delete mode 100644 examples/data_types_and_io/data_types_and_io/flyte_pickle.py delete mode 100644 examples/data_types_and_io/data_types_and_io/flyte_python_types.py rename examples/{advanced_composition/advanced_composition/folders.py => data_types_and_io/data_types_and_io/folder.py} (74%) create mode 100644 examples/data_types_and_io/data_types_and_io/pickle_type.py rename 
examples/data_types_and_io/data_types_and_io/{pytorch_types.py => pytorch_type.py} (62%) delete mode 100644 examples/data_types_and_io/data_types_and_io/schema.py delete mode 100644 examples/data_types_and_io/data_types_and_io/typed_schema.py delete mode 100644 examples/data_types_and_io/requirements.in delete mode 100644 examples/data_types_and_io/requirements.txt diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 81b8840f3..60fa7fb77 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -171,8 +171,8 @@ jobs: e2e-tests: runs-on: ubuntu-latest env: - FLYTESNACKS_PRIORITIES: 'P0' - FLYTESNACKS_VERSION: '' + FLYTESNACKS_PRIORITIES: "P0" + FLYTESNACKS_VERSION: "" timeout-minutes: 30 steps: - name: Set latest Flytesnacks release @@ -185,7 +185,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.11' + python-version: "3.11" - uses: unionai/flytectl-setup-action@v0.0.1 - name: Setup sandbox run: | @@ -220,9 +220,8 @@ jobs: advanced_composition/advanced_composition/dynamics.py \ advanced_composition/advanced_composition/map_task.py \ advanced_composition/advanced_composition/subworkflows.py \ - data_types_and_io/data_types_and_io/custom_objects.py \ - data_types_and_io/data_types_and_io/schema.py \ - data_types_and_io/data_types_and_io/typed_schema.py ; + data_types_and_io/data_types_and_io/dataclass.py \ + data_types_and_io/data_types_and_io/structured_dataset.py ; do pyflyte --config ./boilerplate/flyte/end2end/functional-test-config.yaml \ register \ diff --git a/dev-requirements.in b/dev-requirements.in index 46cab6e76..996558ba2 100644 --- a/dev-requirements.in +++ b/dev-requirements.in @@ -10,3 +10,4 @@ isort mock pytest mypy +mashumaro diff --git a/dev-requirements.txt b/dev-requirements.txt index da483aa05..174553f41 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -4,11 +4,11 @@ # # pip-compile dev-requirements.in # -adlfs==2023.8.0 +adlfs==2023.10.0 # via flytekit -aiobotocore==2.5.4 +aiobotocore==2.7.0 # via s3fs -aiohttp==3.8.5 +aiohttp==3.8.6 # via # adlfs # aiobotocore @@ -18,38 +18,38 @@ aioitertools==0.11.0 # via aiobotocore aiosignal==1.3.1 # via aiohttp -arrow==1.2.3 +arrow==1.3.0 # via cookiecutter async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via aiohttp -azure-core==1.29.3 +azure-core==1.29.5 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.14.0 +azure-identity==1.15.0 # via adlfs -azure-storage-blob==12.17.0 +azure-storage-blob==12.18.3 # via adlfs binaryornot==0.4.4 # via cookiecutter -black==22.3.0 +black==23.10.1 # via # -r dev-requirements.in # flake8-black -botocore==1.31.17 +botocore==1.31.64 # via aiobotocore -cachetools==5.3.1 +cachetools==5.3.2 # via google-auth certifi==2023.7.22 # via # kubernetes # requests -cffi==1.15.1 +cffi==1.16.0 # via # azure-datalake-store # cryptography @@ -57,7 +57,7 @@ cfgv==3.4.0 # via pre-commit chardet==5.2.0 # via binaryornot -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 # via # aiohttp # requests @@ -67,31 +67,30 @@ click==8.1.7 # cookiecutter # flytekit # rich-click -cloudpickle==2.2.1 +cloudpickle==3.0.0 # via flytekit -codespell==2.2.5 +codespell==2.2.6 # via -r dev-requirements.in -cookiecutter==2.3.0 +cookiecutter==2.4.0 # via flytekit -coverage==7.3.0 +coverage==7.3.2 # via -r dev-requirements.in -croniter==1.4.1 +croniter==2.0.1 # via flytekit -cryptography==41.0.3 +cryptography==41.0.5 # via # azure-identity # azure-storage-blob # msal # pyjwt 
# pyopenssl - # secretstorage dataclasses-json==0.5.9 # via flytekit decorator==5.1.1 # via gcsfs deprecated==1.2.14 # via flytekit -diskcache==5.6.1 +diskcache==5.6.3 # via flytekit distlib==0.3.7 # via virtualenv @@ -101,7 +100,7 @@ docker-image-py==0.1.12 # via flytekit docstring-parser==0.15 # via flytekit -filelock==3.12.2 +filelock==3.13.1 # via virtualenv flake8==6.1.0 # via @@ -110,33 +109,33 @@ flake8==6.1.0 # flake8-isort flake8-black==0.3.6 # via -r dev-requirements.in -flake8-isort==6.0.0 +flake8-isort==6.1.0 # via -r dev-requirements.in -flyteidl==1.5.16 +flyteidl==1.10.0 # via flytekit -flytekit==1.9.0 +flytekit==1.10.0 # via -r dev-requirements.in frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.6.0 +fsspec==2023.10.0 # via # adlfs # flytekit # gcsfs # s3fs -gcsfs==2023.6.0 +gcsfs==2023.10.0 # via flytekit -gitdb==4.0.10 +gitdb==4.0.11 # via gitpython -gitpython==3.1.35 +gitpython==3.1.40 # via flytekit -google-api-core==2.11.1 +google-api-core==2.12.0 # via # google-cloud-core # google-cloud-storage -google-auth==2.22.0 +google-auth==2.23.4 # via # gcsfs # google-api-core @@ -144,29 +143,31 @@ google-auth==2.22.0 # google-cloud-core # google-cloud-storage # kubernetes -google-auth-oauthlib==1.0.0 +google-auth-oauthlib==1.1.0 # via gcsfs google-cloud-core==2.3.3 # via google-cloud-storage -google-cloud-storage==2.10.0 +google-cloud-storage==2.13.0 # via gcsfs google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.5.0 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.6.0 # via google-cloud-storage -googleapis-common-protos==1.60.0 +googleapis-common-protos==1.61.0 # via # flyteidl # flytekit # google-api-core # grpcio-status -grpcio==1.53.0 +grpcio==1.59.2 # via # flytekit # grpcio-status -grpcio-status==1.53.0 +grpcio-status==1.59.2 # via flytekit -identify==2.5.27 +identify==2.5.31 # via pre-commit idna==3.4 # via @@ -186,19 +187,17 @@ isort==5.12.0 # flake8-isort jaraco-classes==3.3.0 # via keyring -jeepney==0.8.0 - # via - # keyring - # secretstorage jinja2==3.1.2 # via cookiecutter jmespath==1.0.1 # via botocore joblib==1.3.2 # via flytekit +jsonpickle==3.0.2 + # via flytekit keyring==24.2.0 # via flytekit -kubernetes==27.2.0 +kubernetes==28.1.0 # via flytekit markdown-it-py==3.0.0 # via rich @@ -215,6 +214,10 @@ marshmallow-enum==1.5.1 # flytekit marshmallow-jsonschema==0.13.0 # via flytekit +mashumaro==3.10 + # via + # -r dev-requirements.in + # flytekit mccabe==0.7.0 # via flake8 mdurl==0.1.2 @@ -223,7 +226,7 @@ mock==5.1.0 # via -r dev-requirements.in more-itertools==10.1.0 # via jaraco-classes -msal==1.23.0 +msal==1.24.1 # via # azure-datalake-store # azure-identity @@ -234,7 +237,7 @@ multidict==6.0.4 # via # aiohttp # yarl -mypy==1.5.1 +mypy==1.6.1 # via -r dev-requirements.in mypy-extensions==1.0.0 # via @@ -245,7 +248,7 @@ natsort==8.4.0 # via flytekit nodeenv==1.8.0 # via pre-commit -numpy==1.25.2 +numpy==1.26.1 # via # flytekit # pandas @@ -254,8 +257,9 @@ oauthlib==3.2.2 # via # kubernetes # requests-oauthlib -packaging==23.1 +packaging==23.2 # via + # black # docker # marshmallow # pytest @@ -263,17 +267,17 @@ pandas==1.5.3 # via flytekit pathspec==0.11.2 # via black -platformdirs==3.10.0 +platformdirs==3.11.0 # via # black # virtualenv -pluggy==1.2.0 +pluggy==1.3.0 # via pytest -portalocker==2.7.0 +portalocker==2.8.2 # via msal-extensions -pre-commit==3.3.3 +pre-commit==3.5.0 # via -r dev-requirements.in -protobuf==4.24.1 +protobuf==4.25.0 # via # flyteidl # google-api-core @@ -290,7 
+294,7 @@ pyasn1==0.5.0 # rsa pyasn1-modules==0.3.0 # via google-auth -pycodestyle==2.11.0 +pycodestyle==2.11.1 # via flake8 pycparser==2.21 # via cffi @@ -300,9 +304,9 @@ pygments==2.16.1 # via rich pyjwt[crypto]==2.8.0 # via msal -pyopenssl==23.2.0 +pyopenssl==23.3.0 # via flytekit -pytest==7.4.0 +pytest==7.4.3 # via -r dev-requirements.in python-dateutil==2.8.2 # via @@ -318,8 +322,9 @@ python-slugify==8.0.1 # via cookiecutter pytimeparse==1.1.8 # via flytekit -pytz==2023.3 +pytz==2023.3.post1 # via + # croniter # flytekit # pandas pyyaml==6.0.1 @@ -328,7 +333,7 @@ pyyaml==6.0.1 # flytekit # kubernetes # pre-commit -regex==2023.8.8 +regex==2023.10.3 # via docker-image-py requests==2.31.0 # via @@ -347,27 +352,24 @@ requests-oauthlib==1.3.1 # via # google-auth-oauthlib # kubernetes -rich==13.5.2 +rich==13.6.0 # via # cookiecutter # flytekit # rich-click -rich-click==1.6.1 +rich-click==1.7.1 # via flytekit rsa==4.9 # via google-auth -s3fs==2023.6.0 +s3fs==2023.10.0 # via flytekit -secretstorage==3.3.3 - # via keyring six==1.16.0 # via # azure-core - # google-auth # isodate # kubernetes # python-dateutil -smmap==5.0.0 +smmap==5.0.1 # via gitdb sortedcontainers==2.4.0 # via flytekit @@ -375,30 +377,33 @@ statsd==3.3.0 # via flytekit text-unidecode==1.3 # via python-slugify -typing-extensions==4.7.1 +types-python-dateutil==2.8.19.14 + # via arrow +typing-extensions==4.8.0 # via # azure-core # azure-storage-blob # flytekit + # mashumaro # mypy + # rich-click # typing-inspect typing-inspect==0.9.0 # via dataclasses-json -urllib3==1.26.16 +urllib3==1.26.18 # via # botocore # docker # flytekit - # google-auth # kubernetes # requests -virtualenv==20.24.3 +virtualenv==20.24.6 # via pre-commit -websocket-client==1.6.2 +websocket-client==1.6.4 # via # docker # kubernetes -wheel==0.41.2 +wheel==0.41.3 # via flytekit wrapt==1.15.0 # via @@ -407,7 +412,7 @@ wrapt==1.15.0 # flytekit yarl==1.9.2 # via aiohttp -zipp==3.16.2 +zipp==3.17.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/docs-requirements.in b/docs-requirements.in index 682bc9767..12ae5ab1c 100644 --- a/docs-requirements.in +++ b/docs-requirements.in @@ -1,4 +1,6 @@ flytekit +fsspec<2023.10.0 +mashumaro git+https://github.com/flyteorg/furo@main pillow psycopg2-binary @@ -24,6 +26,6 @@ grpcio grpcio-status myst-nb ipython!=8.7.0 -sphinx-tags +sphinx-tags==0.2.1 scikit-learn torch diff --git a/docs-requirements.txt b/docs-requirements.txt index 1e7c0e670..b70cc0a77 100644 --- a/docs-requirements.txt +++ b/docs-requirements.txt @@ -4,11 +4,11 @@ # # pip-compile docs-requirements.in # -adlfs==2023.8.0 +adlfs==2023.10.0 # via flytekit aiobotocore==2.5.4 # via s3fs -aiohttp==3.8.5 +aiohttp==3.8.6 # via # adlfs # aiobotocore @@ -20,13 +20,19 @@ aiosignal==1.3.1 # via aiohttp alabaster==0.7.13 # via sphinx -arrow==1.2.3 +annotated-types==0.6.0 + # via pydantic +appnope==0.1.3 + # via + # ipykernel + # ipython +arrow==1.3.0 # via cookiecutter -astroid==2.15.6 +astroid==3.0.1 # via # -r docs-requirements.in # sphinx-autoapi -asttokens==2.2.1 +asttokens==2.4.1 # via stack-data async-timeout==4.0.3 # via aiohttp @@ -37,21 +43,19 @@ attrs==23.1.0 # jupyter-cache # referencing # visions -azure-core==1.29.3 +azure-core==1.29.5 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.14.0 +azure-identity==1.15.0 # via adlfs -azure-storage-blob==12.17.0 +azure-storage-blob==12.18.3 # via adlfs -babel==2.12.1 +babel==2.13.1 # via sphinx 
-backcall==0.2.0 - # via ipython beautifulsoup4==4.12.2 # via # furo @@ -60,13 +64,13 @@ binaryornot==0.4.4 # via cookiecutter botocore==1.31.17 # via aiobotocore -cachetools==5.3.1 +cachetools==5.3.2 # via google-auth certifi==2023.7.22 # via # kubernetes # requests -cffi==1.15.1 +cffi==1.16.0 # via # azure-datalake-store # cryptography @@ -74,7 +78,7 @@ cfgv==3.4.0 # via pre-commit chardet==5.2.0 # via binaryornot -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 # via # aiohttp # requests @@ -84,37 +88,34 @@ click==8.1.7 # flytekit # jupyter-cache # rich-click -cloudpickle==2.2.1 +cloudpickle==3.0.0 # via # flytekit # flytekitplugins-kfpytorch -cmake==3.27.4.1 - # via triton comm==0.1.4 # via # ipykernel # ipywidgets -contourpy==1.1.0 +contourpy==1.1.1 # via matplotlib -cookiecutter==2.3.0 +cookiecutter==2.4.0 # via flytekit -croniter==1.4.1 +croniter==2.0.1 # via flytekit -cryptography==41.0.3 +cryptography==41.0.5 # via # azure-identity # azure-storage-blob # msal # pyjwt # pyopenssl - # secretstorage -cycler==0.11.0 +cycler==0.12.1 # via matplotlib dacite==1.8.1 # via ydata-profiling dataclasses-json==0.5.9 # via flytekit -debugpy==1.6.7.post1 +debugpy==1.8.0 # via ipykernel decorator==5.1.1 # via @@ -122,7 +123,7 @@ decorator==5.1.1 # ipython deprecated==1.2.14 # via flytekit -diskcache==5.6.1 +diskcache==5.6.3 # via flytekit distlib==0.3.7 # via virtualenv @@ -139,56 +140,57 @@ docutils==0.17.1 # sphinx-panels # sphinx-rtd-theme # sphinx-tabs -executing==1.2.0 +executing==2.0.1 # via stack-data -fastjsonschema==2.18.0 +fastjsonschema==2.18.1 # via nbformat -filelock==3.12.2 +filelock==3.13.1 # via # torch - # triton # virtualenv -flyteidl==1.5.16 +flyteidl==1.10.0 # via # flytekit # flytekitplugins-kfpytorch -flytekit==1.9.0 +flytekit==1.10.0 # via # -r docs-requirements.in # flytekitplugins-deck-standard # flytekitplugins-kfpytorch # flytekitplugins-sqlalchemy -flytekitplugins-deck-standard==1.9.0 +flytekitplugins-deck-standard==1.10.0 # via -r docs-requirements.in -flytekitplugins-kfpytorch==1.9.0 +flytekitplugins-kfpytorch==1.10.0 # via -r docs-requirements.in -flytekitplugins-sqlalchemy==1.9.0 +flytekitplugins-sqlalchemy==1.10.0 # via -r docs-requirements.in -fonttools==4.42.1 +fonttools==4.43.1 # via matplotlib frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.6.0 +fsspec==2023.9.2 # via + # -r docs-requirements.in # adlfs # flytekit # gcsfs # s3fs + # torch furo @ git+https://github.com/flyteorg/furo@main # via -r docs-requirements.in -gcsfs==2023.6.0 +gcsfs==2023.9.2 # via flytekit -gitdb==4.0.10 +gitdb==4.0.11 # via gitpython -gitpython==3.1.35 +gitpython==3.1.40 # via flytekit -google-api-core==2.11.1 +google-api-core==2.12.0 # via # google-cloud-core # google-cloud-storage -google-auth==2.22.0 +google-auth==2.23.4 # via # gcsfs # google-api-core @@ -196,36 +198,36 @@ google-auth==2.22.0 # google-cloud-core # google-cloud-storage # kubernetes -google-auth-oauthlib==1.0.0 +google-auth-oauthlib==1.1.0 # via gcsfs google-cloud-core==2.3.3 # via google-cloud-storage -google-cloud-storage==2.10.0 +google-cloud-storage==2.13.0 # via gcsfs google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.5.0 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.6.0 # via google-cloud-storage -googleapis-common-protos==1.60.0 +googleapis-common-protos==1.61.0 # via # flyteidl # flytekit # google-api-core # grpcio-status -greenlet==2.0.2 - # via sqlalchemy -grpcio==1.53.0 +grpcio==1.59.2 # via # -r docs-requirements.in # 
flytekit # grpcio-status -grpcio-status==1.53.0 +grpcio-status==1.59.2 # via # -r docs-requirements.in # flytekit htmlmin==0.1.12 # via ydata-profiling -identify==2.5.27 +identify==2.5.31 # via pre-commit idna==3.4 # via @@ -243,26 +245,22 @@ importlib-metadata==6.8.0 # jupyter-cache # keyring # myst-nb -ipykernel==6.25.1 +ipykernel==6.26.0 # via myst-nb -ipython==8.14.0 +ipython==8.17.2 # via # -r docs-requirements.in # ipykernel # ipywidgets # myst-nb -ipywidgets==8.1.0 +ipywidgets==8.1.1 # via flytekitplugins-deck-standard isodate==0.6.1 # via azure-storage-blob jaraco-classes==3.3.0 # via keyring -jedi==0.19.0 +jedi==0.19.1 # via ipython -jeepney==0.8.0 - # via - # keyring - # secretstorage jinja2==3.0.3 # via # cookiecutter @@ -279,37 +277,37 @@ joblib==1.3.2 # flytekit # phik # scikit-learn -jsonschema==4.19.0 +jsonpickle==3.0.2 + # via flytekit +jsonschema==4.19.2 # via nbformat jsonschema-specifications==2023.7.1 # via jsonschema jupyter-cache==0.6.1 # via myst-nb -jupyter-client==8.3.0 +jupyter-client==8.5.0 # via # ipykernel # nbclient -jupyter-core==5.3.1 +jupyter-core==5.5.0 # via # ipykernel # jupyter-client # nbclient # nbformat -jupyterlab-widgets==3.0.8 +jupyterlab-widgets==3.0.9 # via ipywidgets -jupytext==1.15.0 +jupytext==1.15.2 # via -r docs-requirements.in keyring==24.2.0 # via flytekit -kiwisolver==1.4.4 +kiwisolver==1.4.5 # via matplotlib -kubernetes==27.2.0 +kubernetes==28.1.0 # via flytekit -lazy-object-proxy==1.9.0 - # via astroid -lit==16.0.6 - # via triton -markdown==3.4.4 +llvmlite==0.41.1 + # via numba +markdown==3.5.1 # via flytekitplugins-deck-standard markdown-it-py==2.2.0 # via @@ -330,7 +328,11 @@ marshmallow-enum==1.5.1 # flytekit marshmallow-jsonschema==0.13.0 # via flytekit -matplotlib==3.7.2 +mashumaro==3.10 + # via + # -r docs-requirements.in + # flytekit +matplotlib==3.7.3 # via # phik # seaborn @@ -350,7 +352,7 @@ more-itertools==10.1.0 # via jaraco-classes mpmath==1.3.0 # via sympy -msal==1.23.0 +msal==1.24.1 # via # azure-datalake-store # azure-identity @@ -361,7 +363,7 @@ multidict==6.0.4 # via # aiohttp # yarl -multimethod==1.9.1 +multimethod==1.10 # via # visions # ydata-profiling @@ -383,20 +385,23 @@ nbformat==5.9.2 # jupytext # myst-nb # nbclient -nest-asyncio==1.5.7 +nest-asyncio==1.5.8 # via ipykernel -networkx==3.1 +networkx==3.2.1 # via # torch # visions nodeenv==1.8.0 # via pre-commit -numpy==1.23.5 +numba==0.58.1 + # via ydata-profiling +numpy==1.25.2 # via # contourpy # flytekit # imagehash # matplotlib + # numba # pandas # patsy # phik @@ -409,36 +414,11 @@ numpy==1.23.5 # visions # wordcloud # ydata-profiling -nvidia-cublas-cu11==11.10.3.66 - # via - # nvidia-cudnn-cu11 - # nvidia-cusolver-cu11 - # torch -nvidia-cuda-cupti-cu11==11.7.101 - # via torch -nvidia-cuda-nvrtc-cu11==11.7.99 - # via torch -nvidia-cuda-runtime-cu11==11.7.99 - # via torch -nvidia-cudnn-cu11==8.5.0.96 - # via torch -nvidia-cufft-cu11==10.9.0.58 - # via torch -nvidia-curand-cu11==10.2.10.91 - # via torch -nvidia-cusolver-cu11==11.4.0.1 - # via torch -nvidia-cusparse-cu11==11.7.4.91 - # via torch -nvidia-nccl-cu11==2.14.3 - # via torch -nvidia-nvtx-cu11==11.7.91 - # via torch oauthlib==3.2.2 # via # kubernetes # requests-oauthlib -packaging==23.1 +packaging==23.2 # via # docker # ipykernel @@ -463,28 +443,26 @@ pexpect==4.8.0 # via ipython phik==0.12.3 # via ydata-profiling -pickleshare==0.7.5 - # via ipython -pillow==10.0.0 +pillow==10.1.0 # via # -r docs-requirements.in # imagehash # matplotlib # visions # wordcloud -platformdirs==3.10.0 +platformdirs==3.11.0 
# via # jupyter-core # virtualenv -plotly==5.16.1 +plotly==5.18.0 # via flytekitplugins-deck-standard -portalocker==2.7.0 +portalocker==2.8.2 # via msal-extensions -pre-commit==3.3.3 +pre-commit==3.5.0 # via sphinx-tags prompt-toolkit==3.0.39 # via ipython -protobuf==4.24.1 +protobuf==4.25.0 # via # flyteidl # google-api-core @@ -493,9 +471,9 @@ protobuf==4.24.1 # protoc-gen-swagger protoc-gen-swagger==0.1.0 # via flyteidl -psutil==5.9.5 +psutil==5.9.6 # via ipykernel -psycopg2-binary==2.9.7 +psycopg2-binary==2.9.9 # via -r docs-requirements.in ptyprocess==0.7.0 # via pexpect @@ -511,8 +489,10 @@ pyasn1-modules==0.3.0 # via google-auth pycparser==2.21 # via cffi -pydantic==1.10.12 +pydantic==2.4.2 # via ydata-profiling +pydantic-core==2.10.1 + # via pydantic pygments==2.16.1 # via # furo @@ -523,9 +503,9 @@ pygments==2.16.1 # sphinx-tabs pyjwt[crypto]==2.8.0 # via msal -pyopenssl==23.2.0 +pyopenssl==23.3.0 # via flytekit -pyparsing==3.0.9 +pyparsing==3.1.1 # via matplotlib python-dateutil==2.8.2 # via @@ -543,8 +523,9 @@ python-slugify==8.0.1 # via cookiecutter pytimeparse==1.1.8 # via flytekit -pytz==2023.3 +pytz==2023.3.post1 # via + # croniter # flytekit # pandas pywavelets==1.4.1 @@ -569,7 +550,7 @@ referencing==0.30.2 # via # jsonschema # jsonschema-specifications -regex==2023.8.8 +regex==2023.10.3 # via docker-image-py requests==2.31.0 # via @@ -591,24 +572,24 @@ requests-oauthlib==1.3.1 # via # google-auth-oauthlib # kubernetes -rich==13.5.2 +rich==13.6.0 # via # cookiecutter # flytekit # rich-click -rich-click==1.6.1 +rich-click==1.7.1 # via flytekit -rpds-py==0.9.2 +rpds-py==0.10.6 # via # jsonschema # referencing rsa==4.9 # via google-auth -s3fs==2023.6.0 +s3fs==2023.9.2 # via flytekit -scikit-learn==1.3.0 +scikit-learn==1.3.2 # via -r docs-requirements.in -scipy==1.11.2 +scipy==1.11.3 # via # imagehash # phik @@ -617,26 +598,23 @@ scipy==1.11.2 # ydata-profiling seaborn==0.12.2 # via ydata-profiling -secretstorage==3.3.3 - # via keyring six==1.16.0 # via # asttokens # azure-core - # google-auth # isodate # kubernetes # patsy # python-dateutil # sphinx-code-include # sphinxext-remoteliteralinclude -smmap==5.0.0 +smmap==5.0.1 # via gitdb snowballstemmer==2.2.0 # via sphinx sortedcontainers==2.4.0 # via flytekit -soupsieve==2.4.1 +soupsieve==2.5 # via beautifulsoup4 sphinx==4.5.0 # via @@ -696,15 +674,15 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sphinxcontrib-youtube==1.2.0 +sphinxcontrib-youtube==1.3.0 # via -r docs-requirements.in sphinxext-remoteliteralinclude==0.4.0 # via -r docs-requirements.in -sqlalchemy==2.0.20 +sqlalchemy==2.0.22 # via # flytekitplugins-sqlalchemy # jupyter-cache -stack-data==0.6.2 +stack-data==0.6.3 # via ipython statsd==3.3.0 # via flytekit @@ -724,17 +702,15 @@ threadpoolctl==3.2.0 # via scikit-learn toml==0.10.2 # via jupytext -torch==2.0.1 - # via - # -r docs-requirements.in - # triton +torch==2.1.0 + # via -r docs-requirements.in tornado==6.3.3 # via # ipykernel # jupyter-client tqdm==4.66.1 # via ydata-profiling -traitlets==5.9.0 +traitlets==5.13.0 # via # comm # ipykernel @@ -745,67 +721,62 @@ traitlets==5.9.0 # matplotlib-inline # nbclient # nbformat -triton==2.0.0 - # via torch -typeguard==2.13.3 +typeguard==4.1.5 # via ydata-profiling -typing-extensions==4.7.1 +types-python-dateutil==2.8.19.14 + # via arrow +typing-extensions==4.8.0 # via # azure-core # azure-storage-blob # flytekit + # mashumaro # myst-nb # myst-parser # pydantic + # pydantic-core + # rich-click # sqlalchemy # torch + # 
typeguard # typing-inspect typing-inspect==0.9.0 # via dataclasses-json -unidecode==1.3.6 +unidecode==1.3.7 # via sphinx-autoapi -urllib3==1.26.16 +urllib3==1.26.18 # via # botocore # docker # flytekit - # google-auth # kubernetes # requests -virtualenv==20.24.3 +virtualenv==20.24.6 # via pre-commit visions[type_image_path]==0.7.5 # via ydata-profiling -wcwidth==0.2.6 +wcwidth==0.2.9 # via prompt-toolkit -websocket-client==1.6.2 +websocket-client==1.6.4 # via # docker # kubernetes -wheel==0.41.2 - # via - # flytekit - # nvidia-cublas-cu11 - # nvidia-cuda-cupti-cu11 - # nvidia-cuda-runtime-cu11 - # nvidia-curand-cu11 - # nvidia-cusparse-cu11 - # nvidia-nvtx-cu11 -widgetsnbextension==4.0.8 +wheel==0.41.3 + # via flytekit +widgetsnbextension==4.0.9 # via ipywidgets wordcloud==1.9.2 # via ydata-profiling wrapt==1.15.0 # via # aiobotocore - # astroid # deprecated # flytekit yarl==1.9.2 # via aiohttp -ydata-profiling==4.5.1 +ydata-profiling==4.6.1 # via flytekitplugins-deck-standard -zipp==3.16.2 +zipp==3.17.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/docs/getting_started/data_engineering.md b/docs/getting_started/data_engineering.md index 713f15ef1..031ae46fc 100644 --- a/docs/getting_started/data_engineering.md +++ b/docs/getting_started/data_engineering.md @@ -77,7 +77,6 @@ task's resource requirements with the {py:class}`~flytekit.Resources` object. If those assumptions ever change we can update the resource request here, or override it at the workflow-level with the {ref}`with_overrides ` method. - ```{code-cell} ipython3 @task(requests=Resources(mem="700Mi")) def transform(df: pd.DataFrame) -> pd.DataFrame: @@ -111,7 +110,6 @@ def load(df: pd.DataFrame) -> CSVFile: Putting all the pieces together, we create an `etl_workflow` that produces a dataset based on the parameters you give it. - ```{code-cell} ipython3 @workflow def etl_workflow( diff --git a/docs/getting_started/extending_flyte.md b/docs/getting_started/extending_flyte.md index abf4a2310..7390ac932 100644 --- a/docs/getting_started/extending_flyte.md +++ b/docs/getting_started/extending_flyte.md @@ -35,8 +35,7 @@ However, the types that ship with Flyte or one of Flyte's {ref}`first-party integrations ` may not fulfill your needs. In this case, you'll need to create your own. -The easiest way to do with is with the {py:mod}`dataclasses` and -[dataclasses-json](https://lidatong.github.io/dataclasses-json/) modules, which +The easiest way to do with is with the {py:mod}`dataclasses` module, which let you compose several Flyte-supported types into a single object. 
For example, suppose you want to support a coordinates data type with arbitrary metadata: @@ -45,12 +44,10 @@ metadata: import typing from dataclasses import dataclass -from dataclasses_json import dataclass_json +from mashumaro.mixins.json import DataClassJSONMixin - -@dataclass_json @dataclass -class Coordinate: +class Coordinate(DataClassJSONMixin): """A custom type for coordinates with metadata attached.""" x: float y: float diff --git a/docs/getting_started/machine_learning.md b/docs/getting_started/machine_learning.md index b4d077798..c5b28887c 100644 --- a/docs/getting_started/machine_learning.md +++ b/docs/getting_started/machine_learning.md @@ -74,7 +74,7 @@ def training_workflow(hyperparameters: dict) -> LogisticRegression: ```{important} Even though you can use a `dict` type to represent the model's hyperparameters, -we recommend using {ref}`dataclasses ` to define a custom +we recommend using {ref}`dataclasses ` to define a custom `Hyperparameter` Python object that provides more type information to the Flyte compiler. For example, Flyte uses this type information to auto-generate type-safe launch forms on the Flyte UI. Learn more in the @@ -124,7 +124,6 @@ There are many ways to extend your workloads: supports ScikitLearn, TensorFlow, and PyTorch. ``` - ```{admonition} Learn More :class: important diff --git a/examples/advanced_composition/README.md b/examples/advanced_composition/README.md index ed9696f4d..9a523bf27 100644 --- a/examples/advanced_composition/README.md +++ b/examples/advanced_composition/README.md @@ -7,8 +7,6 @@ These examples cover more complex aspects of Flyte, including conditions, subwor dynamic workflows, map tasks, gate nodes and more. ```{auto-examples-toc} -files -folders conditions chain_entities subworkflows diff --git a/examples/basics/Dockerfile b/examples/basics/Dockerfile index 858d7d252..5140e65c6 100644 --- a/examples/basics/Dockerfile +++ b/examples/basics/Dockerfile @@ -9,9 +9,7 @@ ENV VENV /opt/venv ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 ENV PYTHONPATH /root -WORKDIR /root -# This is necessary for opencv to work RUN apt-get update && apt-get install -y libsm6 libxext6 libxrender-dev ffmpeg build-essential curl # Virtual environment @@ -19,7 +17,7 @@ ENV VENV /opt/venv RUN python3 -m venv ${VENV} ENV PATH="${VENV}/bin:$PATH" -RUN pip install flytekit==1.9.0 flytekitplugins-envd +RUN pip install flytekit==1.10.0 flytekitplugins-envd # Copy the actual code COPY . /root diff --git a/examples/basics/basics/task.py b/examples/basics/basics/task.py index 7939228f9..8a573e97e 100644 --- a/examples/basics/basics/task.py +++ b/examples/basics/basics/task.py @@ -37,7 +37,7 @@ # %% [markdown] # The use of the {py:func}`~flytekit.task` decorator is mandatory for a ``PythonFunctionTask``. # A task is essentially a regular Python function, with the exception that all inputs and outputs must be clearly annotated with their types. -# Learn more about the supported types in the {ref}`type-system section `. +# Learn more about the supported types in the {ref}`type-system section `. # # We create a task that computes the slope of a regression line. 
# %% diff --git a/examples/customizing_dependencies/customizing_dependencies/image_spec.py b/examples/customizing_dependencies/customizing_dependencies/image_spec.py index 5a2b718b2..b6c131b95 100644 --- a/examples/customizing_dependencies/customizing_dependencies/image_spec.py +++ b/examples/customizing_dependencies/customizing_dependencies/image_spec.py @@ -62,6 +62,11 @@ ) # %% [markdown] +# :::{important} +# Replace `ghcr.io/flyteorg` with a container registry you've access to publish to. +# To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`. +# ::: +# # `is_container` is used to determine whether the task is utilizing the image constructed from the `ImageSpec`. # If the task is indeed using the image built from the `ImageSpec`, it will then import Tensorflow. # This approach helps minimize module loading time and prevents unnecessary dependency installation within a single image. diff --git a/examples/data_types_and_io/Dockerfile b/examples/data_types_and_io/Dockerfile index 2777e57c8..37f35a19f 100644 --- a/examples/data_types_and_io/Dockerfile +++ b/examples/data_types_and_io/Dockerfile @@ -1,4 +1,7 @@ -FROM python:3.8-slim-buster +# ###################### +# NOTE: For CI/CD only # +######################## +FROM python:3.11-slim-buster LABEL org.opencontainers.image.source https://github.com/flyteorg/flytesnacks WORKDIR /root @@ -7,20 +10,14 @@ ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 ENV PYTHONPATH /root -# This is necessary for opencv to work RUN apt-get update && apt-get install -y libsm6 libxext6 libxrender-dev ffmpeg build-essential curl -WORKDIR /root - -ENV VENV /opt/venv # Virtual environment +ENV VENV /opt/venv RUN python3 -m venv ${VENV} ENV PATH="${VENV}/bin:$PATH" -# Install Python dependencies -COPY requirements.txt /root -RUN pip install --no-cache-dir -r /root/requirements.txt -RUN pip freeze +RUN pip install flytekit==1.10.0 torch # Copy the actual code COPY . /root diff --git a/examples/data_types_and_io/README.md b/examples/data_types_and_io/README.md index 2025cdcdf..9e1c29623 100644 --- a/examples/data_types_and_io/README.md +++ b/examples/data_types_and_io/README.md @@ -2,25 +2,146 @@ # Data Types and IO -Flyte is a data-aware DAG scheduling system. The graph itself is derived -automatically from the flow of data and this closely resembles how a functional -programming language passes data between methods. - -Data awareness is powered by Flyte's own type system, which closely maps most programming languages. These types are what power Flyte's magic of: +Flyte being a data-aware orchestration platform, types play a vital role within it. +This section provides an introduction to the wide range of data types that Flyte supports. +These types serve a dual purpose by not only validating the data but also enabling seamless +transfer of data between local and cloud storage. +They enable: - Data lineage - Memoization - Auto parallelization - Simplifying access to data -- Auto generated CLI and Launch UI +- Auto generated CLI and launch UI + +For a more comprehensive understanding of how Flyte manages data, refer to the +{std:ref}`Understand How Flyte Handles Data ` guide. + +(python_to_flyte_type_mapping)= + +## Mapping Python to Flyte types + +Flytekit automatically translates most Python types into Flyte types. +Here's a breakdown of these mappings: + +```{eval-rst} +.. 
list-table:: + :widths: auto + :header-rows: 1 + + * - Python Type + - Flyte Type + - Conversion + - Comment + * - ``int`` + - ``Integer`` + - Automatic + - Use Python 3 type hints. + * - ``float`` + - ``Float`` + - Automatic + - Use Python 3 type hints. + * - ``str`` + - ``String`` + - Automatic + - Use Python 3 type hints. + * - ``bool`` + - ``Boolean`` + - Automatic + - Use Python 3 type hints. + * - ``bytes``/``bytearray`` + - ``Binary`` + - Not Supported + - You have the option to employ your own custom type transformer. + * - ``complex`` + - NA + - Not Supported + - You have the option to employ your own custom type transformer. + * - ``datetime.timedelta`` + - ``Duration`` + - Automatic + - Use Python 3 type hints. + * - ``datetime.datetime`` + - ``Datetime`` + - Automatic + - Use Python 3 type hints. + * - ``datetime.date`` + - ``Datetime`` + - Automatic + - Use Python 3 type hints. + * - ``typing.List[T]`` / ``list[T]`` + - ``Collection [T]`` + - Automatic + - Use ``typing.List[T]`` or ``list[T]``, where ``T`` can represent one of the other supported types listed in the table. + * - ``typing.Iterator[T]`` + - ``Collection [T]`` + - Automatic + - Use ``typing.Iterator[T]``, where ``T`` can represent one of the other supported types listed in the table. + * - File / file-like / ``os.PathLike`` + - ``FlyteFile`` + - Automatic + - If you're using ``file`` or ``os.PathLike`` objects, Flyte will default to the binary protocol for the file. + When using ``FlyteFile["protocol"]``, it is assumed that the file is in the specified protocol, such as 'jpg', 'png', 'hdf5', etc. + * - Directory + - ``FlyteDirectory`` + - Automatic + - When using ``FlyteDirectory["protocol"]``, it is assumed that all the files belong to the specified protocol. + * - ``typing.Dict[str, V]`` / ``dict[str, V]`` + - ``Map[str, V]`` + - Automatic + - Use ``typing.Dict[str, V]`` or ``dict[str, V]``, where ``V`` can be one of the other supported types in the table, + including a nested dictionary. + * - ``dict`` + - JSON (``struct.pb``) + - Automatic + - Use ``dict``. It's assumed that the untyped dictionary can be converted to JSON. + However, this may not always be possible and could result in a ``RuntimeError``. + * - ``@dataclass`` + - ``Struct`` + - Automatic + - The class should be a pure value class that inherits from Mashumaro's DataClassJSONMixin, + and be annotated with the ``@dataclass`` decorator. + * - ``np.ndarray`` + - File + - Automatic + - Use ``np.ndarray`` as a type hint. + * - ``pandas.DataFrame`` + - Structured Dataset + - Automatic + - Use ``pandas.DataFrame`` as a type hint. Pandas column types aren't preserved. + * - ``pyspark.DataFrame`` + - Structured Dataset + - To utilize the type, install the ``flytekitplugins-spark`` plugin. + - Use ``pyspark.DataFrame`` as a type hint. + * - ``pydantic.BaseModel`` + - ``Map`` + - To utilize the type, install the ``flytekitplugins-pydantic`` plugin. + - Use ``pydantic.BaseModel`` as a type hint. + * - ``torch.Tensor`` / ``torch.nn.Module`` + - File + - To utilize the type, install the ``torch`` library. + - Use ``torch.Tensor`` or ``torch.nn.Module`` as a type hint, and you can use their derived types. + * - ``tf.keras.Model`` + - File + - To utilize the type, install the ``tensorflow`` library. + - Use ``tf.keras.Model`` and its derived types. + * - ``sklearn.base.BaseEstimator`` + - File + - To utilize the type, install the ``scikit-learn`` library. + - Use ``sklearn.base.BaseEstimator`` and its derived types. 
+ * - User defined types + - Any + - Custom transformers + - The ``FlytePickle`` transformer is the default option, but you can also define custom transformers. + **For instructions on building custom type transformers, please refer to :ref:`this section `**. +``` ```{auto-examples-toc} -flyte_python_types -schema +file +folder structured_dataset -typed_schema -pytorch_types -custom_objects -enums -flyte_pickle +dataclass +pytorch_type +enum_type +pickle_type ``` diff --git a/examples/data_types_and_io/data_types_and_io/custom_objects.py b/examples/data_types_and_io/data_types_and_io/custom_objects.py deleted file mode 100644 index 435ad65f1..000000000 --- a/examples/data_types_and_io/data_types_and_io/custom_objects.py +++ /dev/null @@ -1,159 +0,0 @@ -# %% [markdown] -# (dataclass_type)= -# -# # Using Custom Python Objects -# -# ```{eval-rst} -# .. tags:: Basic -# ``` -# -# Flyte supports passing JSON between tasks. But to simplify the usage for the users and introduce type-safety, -# Flytekit supports passing custom data objects between tasks. -# -# Currently, data classes decorated with `@dataclass_json` are supported. -# One good use case of a data class would be when you want to wrap all input in a data class in the case of a map task -# which can only accept one input and produce one output. -# -# This example shows how users can serialize custom JSON-compatible dataclasses between successive tasks using the -# excellent [dataclasses_json](https://pypi.org/project/dataclasses-json/) library. - -# %% [markdown] -# To get started, let's import the necessary libraries. -# %% -import os -import tempfile -import typing -from dataclasses import dataclass - -import pandas as pd -from dataclasses_json import dataclass_json -from flytekit import task, workflow -from flytekit.types.directory import FlyteDirectory -from flytekit.types.file import FlyteFile -from flytekit.types.schema import FlyteSchema - - -# %% [markdown] -# We define a simple data class that can be sent between tasks. -# %% -@dataclass_json -@dataclass -class Datum(object): - """ - Example of a simple custom class that is modeled as a dataclass - """ - - x: int - y: str - z: typing.Dict[int, str] - - -# %% [markdown] -# `Datum` is a user defined complex type that can be used to pass complex data between tasks. -# Interestingly, users can send this data between different tasks written in different languages and input it through the Flyte Console as raw JSON. -# -# :::{note} -# All variables in a data class should be **annotated with their type**. Failure to do should will result in an error. -# ::: - -# %% [markdown] -# Next, we define a data class that accepts {std:ref}`FlyteSchema `, {std:ref}`FlyteFile `, -# and {std:ref}`FlyteDirectory `. -# %% -@dataclass_json -@dataclass -class Result: - schema: FlyteSchema - file: FlyteFile - directory: FlyteDirectory - - -# %% [markdown] -# :::{note} -# A data class supports the usage of data associated with Python types, data classes, FlyteFile, FlyteDirectory, and FlyteSchema. -# ::: -# -# Once declared, dataclasses can be returned as outputs or accepted as inputs. -# -# 1. Datum Data Class -# %% -@task -def stringify(x: int) -> Datum: - """ - A dataclass return will be regarded as a complex single json return. - """ - return Datum(x=x, y=str(x), z={x: str(x)}) - - -@task -def add(x: Datum, y: Datum) -> Datum: - """ - Flytekit will automatically convert the passed in json into a DataClass. 
If the structures dont match, it will raise - a runtime failure - """ - x.z.update(y.z) - return Datum(x=x.x + y.x, y=x.y + y.y, z=x.z) - - -# %% [markdown] -# The `stringify` task outputs a data class, and the `add` task accepts data classes as inputs. -# -# 2. Result Data Class -# %% -@task -def upload_result() -> Result: - """ - Flytekit will upload FlyteFile, FlyteDirectory, and FlyteSchema to blob store (GCP, S3) - """ - df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}) - temp_dir = tempfile.mkdtemp(prefix="flyte-") - - schema_path = temp_dir + "/schema.parquet" - df.to_parquet(schema_path) - - file_path = tempfile.NamedTemporaryFile(delete=False) - file_path.write(b"Hello world!") - fs = Result( - schema=FlyteSchema(temp_dir), - file=FlyteFile(file_path.name), - directory=FlyteDirectory(temp_dir), - ) - return fs - - -@task -def download_result(res: Result): - """ - Flytekit will lazily load FlyteSchema. We download the schema only when users invoke open(). - """ - assert pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}).equals(res.schema.open().all()) - f = open(res.file, "r") - assert f.read() == "Hello world!" - assert os.listdir(res.directory) == ["schema.parquet"] - - -# %% [markdown] -# The `upload_result` task outputs a data class, and the `download_result` task accepts data classes as inputs. - -# %% [markdown] -# Lastly, we create a workflow. -# %% -@workflow -def wf(x: int, y: int) -> (Datum, Result): - """ - Dataclasses (JSON) can be returned from a workflow as well. - """ - res = upload_result() - download_result(res=res) - return add(x=stringify(x=x), y=stringify(x=y)), res - - -# %% [markdown] -# We can run the workflow locally. -# -# %% -if __name__ == "__main__": - """ - This workflow can be run locally. During local execution also, the dataclasses will be marshalled to and from json. - """ - wf(x=10, y=20) diff --git a/examples/data_types_and_io/data_types_and_io/dataclass.py b/examples/data_types_and_io/data_types_and_io/dataclass.py new file mode 100644 index 000000000..61c0d97d0 --- /dev/null +++ b/examples/data_types_and_io/data_types_and_io/dataclass.py @@ -0,0 +1,143 @@ +# %% [markdown] +# (dataclass)= +# +# # Data Class +# +# ```{eval-rst} +# .. tags:: Basic +# ``` +# +# When you've multiple values that you want to send across Flyte entities, you can use a `dataclass`. +# +# Flytekit uses the [Mashumaro library](https://github.com/Fatal1ty/mashumaro) +# to serialize and deserialize dataclasses. +# +# :::{important} +# If you're using Flytekit version below v1.10, you'll need to decorate with `@dataclass_json` using +# `from dataclass_json import dataclass_json` instead of inheriting from Mashumaro's `DataClassJSONMixin`. +# ::: +# +# To begin, import the necessary dependencies. +# %% +import os +import tempfile +from dataclasses import dataclass + +import pandas as pd +from flytekit import task, workflow +from flytekit.types.directory import FlyteDirectory +from flytekit.types.file import FlyteFile +from flytekit.types.structured import StructuredDataset +from mashumaro.mixins.json import DataClassJSONMixin + + +# %% [markdown] +# ## Python types +# We define a `dataclass` with `int`, `str` and `dict` as the data types. +# %% +@dataclass +class Datum(DataClassJSONMixin): + x: int + y: str + z: dict[int, str] + + +# %% [markdown] +# You can send a `dataclass` between different tasks written in various languages, and input it through the Flyte console as raw JSON. 
+# +# :::{note} +# All variables in a data class should be **annotated with their type**. Failure to do should will result in an error. +# ::: +# +# Once declared, a dataclass can be returned as an output or accepted as an input. +# %% +@task +def stringify(s: int) -> Datum: + """ + A dataclass return will be treated as a single complex JSON return. + """ + return Datum(x=s, y=str(s), z={s: str(s)}) + + +@task +def add(x: Datum, y: Datum) -> Datum: + """ + Flytekit automatically converts the provided JSON into a data class. + If the structures don't match, it triggers a runtime failure. + """ + x.z.update(y.z) + return Datum(x=x.x + y.x, y=x.y + y.y, z=x.z) + + +# %% [markdown] +# ## Flyte types +# We also define a data class that accepts {std:ref}`StructuredDataset `, +# {std:ref}`FlyteFile ` and {std:ref}`FlyteDirectory `. +# %% +@dataclass +class FlyteTypes(DataClassJSONMixin): + dataframe: StructuredDataset + file: FlyteFile + directory: FlyteDirectory + + +@task +def upload_data() -> FlyteTypes: + """ + Flytekit will upload FlyteFile, FlyteDirectory and StructuredDataset to the blob store, + such as GCP or S3. + """ + # 1. StructuredDataset + df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}) + + # 2. FlyteDirectory + temp_dir = tempfile.mkdtemp(prefix="flyte-") + df.to_parquet(temp_dir + "/df.parquet") + + # 3. FlyteFile + file_path = tempfile.NamedTemporaryFile(delete=False) + file_path.write(b"Hello, World!") + + fs = FlyteTypes( + dataframe=StructuredDataset(dataframe=df), + file=FlyteFile(file_path.name), + directory=FlyteDirectory(temp_dir), + ) + return fs + + +@task +def download_data(res: FlyteTypes): + assert pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}).equals(res.dataframe.open(pd.DataFrame).all()) + f = open(res.file, "r") + assert f.read() == "Hello, World!" + assert os.listdir(res.directory) == ["df.parquet"] + + +# %% [markdown] +# A data class supports the usage of data associated with Python types, data classes, +# flyte file, flyte directory and structured dataset. +# +# We define a workflow that calls the tasks created above. +# %% +@workflow +def dataclass_wf(x: int, y: int) -> (Datum, FlyteTypes): + o1 = add(x=stringify(s=x), y=stringify(s=y)) + o2 = upload_data() + download_data(res=o2) + return o1, o2 + + +# %% [markdown] +# You can run the workflow locally as follows: +# %% +if __name__ == "__main__": + dataclass_wf(x=10, y=20) + +# %% [markdown] +# To trigger a task that accepts a dataclass as an input with `pyflyte run`, you can provide a JSON file as an input: +# ``` +# pyflyte run \ +# https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/data_types_and_io/data_types_and_io/dataclass.py \ +# add --x dataclass_input.json --y dataclass_input.json +# ``` diff --git a/examples/data_types_and_io/data_types_and_io/dataclass_input.json b/examples/data_types_and_io/data_types_and_io/dataclass_input.json new file mode 100644 index 000000000..530a9487a --- /dev/null +++ b/examples/data_types_and_io/data_types_and_io/dataclass_input.json @@ -0,0 +1 @@ +{ "x": 5, "y": "5", "z": { "5": "5" } } diff --git a/examples/data_types_and_io/data_types_and_io/enum_type.py b/examples/data_types_and_io/data_types_and_io/enum_type.py new file mode 100644 index 000000000..179a51009 --- /dev/null +++ b/examples/data_types_and_io/data_types_and_io/enum_type.py @@ -0,0 +1,74 @@ +# %% [markdown] +# # Enum Type +# +# ```{eval-rst} +# .. 
tags:: Basic +# ``` +# +# At times, you might need to limit the acceptable values for inputs or outputs to a predefined set. +# This common requirement is usually met by using Enum types in programming languages. +# +# You can create a Python Enum type and utilize it as an input or output for a task. +# Flytekit will automatically convert it and constrain the inputs and outputs to the predefined set of values. +# +# :::{important} +# Currently, only string values are supported as valid enum values. +# Flyte assumes the first value in the list as the default, and Enum types cannot be optional. +# Therefore, when defining enums, it's important to design them with the first value as a valid default. +# ::: +# +# To begin, import the dependencies. +# %% +from enum import Enum + +from flytekit import task, workflow + + +# %% [markdown] +# We define an enum and a simple coffee maker workflow that accepts an order and brews coffee ☕️ accordingly. +# The assumption is that the coffee maker only understands enum inputs. +# %% +class Coffee(Enum): + ESPRESSO = "espresso" + AMERICANO = "americano" + LATTE = "latte" + CAPPUCCINO = "cappucccino" + + +@task +def take_order(coffee: str) -> Coffee: + return Coffee(coffee) + + +@task +def prep_order(coffee_enum: Coffee) -> str: + return f"Preparing {coffee_enum.value} ..." + + +@workflow +def coffee_maker(coffee: str) -> str: + coffee_enum = take_order(coffee=coffee) + return prep_order(coffee_enum=coffee_enum) + + +# %% [markdown] +# The workflow can also accept an enum value. +# %% +@workflow +def coffee_maker_enum(coffee_enum: Coffee) -> str: + return prep_order(coffee_enum=coffee_enum) + + +# %% [markdown] +# You can send a string to the `coffee_maker_enum` workflow during its execution, like this: +# ``` +# pyflyte run \ +# https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/data_types_and_io/data_types_and_io/enum_type.py \ +# coffee_maker_enum --coffee_enum="latte" +# ``` +# +# You can run the workflows locally. +# %% +if __name__ == "__main__": + print(coffee_maker(coffee="latte")) + print(coffee_maker_enum(coffee_enum=Coffee.LATTE)) diff --git a/examples/data_types_and_io/data_types_and_io/enums.py b/examples/data_types_and_io/data_types_and_io/enums.py deleted file mode 100644 index 5f74f6b40..000000000 --- a/examples/data_types_and_io/data_types_and_io/enums.py +++ /dev/null @@ -1,66 +0,0 @@ -# %% [markdown] -# # Using Enum types -# -# ```{eval-rst} -# .. tags:: Basic -# ``` -# -# Sometimes you may want to restrict the set of inputs / outputs to a finite set of acceptable values. This is commonly -# achieved using Enum types in programming languages. -# -# Since version 0.15.0, Flyte supports Enum Types with string values. You can create a python Enum type and pass it to -# a task or return from a task. Flyte will automatically convert this to and limit the inputs etc to a finite set of -# values. -# -# UX: flytectl will allow only the finite set of values to be acceptable and (in progress) UI will provide a drop-down for -# the values. -# -# **Caveat:** Only string values are supported as valid enum values. The first value in the list is assumed as the default -# and the Enum types are not optional. So when defining enums, design them well to always make the first value as a -# valid default. -# %% -from enum import Enum -from typing import Tuple - -from flytekit import task, workflow - - -# %% [markdown] -# Enums are natively supported in flyte's type system. Enum values can only be of type string. 
At runtime they are -# represented using their string values. -# -# :::{note} -# ENUM Values can only be string. Other languages will receive enum as a string. -# ::: -# %% -class Color(Enum): - RED = "red" - GREEN = "green" - BLUE = "blue" - - -# %% [markdown] -# Enums can be used as a regular type -# %% -@task -def enum_stringify(c: Color) -> str: - return c.value - - -# %% [markdown] -# Their values can be accepted as string -# -# %% -@task -def string_to_enum(c: str) -> Color: - return Color(c) - - -@workflow -def enum_wf(c: Color = Color.RED) -> Tuple[Color, str]: - v = enum_stringify(c=c) - return string_to_enum(c=v), v - - -if __name__ == "__main__": - print(enum_wf()) diff --git a/examples/advanced_composition/advanced_composition/files.py b/examples/data_types_and_io/data_types_and_io/file.py similarity index 76% rename from examples/advanced_composition/advanced_composition/files.py rename to examples/data_types_and_io/data_types_and_io/file.py index 78771f829..eff9ab9bc 100644 --- a/examples/advanced_composition/advanced_composition/files.py +++ b/examples/data_types_and_io/data_types_and_io/file.py @@ -1,7 +1,7 @@ # %% [markdown] -# (files)= +# (file)= # -# # Working With Files +# # Flyte File # # ```{eval-rst} # .. tags:: Data, Basic @@ -9,17 +9,16 @@ # # Files are one of the most fundamental entities that users of Python work with, # and they are fully supported by Flyte. In the IDL, they are known as -# [Blob](https://github.com/flyteorg/flyteidl/blob/cee566b2e6e109120f1bb34c980b1cfaf006a473/protos/flyteidl/core/literals.proto#L33) +# [Blob](https://github.com/flyteorg/flyteidl/blob/master/protos/flyteidl/core/literals.proto#L33) # literals which are backed by the -# [blob type](https://github.com/flyteorg/flyteidl/blob/cee566b2e6e109120f1bb34c980b1cfaf006a473/protos/flyteidl/core/types.proto#L47). +# [blob type](https://github.com/flyteorg/flyteidl/blob/master/protos/flyteidl/core/types.proto#L47). # -# Let's assume our mission here is pretty simple. We download a few csv file +# Let's assume our mission here is pretty simple. We download a few CSV file # links, read them with the python built-in {py:class}`csv.DictReader` function, # normalize some pre-specified columns, and output the normalized columns to # another csv file. - -# %% [markdown] -# First, let's import the libraries. +# +# First, import the libraries. # %% import csv import os @@ -30,19 +29,22 @@ from flytekit import task, workflow from flytekit.types.file import FlyteFile + # %% [markdown] -# Next, we write a task that accepts a `FlyteFile`, a list of column names, -# and a list of column names to normalize, then outputs a csv file of only -# the normalized columns. For this example we'll use z-score normalization, -# i.e. mean-centering and standard-deviation-scaling. +# Define a task that accepts {py:class}`~flytekit.types.file.FlyteFile` as an input. +# The following is a task that accepts a `FlyteFile`, a list of column names, +# and a list of column names to normalize. The task then outputs a CSV file +# containing only the normalized columns. For this example, we use z-score normalization, +# which involves mean-centering and standard-deviation-scaling. # # :::{note} # The `FlyteFile` literal can be scoped with a string, which gets inserted # into the format of the Blob type ("jpeg" is the string in # `FlyteFile[typing.TypeVar("jpeg")]`). The format is entirely optional, # and if not specified, defaults to `""`. +# Predefined aliases for commonly used flyte file formats are also available. 
+# You can find them [here](https://github.com/flyteorg/flytekit/blob/master/flytekit/types/file/__init__.py). # ::: - # %% @task def normalize_columns( @@ -86,20 +88,18 @@ def normalize_columns( # %% [markdown] # When the image URL is sent to the task, the Flytekit engine translates it into a `FlyteFile` object on the local -# drive (but doesn't download it). The act of calling `download` method should trigger the download, and the `path` +# drive (but doesn't download it). The act of calling the `download()` method should trigger the download, and the `path` # attribute enables to `open` the file. # # If the `output_location` argument is specified, it will be passed to the `remote_path` argument of `FlyteFile`, # which will use that path as the storage location instead of a random location (Flyte's object store). # # When this task finishes, the Flytekit engine returns the `FlyteFile` instance, uploads the file to the location, and -# creates a Blob literal pointing to it. - -# %% [markdown] -# Lastly, we define a `normalize_csv_files` workflow. Note that there is an `output_location` argument specified in -# the workflow. This is passed to the `location` input of the task. If it's not an empty string, the task attempts to +# creates a blob literal pointing to it. +# +# Lastly, define a workflow. The `normalize_csv_files` workflow has an `output_location` argument which is passed +# to the `location` input of the task. If it's not an empty string, the task attempts to # upload its file to that location. - # %% @workflow def normalize_csv_file( @@ -117,8 +117,7 @@ def normalize_csv_file( # %% [markdown] -# Finally, we can run the workflow locally. -# +# You can run the workflow locally as follows: # %% if __name__ == "__main__": default_files = [ diff --git a/examples/data_types_and_io/data_types_and_io/flyte_pickle.py b/examples/data_types_and_io/data_types_and_io/flyte_pickle.py deleted file mode 100644 index c8d2d1dc8..000000000 --- a/examples/data_types_and_io/data_types_and_io/flyte_pickle.py +++ /dev/null @@ -1,94 +0,0 @@ -# %% [markdown] -# (flyte_pickle)= -# -# # Using Flyte Pickle -# -# ```{eval-rst} -# .. tags:: Basic -# ``` -# -# Flyte enforces type safety by leveraging type information to be able to compile -# tasks/workflows, which enables all sorts of nice features (like static analysis of tasks/workflows, conditional branching, etc.) -# -# However, we do also want to provide enough flexibility to end-users so that they don't have to put in a lot of up front -# investment figuring out all the types of their data structures before experiencing the value that flyte has to offer. -# -# Flyte supports FlytePickle transformer which will convert any unrecognized type in type hint to -# FlytePickle, and serialize / deserialize the python value to / from a pickle file. -# -# ## Caveats -# -# Pickle can only be used to send objects between the exact same version of Python, -# and we strongly recommend to use python type that flyte support or register a custom transformer -# -# This example shows how users can custom object without register a transformer. -# %% -from flytekit import task, workflow - - -# %% [markdown] -# `People` is a user defined complex type, which can be used to pass complex data between tasks. -# We will serialize this class to a pickle file and pass it between different tasks. -# -# :::{Note} -# Here we can also {ref}`turn this object to dataclass ` to have better performance. -# We use simple object here for demo purpose. 
-# You may have some object that can't turn into a dataclass, e.g. NumPy, Tensor. -# ::: -# %% -class People: - def __init__(self, name): - self.name = name - - -# %% [markdown] -# Object can be returned as outputs or accepted as inputs -# %% -@task -def greet(name: str) -> People: - return People(name) - - -@workflow -def welcome(name: str) -> People: - return greet(name=name) - - -if __name__ == "__main__": - """ - This workflow can be run locally. During local execution also, - the custom object (People) will be marshalled to and from python pickle. - """ - welcome(name="Foo") - - -from typing import List - -# %% [markdown] -# By default, if the list subtype is unrecognized, a single pickle file is generated. -# To also improve serialization and deserialization performance for cases with millions of items or large list items, -# users can specify a batch size, processing each batch as a separate pickle file. -# Example below shows how users can set batch size. -# -# %% -from flytekit.types.pickle.pickle import BatchSize -from typing_extensions import Annotated - - -@task -def greet_all(names: List[str]) -> Annotated[List[People], BatchSize(2)]: - return [People(name) for name in names] - - -@workflow -def welcome_all(names: List[str]) -> Annotated[List[People], BatchSize(2)]: - return greet_all(names=names) - - -if __name__ == "__main__": - """ - In this example, two pickle files will be generated: - - One containing two People objects - - One containing one People object - """ - welcome_all(names=["f", "o", "o"]) diff --git a/examples/data_types_and_io/data_types_and_io/flyte_python_types.py b/examples/data_types_and_io/data_types_and_io/flyte_python_types.py deleted file mode 100644 index 8a252aef9..000000000 --- a/examples/data_types_and_io/data_types_and_io/flyte_python_types.py +++ /dev/null @@ -1,101 +0,0 @@ -# %% [markdown] -# (flytekit_to_flyte_type_mapping)= -# -# # Flyte and Python Types -# -# ```{eval-rst} -# .. tags:: Basic -# ``` -# -# FlyteKit automatically maps Python types to Flyte types. This section provides details of the mappings, but for the most -# part you can skip this section, as almost all of Python types are mapped automatically. -# -# The following table provides a quick overview of how types are converted from the type-hints (python native) to Flyte-understood, cross-language types. -# -# ```{eval-rst} -# .. list-table:: Supported Python types and mapping to underlying Flyte Type -# :widths: auto -# :header-rows: 1 -# -# * - Python Type -# - Flyte Type -# - Conversion -# - Comment -# * - int -# - Integer -# - Automatic -# - just use python 3 type hints -# * - float -# - Float -# - Automatic -# - just use python 3 type hints -# * - str -# - String -# - Automatic -# - just use python 3 type hints -# * - bool -# - Boolean -# - Automatic -# - just use python 3 type hints -# * - bytes/bytearray -# - binary -# - Not Supported -# - Let us know if this is an interesting usecase that you can currently support using your own transformers. -# * - complex -# - NA -# - Not Supported -# - Let us know if this is an interesting usecase that you can currently support using your own transformers. 
-# * - datetime.timedelta -# - Duration -# - Automatic -# - just use python 3 type hints -# * - datetime.datetime -# - Datetime -# - Automatic -# - just use python 3 type hints -# * - datetime.date -# - Datetime -# - Automatic -# - just use python 3 type hints -# * - Univariate List / typing.List -# - Collection [ type ] -# - Automatic -# - Use python 3 type hints e.g ``typing.List[T]``, where ``T`` can be one of the other supported types in the table -# * - file / file-like / os.PathLike / flytekit.types.file.FlyteFile -# - Blob - Single -# - Automatic -# - Use python 3 type hints. if using ``file / os.PathLike`` objects then, Flyte will default to binary protocol for the file. If using FlyteFile["protocol"], it is assumed that the file is in the specified protocol. e.g. "jpg", "png", "hdf5" etc -# * - Directory / flytekit.types.directory.FlyteDirectory -# - Blob - MultiPart -# - Automatic -# - Use python 3 type hints. If using FlyteDirectory["protocol"] it is assumed that all the files are of the specified protocol -# * - Typed dictionary with str key - typing.Dict[str, V] -# - Map[str, V] -# - Automatic -# - Use python 3 type hints e.g ``typing.Dict[str, V]``, where V can be one of the other supported types in the table even another Dictionary (nested) -# * - Untyped dictionary - dict -# - JSON (struct.pb) -# - Automatic -# - Use python 3 type hints e.g ``dict``, it will be assumed that we can convert the dict to json. This may not always be possible and will cause a RuntimeError -# * - Dataclasses ``@dataclass`` -# - Struct -# - Automatic -# - Use python 3 type hints. The class should be a pure value class and should be annotated with ``@dataclass`` and ``@dataclass_json``. -# * - pandas.DataFrame -# - Schema -# - Automatic -# - Use python 3 type hints. Pandas column types are not preserved -# * - pyspark.DataFrame -# - Schema -# - Automatic -# - Use python 3 type hints. Column types are not preserved. Install ``flytekitplugins-spark`` plugin using pip -# * - torch.Tensor & torch.nn.Module -# - Blob - Single -# - Automatic -# - Use PyTorch type hints. -# * - User defined types -# - Any -# - Custom Transformers -# - Use python 3 type hints. We use ``FlytePickle transformer`` by default, but users still can provide custom transformers. Refer to :ref:`advanced_custom_types`. -# ``` -# diff --git a/examples/advanced_composition/advanced_composition/folders.py b/examples/data_types_and_io/data_types_and_io/folder.py similarity index 74% rename from examples/advanced_composition/advanced_composition/folders.py rename to examples/data_types_and_io/data_types_and_io/folder.py index 45adabfed..5aafd95eb 100644 --- a/examples/advanced_composition/advanced_composition/folders.py +++ b/examples/data_types_and_io/data_types_and_io/folder.py @@ -1,17 +1,17 @@ # %% [markdown] -# (folders)= +# (folder)= # -# # Working With Folders +# # Flyte Directory # # ```{eval-rst} # .. tags:: Data, Basic # ``` # -# In addition to files, folders are another fundamental operating system primitive users often work with. Flyte supports folders -# in the form of [multi-part blobs](https://github.com/flyteorg/flyteidl/blob/237f025a15c0102675ad41d2d1e66869afa80822/protos/flyteidl/core/types.proto#L73). - -# %% [markdown] -# First, let's import the libraries we need in this example. +# In addition to files, folders are another fundamental operating system primitive. +# Flyte supports folders in the form of +# [multi-part blobs](https://github.com/flyteorg/flyteidl/blob/master/protos/flyteidl/core/types.proto#L73). 
+# +# To begin, import the libraries. # %% import csv import os @@ -26,10 +26,11 @@ # %% [markdown] -# Extending the same use case that we used in the File example, which is to normalize columns in a csv file. +# Building upon the previous example demonstrated in the {std:ref}`file ` section, +# let's continue by considering the normalization of columns in a CSV file. # -# The following task downloads a list of urls pointing to csv files and returns the path to the folder, in a -# `FlyteDirectory` object. +# The following task downloads a list of URLs pointing to CSV files +# and returns the folder path in a `FlyteDirectory` object. # %% @task def download_files(csv_urls: List[str]) -> FlyteDirectory: @@ -51,7 +52,8 @@ def download_files(csv_urls: List[str]) -> FlyteDirectory: # %% [markdown] # :::{note} -# You can annotate a FlyteDirectory when you want to download/upload the contents of the directory in batches. For example, +# You can annotate a `FlyteDirectory` when you want to download or upload the contents of the directory in batches. +# For example, # # ```{code-block} # @task @@ -60,22 +62,19 @@ def download_files(csv_urls: List[str]) -> FlyteDirectory: # return FlyteDirectory(...) # ``` # -# In the above example flytekit will download all files from the input `directory` in chunks of 10, i.e. first it -# downloads 10 files, loads them to memory, then writes those 10 to local disk, then it loads the next 10, so on -# and so forth. Similarly, for outputs, in this case flytekit is going to upload the resulting directory in chunks of -# 100. +# Flytekit efficiently downloads files from the specified input directory in 10-file chunks. +# It then loads these chunks into memory before writing them to the local disk. +# The process repeats for subsequent sets of 10 files. +# Similarly, for outputs, Flytekit uploads the resulting directory in chunks of 100. # ::: - - -# %% [markdown] -# Next, we define a helper function to normalize the columns in-place. +# +# We define a helper function to normalize the columns in-place. # # :::{note} -# This is a plain python function that will be called in a subsequent Flyte task. This example +# This is a plain Python function that will be called in a subsequent Flyte task. This example # demonstrates how Flyte tasks are simply entrypoints of execution, which can themselves call -# other functions and routines that are written in pure python. +# other functions and routines that are written in pure Python. # ::: - # %% def normalize_columns( local_csv_file: str, @@ -106,9 +105,8 @@ def normalize_columns( # %% [markdown] -# Now we define a task that accepts the previously downloaded folder, along with some metadata about the +# We then define a task that accepts the previously downloaded folder, along with some metadata about the # column names of each file in the directory and the column names that we want to normalize. - # %% @task def normalize_all_files( @@ -127,10 +125,9 @@ def normalize_all_files( # %% [markdown] -# Then we compose all of the above tasks into a workflow. This workflow accepts a list -# of url strings pointing to a remote location containing a csv file, a list of column names -# associated with each csv file, and a list of columns that we want to normalize. - +# Compose all of the above tasks into a workflow. This workflow accepts a list +# of URL strings pointing to a remote location containing a CSV file, a list of column names +# associated with each CSV file, and a list of columns that we want to normalize. 
# %% @workflow def download_and_normalize_csv_files( @@ -147,8 +144,7 @@ def download_and_normalize_csv_files( # %% [markdown] -# Finally, we can run the workflow locally. -# +# You can run the workflow locally as follows: # %% if __name__ == "__main__": csv_urls = [ diff --git a/examples/data_types_and_io/data_types_and_io/pickle_type.py b/examples/data_types_and_io/data_types_and_io/pickle_type.py new file mode 100644 index 000000000..88f04bb09 --- /dev/null +++ b/examples/data_types_and_io/data_types_and_io/pickle_type.py @@ -0,0 +1,105 @@ +# %% [markdown] +# (pickle_type)= +# +# # Pickle Type +# +# ```{eval-rst} +# .. tags:: Basic +# ``` +# +# Flyte enforces type safety by utilizing type information for compiling tasks and workflows, +# enabling various features such as static analysis and conditional branching. +# +# However, we also strive to offer flexibility to end-users so they don't have to invest heavily +# in understanding their data structures upfront before experiencing the value Flyte has to offer. +# +# Flyte supports the `FlytePickle` transformer, which converts any unrecognized type hint into `FlytePickle`, +# enabling the serialization/deserialization of Python values to/from a pickle file. +# +# :::{important} +# Pickle can only be used to send objects between the exact same Python version. +# For optimal performance, it's advisable to either employ Python types that are supported by Flyte +# or register a custom transformer, as using pickle types can result in lower performance. +# ::: +# +# This example demonstrates how you can utilize custom objects without registering a transformer. +# %% +from flytekit import task, workflow + + +# %% [markdown] +# `Superhero` represents a user-defined complex type that can be serialized to a pickle file by Flytekit +# and transferred between tasks as both input and output data. +# +# :::{note} +# Alternatively, you can {ref}`turn this object into a dataclass ` for improved performance. +# We have used a simple object here for demonstration purposes. +# ::: +# %% +class Superhero: + def __init__(self, name, power): + self.name = name + self.power = power + + +@task +def welcome_superhero(name: str, power: str) -> Superhero: + return Superhero(name, power) + + +@task +def greet_superhero(superhero: Superhero) -> str: + return f"👋 Hello {superhero.name}! Your superpower is {superhero.power}." + + +@workflow +def superhero_wf(name: str = "Thor", power: str = "Flight") -> str: + superhero = welcome_superhero(name=name, power=power) + return greet_superhero(superhero=superhero) + + +# %% [markdown] +# ## Batch size +# +# By default, if the list subtype is unrecognized, a single pickle file is generated. +# To optimize serialization and deserialization performance for scenarios involving a large number of items +# or significant list elements, you can specify a batch size. +# This feature allows for the processing of each batch as a separate pickle file. +# The following example demonstrates how to set the batch size. +# %% +from typing import Iterator + +from flytekit.types.pickle.pickle import BatchSize +from typing_extensions import Annotated + + +@task +def welcome_superheroes(names: list[str], powers: list[str]) -> Annotated[list[Superhero], BatchSize(3)]: + return [Superhero(name, power) for name, power in zip(names, powers)] + + +@task +def greet_superheroes(superheroes: list[Superhero]) -> Iterator[str]: + for superhero in superheroes: + yield f"👋 Hello {superhero.name}! Your superpower is {superhero.power}." 
+ + +@workflow +def superheroes_wf( + names: list[str] = ["Thor", "Spiderman", "Hulk"], + powers: list[str] = ["Flight", "Surface clinger", "Shapeshifting"], +) -> Iterator[str]: + superheroes = welcome_superheroes(names=names, powers=powers) + return greet_superheroes(superheroes=superheroes) + + +# %% [markdown] +# :::{note} +# The `welcome_superheroes` task will generate two pickle files: one containing two superheroes and the other containing one superhero. +# ::: +# +# You can run the workflows locally as follows: +# %% +if __name__ == "__main__": + print(f"Superhero wf: {superhero_wf()}") + print(f"Superhero(es) wf: {superheroes_wf()}") diff --git a/examples/data_types_and_io/data_types_and_io/pytorch_types.py b/examples/data_types_and_io/data_types_and_io/pytorch_type.py similarity index 62% rename from examples/data_types_and_io/data_types_and_io/pytorch_types.py rename to examples/data_types_and_io/data_types_and_io/pytorch_type.py index a3cd61180..2b7be5ec9 100644 --- a/examples/data_types_and_io/data_types_and_io/pytorch_types.py +++ b/examples/data_types_and_io/data_types_and_io/pytorch_type.py @@ -1,22 +1,24 @@ # %% [markdown] -# (pytorch_types)= +# (pytorch_type)= # -# # PyTorch Types +# # PyTorch Type # # ```{eval-rst} # .. tags:: MachineLearning, Basic # ``` # -# Flyte promotes the use of strongly-typed data to make it easier to write pipelines that are more robust and easier to test. -# Flyte is primarily used for machine learning besides data engineering. To simplify the communication between Flyte tasks, especially when passing -# around tensors and models, we added support for the PyTorch types. - -# %% [markdown] -# ## Tensors & Modules +# Flyte advocates for the use of strongly-typed data to simplify the development of robust and testable pipelines. +# In addition to its application in data engineering, Flyte is primarily used for machine learning. +# To streamline the communication between Flyte tasks, particularly when dealing with tensors and models, +# we have introduced support for PyTorch types. # -# Many a times, you may need to pass around tensors and modules (aka models). In the absence of native type support for PyTorch tensors and modules, -# Flytekit resorts to using pickle to serialize and deserialize the entities; in fact, pickle is used for any unknown type. -# This is not very efficient, and hence, we added PyTorch's serialization and deserialization support to the Flyte type system. +# ## Tensors and modules +# +# At times, you may find the need to pass tensors and modules (models) within your workflow. +# Without native support for PyTorch tensors and modules, Flytekit relies on {std:ref}`pickle ` for serializing +# and deserializing these entities, as well as any unknown types. +# However, this approach isn't the most efficient. As a result, we've integrated PyTorch's +# serialization and deserialization support into the Flyte type system. # %% import torch from flytekit import task, workflow @@ -72,19 +74,20 @@ def pytorch_native_wf(): # %% [markdown] # Passing around tensors and modules is no more a hassle! - -from dataclasses import dataclass - -# %% [markdown] +# # ## Checkpoint # -# `PyTorchCheckpoint` is a special type of checkpoint to serialize and deserialize PyTorch models. -# It checkpoints `torch.nn.Module`'s state, hyperparameters, and optimizer state. -# The module checkpoint differs from the standard checkpoint in that it checkpoints the module's `state_dict`. 
-# Hence, when restoring the module, the module's `state_dict` needs to be used in conjunction with the actual module. -# As per PyTorch [docs](https://pytorch.org/tutorials/beginner/saving_loading_models.html#save-load-entire-model), it is recommended to -# store the module's `state_dict` rather than the module itself. However, the serialization should work either way. +# `PyTorchCheckpoint` is a specialized checkpoint used for serializing and deserializing PyTorch models. +# It checkpoints `torch.nn.Module`'s state, hyperparameters and optimizer state. +# +# This module checkpoint differs from the standard checkpoint as it specifically captures the module's `state_dict`. +# Therefore, when restoring the module, the module's `state_dict` must be used in conjunction with the actual module. +# According to the PyTorch [docs](https://pytorch.org/tutorials/beginner/saving_loading_models.html#save-load-entire-model), +# it's recommended to store the module's `state_dict` rather than the module itself, +# although the serialization should work in either case. # %% +from dataclasses import dataclass + import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -142,16 +145,18 @@ def pytorch_checkpoint_wf(): # %% [markdown] # :::{note} -# `PyTorchCheckpoint` supports serializing hyperparameters of types `dict`, `NamedTuple`, and `dataclass`. +# `PyTorchCheckpoint` supports serializing hyperparameters of types `dict`, `NamedTuple` and `dataclass`. # ::: # -# ## Auto GPU to CPU & CPU to GPU Conversion +# ## Auto GPU to CPU and CPU to GPU conversion +# +# Not all PyTorch computations require a GPU. In some cases, it can be advantageous to transfer the +# computation to a CPU, especially after training the model on a GPU. +# To utilize the power of a GPU, the typical construct to use is: `to(torch.device("cuda"))`. # -# Not all PyTorch computations require a GPU to run. There are some cases where it is beneficial to move the computation to the CPU after, say, -# the model is trained on a GPU. To avail the GPU power, we do `to(torch.device("cuda"))`. -# To use the GPU-variables on a CPU, we need to move the variables to a CPU using `to(torch.device("cpu"))` construct. -# This manual conversion proposed by PyTorch is not very user friendly, and hence, -# we added support for automatic GPU to CPU conversion (and vice versa) for the PyTorch types. +# When working with GPU variables on a CPU, variables need to be transferred to the CPU using the `to(torch.device("cpu"))` construct. +# However, this manual conversion recommended by PyTorch may not be very user-friendly. +# To address this, we added support for automatic GPU to CPU conversion (and vice versa) for PyTorch types. # # ```python # from flytekit import Resources @@ -188,6 +193,5 @@ def pytorch_checkpoint_wf(): # accuracy_list = correct.mean() # ``` # -# The `predict` task here runs on a CPU. -# As can be seen, you need not do the device conversion from GPU to CPU in the `predict` task as that's handled automatically by Flytekit! -# +# The `predict` task will run on a CPU, and +# the device conversion from GPU to CPU will be automatically handled by Flytekit. diff --git a/examples/data_types_and_io/data_types_and_io/schema.py b/examples/data_types_and_io/data_types_and_io/schema.py deleted file mode 100644 index f70f62cca..000000000 --- a/examples/data_types_and_io/data_types_and_io/schema.py +++ /dev/null @@ -1,70 +0,0 @@ -# %% [markdown] -# # Using Schemas -# -# ```{eval-rst} -# .. 
tags:: DataFrame, Basic -# ``` -# -# This example explains how an untyped schema is passed between tasks using {py:class}`pandas.DataFrame`. -# Flytekit makes it possible for users to directly return or accept a {py:class}`pandas.DataFrame`, which are automatically -# converted into flyte's abstract representation of a schema object - -# %% [markdown] -# :::{warning} -# `FlyteSchema` is deprecated, use {ref}`structured_dataset_example` instead. -# ::: - -# %% -import pandas -from flytekit import task, workflow - -# %% [markdown] -# Flytekit allows users to directly use pandas.dataframe in their tasks as long as they import -# Note: # noqa: F401. This is to ignore pylint complaining about unused imports -# %% -from flytekit.types import schema # noqa: F401 - - -# %% [markdown] -# This task generates a pandas.DataFrame and returns it. The Dataframe itself will be serialized to an intermediate -# format like parquet before passing between tasks -# %% -@task -def get_df(a: int) -> pandas.DataFrame: - """ - Generate a sample dataframe - """ - return pandas.DataFrame(data={"col1": [a, 2], "col2": [a, 4]}) - - -# %% [markdown] -# This task shows an example of transforming a dataFrame -# %% -@task -def add_df(df: pandas.DataFrame) -> pandas.DataFrame: - """ - Append some data to the dataframe. - NOTE: this may result in runtime failures if the columns do not match - """ - return df.append(pandas.DataFrame(data={"col1": [5, 10], "col2": [5, 10]})) - - -# %% [markdown] -# The workflow shows that passing DataFrame's between tasks is as simple as passing dataFrames in memory -# %% -@workflow -def df_wf(a: int) -> pandas.DataFrame: - """ - Pass data between the dataframes - """ - df = get_df(a=a) - return add_df(df=df) - - -# %% [markdown] -# The entire program can be run locally -# -# %% -if __name__ == "__main__": - print(f"Running {__file__} main...") - print(f"Running df_wf(a=42) {df_wf(a=42)}") diff --git a/examples/data_types_and_io/data_types_and_io/structured_dataset.py b/examples/data_types_and_io/data_types_and_io/structured_dataset.py index 6b2fa7eda..4f7941620 100644 --- a/examples/data_types_and_io/data_types_and_io/structured_dataset.py +++ b/examples/data_types_and_io/data_types_and_io/structured_dataset.py @@ -1,25 +1,29 @@ # %% [markdown] -# (structured_dataset_example)= +# (structured_dataset)= # # # Structured Dataset # # ```{eval-rst} -# .. tags:: DataFrame, Basic, Data +# .. tags:: Basic, DataFrame # ``` # ```{currentmodule} flytekit.types.structured # ``` # -# Structured dataset is a superset of Flyte Schema. +# As with most type systems, Python has primitives, container types like maps and tuples, and support for user-defined structures. +# However, while there’s a rich variety of dataframe classes (Pandas, Spark, Pandera, etc.), there’s no native Python type that +# represents a dataframe in the abstract. This is the gap that the {py:class}`StructuredDataset` type is meant to fill. +# It offers the following benefits: # -# The {py:class}`StructuredDataset` Transformer can write a dataframe to BigQuery, s3, or any storage by registering new structured dataset encoder and decoder. 
+# - Eliminate boilerplate code you would otherwise need to write to serialize/deserialize from file objects into dataframe instances, +# - Eliminate additional inputs/outputs that convey metadata around the format of the tabular data held in those files, +# - Add flexibility around how dataframe files are loaded, +# - Offer a range of dataframe specific functionality - enforce compatibility of different schemas +# (not only at compile time, but also runtime since type information is carried along in the literal), +# store third-party schema definitions, and potentially in the future, render sample data, provide summary statistics, etc. # -# Flytekit makes it possible to return or accept a {py:class}`pandas.DataFrame` which is automatically -# converted into Flyte's abstract representation of a structured dataset object. +# This example demonstrates how to work with a structured dataset using Flyte entities. # -# This example explains how a structured dataset can be used with the Flyte entities. - -# %% [markdown] -# Let's import the necessary dependencies. +# To begin, import the necessary dependencies. # %% import os import typing @@ -31,7 +35,6 @@ from flytekit import FlyteContext, StructuredDatasetType, kwtypes, task, workflow from flytekit.models import literals from flytekit.models.literals import StructuredDatasetMetadata -from flytekit.types.schema import FlyteSchema from flytekit.types.structured.structured_dataset import ( PARQUET, StructuredDataset, @@ -41,125 +44,190 @@ ) from typing_extensions import Annotated -# %% [markdown] -# We define the columns types for schema and {py:class}`StructuredDataset`. -# %% -superset_cols = kwtypes(Name=str, Age=int, Height=int) -subset_cols = kwtypes(Age=int) - # %% [markdown] -# We define two tasks, one returns a pandas DataFrame and the other a `FlyteSchema`. -# Flyte serializes the DataFrames to an intermediate format, a parquet file, before sending them to the other tasks. +# Define a task that returns a Pandas DataFrame. +# Flytekit will detect the Pandas dataframe return signature and +# convert the interface for the task to the new {py:class}`StructuredDataset` type. # %% @task -def get_df(a: int) -> Annotated[pd.DataFrame, superset_cols]: - """ - Generate a sample dataframe - """ +def generate_pandas_df(a: int) -> pd.DataFrame: return pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [a, 22], "Height": [160, 178]}) -@task -def get_schema_df(a: int) -> FlyteSchema[superset_cols]: - """ - Generate a sample dataframe - """ - s = FlyteSchema() - s.open().write(pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [a, 22], "Height": [160, 178]})) - return s +# %% [markdown] +# Using this simplest form, however, the user is not able to set the additional dataframe information alluded to above, +# +# - Column type information +# - Serialized byte format +# - Storage driver and location +# - Additional third party schema information +# +# This is by design as we wanted the default case to suffice for the majority of use-cases, and to require +# as few changes to existing code as possible. Specifying these is simple, however, and relies on Python variable annotations, +# which is designed explicitly to supplement types with arbitrary metadata. +# +# ## Column type information +# If you want to extract a subset of actual columns of the dataframe and specify their types for type validation, +# you can just specify the column names and their types in the structured dataset type annotation. 
+# +# First, initialize column types you want to extract from the `StructuredDataset`. +# %% +all_cols = kwtypes(Name=str, Age=int, Height=int) +col = kwtypes(Age=int) # %% [markdown] -# Next, we define a task that opens a structured dataset by calling `all()`. -# When we invoke `all()`, the Flyte engine downloads the parquet file on S3, and deserializes it to `pandas.dataframe`. -# -# :::{note} -# - Despite the input type of the task being {py:class}`StructuredDataset`, it can also accept FlyteSchema as input. -# - The code may result in runtime failures if the columns do not match. -# ::: +# Define a task that opens a structured dataset by calling `all()`. +# When you invoke `all()` with ``pandas.DataFrame``, the Flyte engine downloads the parquet file on S3, and deserializes it to `pandas.DataFrame`. +# Keep in mind that you can invoke ``open()`` with any dataframe type that's supported or added to structured dataset. +# For instance, you can use ``pa.Table`` to convert the Pandas DataFrame to a PyArrow table. # %% @task -def get_subset_df(df: Annotated[StructuredDataset, subset_cols]) -> Annotated[StructuredDataset, subset_cols]: +def get_subset_pandas_df(df: Annotated[StructuredDataset, all_cols]) -> Annotated[StructuredDataset, col]: df = df.open(pd.DataFrame).all() df = pd.concat([df, pd.DataFrame([[30]], columns=["Age"])]) - # On specifying BigQuery uri for StructuredDataset, flytekit writes a pandas dataframe to a BigQuery table return StructuredDataset(dataframe=df) -# %% [markdown] -# ## {py:class}`StructuredDataset` with `uri` Argument -# -# BigQuery `uri` allows you to load and retrieve data from cloud using the `uri`. The `uri` comprises of the bucket name and the filename prefixed with `gs://`. -# If you specify BigQuery `uri` for {py:class}`StructuredDataset`, BigQuery creates a table in the location specified by the `uri`. -# The `uri` in {py:class}`StructuredDataset` reads from or writes to S3, GCP, BigQuery, or any storage. -# Let's understand how to convert a pandas DataFrame to a BigQuery table and vice-versa through an example. -# -# Before writing DataFrame to a BigQuery table, -# -# 1. Create a [GCP account](https://cloud.google.com/docs/authentication/getting-started) and create a service account. -# 2. Create a project and add the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to your .bashrc file. -# 3. Create a dataset in your project. - -# %% [markdown] -# Import the dependencies. -# %% -import pandas as pd # noqa: E402 -from flytekit import task # noqa: E402 -from flytekit.types.structured import StructuredDataset # noqa: E402 +@workflow +def simple_sd_wf(a: int = 19) -> Annotated[StructuredDataset, col]: + pandas_df = generate_pandas_df(a=a) + return get_subset_pandas_df(df=pandas_df) # %% [markdown] -# Define a task that converts a pandas DataFrame to a BigQuery table. +# The code may result in runtime failures if the columns do not match. +# The input ``df`` has ``Name``, ``Age`` and ``Height`` columns, whereas the output structured dataset will only have the ``Age`` column. +# +# ## Serialized byte format +# You can use a custom serialization format to serialize your dataframes. 
+# Here's how you can register the Pandas to CSV handler, which is already available, +# and enable the CSV serialization by annotating the structured dataset with the CSV format: # %% -@task -def pandas_to_bq() -> StructuredDataset: - # create a pandas dataframe - df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}) - # convert the dataframe to StructuredDataset - return StructuredDataset(dataframe=df, uri="bq://sample-project-1-352610.sample_352610.test1") +from flytekit.types.structured import register_csv_handlers +from flytekit.types.structured.structured_dataset import CSV +register_csv_handlers() -# %% [markdown] -# :::{note} -# The BigQuery uri's format is `bq://..`. -# ::: -# %% [markdown] -# Define a task that converts the BigQuery table to a pandas DataFrame. -# %% @task -def bq_to_pandas(sd: StructuredDataset) -> pd.DataFrame: - # convert to pandas dataframe - return sd.open(pd.DataFrame).all() +def pandas_to_csv(df: pd.DataFrame) -> Annotated[StructuredDataset, CSV]: + return StructuredDataset(dataframe=df) -# %% [markdown] -# :::{note} -# Flyte creates the table inside the dataset in the project upon BigQuery query execution. -# ::: - -# %% [markdown] -# Trigger the tasks locally. -# %% -if __name__ == "__main__": - o1 = bq_to_pandas(sd=StructuredDataset(uri="bq://sample-project-1-352610.sample_352610.test1")) - o2 = pandas_to_bq() +@workflow +def pandas_to_csv_wf() -> Annotated[StructuredDataset, CSV]: + pandas_df = generate_pandas_df(a=19) + return pandas_to_csv(df=pandas_df) # %% [markdown] -# ## NumPy Encoder and Decoder +# ## Storage driver and location +# By default, the data will be written to the same place that all other pointer-types (FlyteFile, FlyteDirectory, etc.) are written to. +# This is controlled by the output data prefix option in Flyte which is configurable on multiple levels. # -# {py:class}`StructuredDataset` ships with an encoder and a decoder that handles the conversion of a Python value to a Flyte literal and vice-versa, respectively. -# Let's understand how to write them by defining a NumPy encoder and decoder, which helps use NumPy array as a valid type within structured datasets. - -# %% [markdown] -# ### NumPy Encoder +# That is to say, in the simple default case, Flytekit will, +# +# - Look up the default format for say, Pandas dataframes, +# - Look up the default storage location based on the raw output prefix setting, +# - Use these two settings to select an encoder and invoke it. +# +# So what's an encoder? To understand that, let's look into how the structured dataset plugin works. +# +# ## Inner workings of a structured dataset plugin # -# We extend {py:class}`StructuredDatasetEncoder` and implement the {py:meth}`~StructuredDatasetEncoder.encode` function. -# The {py:meth}`~StructuredDatasetEncoder.encode` function converts NumPy array to an intermediate format (parquet file format in this case). +# Two things need to happen with any dataframe instance when interacting with Flyte: +# +# - Serialization/deserialization from/to the Python instance to bytes (in the format specified above). +# - Transmission/retrieval of those bits to/from somewhere. +# +# Each structured dataset plugin (called encoder or decoder) needs to perform both of these steps. +# Flytekit decides which of the loaded plugins to invoke based on three attributes: +# +# - The byte format +# - The storage location +# - The Python type in the task or workflow signature. 
+# +# These three keys uniquely identify which encoder (used when converting a dataframe in Python memory to a Flyte value, +# e.g. when a task finishes and returns a dataframe) or decoder (used when hydrating a dataframe in memory from a Flyte value, +# e.g. when a task starts and has a dataframe input) to invoke. +# +# However, it is awkward to require users to use `typing.Annotated` on every signature. +# Therefore, Flytekit has a default byte-format for every Python dataframe type registered with flytekit. +# +# ## The `uri` argument +# +# BigQuery `uri` allows you to load and retrieve data from cloud using the `uri` argument. +# The `uri` comprises of the bucket name and the filename prefixed with `gs://`. +# If you specify BigQuery `uri` for structured dataset, BigQuery creates a table in the location specified by the `uri`. +# The `uri` in structured dataset reads from or writes to S3, GCP, BigQuery or any storage. +# +# Before writing DataFrame to a BigQuery table, +# +# 1. Create a [GCP account](https://cloud.google.com/docs/authentication/getting-started) and create a service account. +# 2. Create a project and add the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to your `.bashrc` file. +# 3. Create a dataset in your project. +# +# Here's how you can define a task that converts a pandas DataFrame to a BigQuery table: +# +# ```python +# @task +# def pandas_to_bq() -> StructuredDataset: +# df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}) +# return StructuredDataset(dataframe=df, uri="gs:///") +# ``` +# +# Replace `BUCKET_NAME` with the name of your GCS bucket and `FILE_NAME` with the name of the file the dataframe should be copied to. +# +# ### Note that no format was specified in the structured dataset constructor, or in the signature. So how did the BigQuery encoder get invoked? +# This is because the stock BigQuery encoder is loaded into Flytekit with an empty format. +# The Flytekit `StructuredDatasetTransformerEngine` interprets that to mean that it is a generic encoder +# (or decoder) and can work across formats, if a more specific format is not found. +# +# And here's how you can define a task that converts the BigQuery table to a pandas DataFrame: +# +# ```python +# @task +# def bq_to_pandas(sd: StructuredDataset) -> pd.DataFrame: +# return sd.open(pd.DataFrame).all() +# ``` +# +# :::{note} +# Flyte creates a table inside the dataset in the project upon BigQuery query execution. +# ::: +# +# ## How to return multiple dataframes from a task? +# For instance, how would a task return say two dataframes: +# - The first dataframe be written to BigQuery and serialized by one of their libraries, +# - The second needs to be serialized to CSV and written at a specific location in GCS different from the generic pointer-data bucket +# +# If you want the default behavior (which is itself configurable based on which plugins are loaded), +# you can work just with your current raw dataframe classes. +# +# ```python +# @task +# def t1() -> typing.Tuple[StructuredDataset, StructuredDataset]: +# ... +# return StructuredDataset(df1, uri="bq://project:flyte.table"), \ +# StructuredDataset(df2, uri="gs://auxiliary-bucket/data") +# ``` +# +# If you want to customize the Flyte interaction behavior, you'll need to wrap your dataframe in a `StructuredDataset` wrapper object. +# +# ## How to define a custom structured dataset plugin? 
+# +# `StructuredDataset` ships with an encoder and a decoder that handles the conversion of a +# Python value to a Flyte literal and vice-versa, respectively. +# Here is a quick demo showcasing how one might build a NumPy encoder and decoder, +# enabling the use of a 2D NumPy array as a valid type within structured datasets. +# +# ### NumPy encoder +# +# Extend `StructuredDatasetEncoder` and implement the `encode` function. +# The `encode` function converts NumPy array to an intermediate format (parquet file format in this case). # %% -class NumpyEncodingHandlers(StructuredDatasetEncoder): +class NumpyEncodingHandler(StructuredDatasetEncoder): def encode( self, ctx: FlyteContext, @@ -181,12 +249,12 @@ def encode( # %% [markdown] -# ### NumPy Decoder +# ### NumPy decoder # -# Next we extend {py:class}`StructuredDatasetDecoder` and implement the {py:meth}`~StructuredDatasetDecoder.decode` function. +# Extend {py:class}`StructuredDatasetDecoder` and implement the {py:meth}`~StructuredDatasetDecoder.decode` function. # The {py:meth}`~StructuredDatasetDecoder.decode` function converts the parquet file to a `numpy.ndarray`. # %% -class NumpyDecodingHandlers(StructuredDatasetDecoder): +class NumpyDecodingHandler(StructuredDatasetDecoder): def decode( self, ctx: FlyteContext, @@ -200,16 +268,12 @@ def decode( # %% [markdown] -# ### NumPy Renderer +# ### NumPy renderer # -# Create a default renderer for numpy array, then flytekit will use this renderer to -# display schema of numpy array on flyte deck. +# Create a default renderer for numpy array, then Flytekit will use this renderer to +# display schema of NumPy array on the Flyte deck. # %% class NumpyRenderer: - """ - The schema of Numpy array are rendered as an HTML table. - """ - def to_html(self, df: np.ndarray) -> str: assert isinstance(df, np.ndarray) name = ["col" + str(i) for i in range(len(df))] @@ -218,50 +282,44 @@ def to_html(self, df: np.ndarray) -> str: # %% [markdown] -# Finally, we register the encoder, decoder, and renderer with the `StructuredDatasetTransformerEngine`. +# In the end, register the encoder, decoder and renderer with the `StructuredDatasetTransformerEngine`. +# Specify the Python type you want to register this encoder with (`np.ndarray`), +# the storage engine to register this against (if not specified, it is assumed to work for all the storage backends), +# and the byte format, which in this case is `PARQUET`. # %% -StructuredDatasetTransformerEngine.register(NumpyEncodingHandlers(np.ndarray, None, PARQUET)) -StructuredDatasetTransformerEngine.register(NumpyDecodingHandlers(np.ndarray, None, PARQUET)) +StructuredDatasetTransformerEngine.register(NumpyEncodingHandler(np.ndarray, None, PARQUET)) +StructuredDatasetTransformerEngine.register(NumpyDecodingHandler(np.ndarray, None, PARQUET)) StructuredDatasetTransformerEngine.register_renderer(np.ndarray, NumpyRenderer()) # %% [markdown] # You can now use `numpy.ndarray` to deserialize the parquet file to NumPy and serialize a task's output (NumPy array) to a parquet file. - -# %% [markdown] -# Let's define a task to test the above functionality. -# We open a structured dataset of type `numpy.ndarray` and serialize it again. 
- # %% @task -def to_numpy(ds: Annotated[StructuredDataset, subset_cols]) -> Annotated[StructuredDataset, subset_cols, PARQUET]: - numpy_array = ds.open(np.ndarray).all() - return StructuredDataset(dataframe=numpy_array) +def generate_pd_df_with_str() -> pd.DataFrame: + return pd.DataFrame({"Name": ["Tom", "Joseph"]}) -# %% [markdown] -# Finally, we define two workflows that showcase how a `pandas.DataFrame` and `FlyteSchema` are accepted by the {py:class}`StructuredDataset`. -# %% -@workflow -def pandas_compatibility_wf(a: int) -> Annotated[StructuredDataset, subset_cols]: - df = get_df(a=a) - ds = get_subset_df(df=df) # noqa: shown for demonstration; users should use the same types between tasks - return to_numpy(ds=ds) +@task +def to_numpy(sd: StructuredDataset) -> Annotated[StructuredDataset, None, PARQUET]: + numpy_array = sd.open(np.ndarray).all() + return StructuredDataset(dataframe=numpy_array) @workflow -def schema_compatibility_wf(a: int) -> Annotated[StructuredDataset, subset_cols]: - df = get_schema_df(a=a) - ds = get_subset_df(df=df) # noqa: shown for demonstration; users should use the same types between tasks - return to_numpy(ds=ds) +def numpy_wf() -> Annotated[StructuredDataset, None, PARQUET]: + return to_numpy(sd=generate_pd_df_with_str()) # %% [markdown] -# You can run the code locally as follows: +# :::{note} +# `pyarrow` raises an `Expected bytes, got a 'int' object` error when the dataframe contains integers. +# ::: # +# You can run the code locally as follows: # %% if __name__ == "__main__": - numpy_array_one = pandas_compatibility_wf(a=42).open(np.ndarray).all() - print(f"pandas DataFrame compatibility check output: {numpy_array_one}") - numpy_array_two = schema_compatibility_wf(a=42).open(np.ndarray).all() - print(f"Schema compatibility check output: {numpy_array_two}") + sd = simple_sd_wf() + print(f"A simple Pandas dataframe workflow: {sd.open(pd.DataFrame).all()}") + print(f"Using CSV as the serializer: {pandas_to_csv_wf().open(pd.DataFrame).all()}") + print(f"NumPy encoder and decoder: {numpy_wf().open(np.ndarray).all()}") diff --git a/examples/data_types_and_io/data_types_and_io/typed_schema.py b/examples/data_types_and_io/data_types_and_io/typed_schema.py deleted file mode 100644 index 7a09a6e99..000000000 --- a/examples/data_types_and_io/data_types_and_io/typed_schema.py +++ /dev/null @@ -1,69 +0,0 @@ -# %% [markdown] -# (typed_schema)= -# -# # Typed Columns in a Schema -# -# ```{eval-rst} -# .. tags:: DataFrame, Basic, Data -# ``` -# -# This example explains how a typed schema can be used in Flyte and declared in flytekit. -# %% -import pandas -from flytekit import kwtypes, task, workflow - -# %% [markdown] -# Flytekit consists of some pre-built type extensions, one of them is the FlyteSchema type -# %% -from flytekit.types.schema import FlyteSchema - -# %% [markdown] -# FlyteSchema is an abstract Schema type that can be used to represent any structured dataset which has typed -# (or untyped) columns -# %% -out_schema = FlyteSchema[kwtypes(x=int, y=str)] - - -# %% [markdown] -# To write to a schema object refer to `FlyteSchema.open` method. Writing can be done -# using any of the supported dataframe formats. -# -# ```{eval-rst} -# .. todo:: -# -# Reference the supported dataframe formats here -# ``` -# %% -@task -def t1() -> out_schema: - w = out_schema() - df = pandas.DataFrame(data={"x": [1, 2], "y": ["3", "4"]}) - w.open().write(df) - return w - - -# %% [markdown] -# To read a Schema, one has to invoke the `FlyteSchema.open`. 
The default mode -# is automatically configured to be `open` and the default returned dataframe type is {py:class}`pandas.DataFrame` -# Different types of dataframes can be returned based on the type passed into the open method -# %% -@task -def t2(schema: FlyteSchema[kwtypes(x=int, y=str)]) -> FlyteSchema[kwtypes(x=int)]: - assert isinstance(schema, FlyteSchema) - df: pandas.DataFrame = schema.open().all() - return df[schema.column_names()[:-1]] - - -@workflow -def wf() -> FlyteSchema[kwtypes(x=int)]: - return t2(schema=t1()) - - -# %% [markdown] -# Local execution will convert the data to and from the serialized representation thus, mimicking a complete distributed -# execution. -# -# %% -if __name__ == "__main__": - print(f"Running {__file__} main...") - print(f"Running wf(), returns columns {wf().columns()}") diff --git a/examples/data_types_and_io/requirements.in b/examples/data_types_and_io/requirements.in deleted file mode 100644 index cc489e0e3..000000000 --- a/examples/data_types_and_io/requirements.in +++ /dev/null @@ -1,6 +0,0 @@ -flytekit>=1.7.0 -wheel -matplotlib -flytekitplugins-deck-standard -scikit-learn -torch diff --git a/examples/data_types_and_io/requirements.txt b/examples/data_types_and_io/requirements.txt deleted file mode 100644 index 5496972cb..000000000 --- a/examples/data_types_and_io/requirements.txt +++ /dev/null @@ -1,607 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.8 -# by the following command: -# -# pip-compile --resolver=backtracking requirements.in -# -adlfs==2023.4.0 - # via flytekit -aiobotocore==2.5.0 - # via s3fs -aiohttp==3.8.4 - # via - # adlfs - # aiobotocore - # gcsfs - # s3fs -aioitertools==0.11.0 - # via aiobotocore -aiosignal==1.3.1 - # via aiohttp -arrow==1.2.3 - # via jinja2-time -asttokens==2.2.1 - # via stack-data -async-timeout==4.0.2 - # via aiohttp -attrs==23.1.0 - # via - # aiohttp - # visions -azure-core==1.27.1 - # via - # adlfs - # azure-identity - # azure-storage-blob -azure-datalake-store==0.0.53 - # via adlfs -azure-identity==1.13.0 - # via adlfs -azure-storage-blob==12.16.0 - # via adlfs -backcall==0.2.0 - # via ipython -binaryornot==0.4.4 - # via cookiecutter -botocore==1.29.76 - # via aiobotocore -cachetools==5.3.1 - # via google-auth -certifi==2023.5.7 - # via - # kubernetes - # requests -cffi==1.15.1 - # via - # azure-datalake-store - # cryptography -chardet==5.1.0 - # via binaryornot -charset-normalizer==3.1.0 - # via - # aiohttp - # requests -click==8.1.3 - # via - # cookiecutter - # flytekit - # rich-click -cloudpickle==2.2.1 - # via flytekit -cmake==3.27.6 - # via triton -comm==0.1.3 - # via ipykernel -contourpy==1.1.0 - # via matplotlib -cookiecutter==2.1.1 - # via flytekit -croniter==1.4.1 - # via flytekit -cryptography==41.0.1 - # via - # azure-identity - # azure-storage-blob - # msal - # pyjwt - # pyopenssl - # secretstorage -cycler==0.11.0 - # via matplotlib -dacite==1.8.1 - # via ydata-profiling -dataclasses-json==0.5.8 - # via flytekit -debugpy==1.6.7 - # via ipykernel -decorator==5.1.1 - # via - # gcsfs - # ipython -deprecated==1.2.14 - # via flytekit -diskcache==5.6.1 - # via flytekit -docker==6.1.3 - # via flytekit -docker-image-py==0.1.12 - # via flytekit -docstring-parser==0.15 - # via flytekit -executing==1.2.0 - # via stack-data -filelock==3.12.2 - # via - # torch - # triton -flyteidl==1.5.11 - # via flytekit -flytekit==1.7.0 - # via - # -r requirements.in - # flytekitplugins-deck-standard -flytekitplugins-deck-standard==1.7.0 - # via -r requirements.in -fonttools==4.40.0 - # via 
matplotlib -frozenlist==1.3.3 - # via - # aiohttp - # aiosignal -fsspec==2023.6.0 - # via - # adlfs - # flytekit - # gcsfs - # s3fs -gcsfs==2023.6.0 - # via flytekit -gitdb==4.0.10 - # via gitpython -gitpython==3.1.31 - # via flytekit -google-api-core==2.11.1 - # via - # google-cloud-core - # google-cloud-storage -google-auth==2.20.0 - # via - # gcsfs - # google-api-core - # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage - # kubernetes -google-auth-oauthlib==1.0.0 - # via gcsfs -google-cloud-core==2.3.2 - # via google-cloud-storage -google-cloud-storage==2.9.0 - # via gcsfs -google-crc32c==1.5.0 - # via google-resumable-media -google-resumable-media==2.5.0 - # via google-cloud-storage -googleapis-common-protos==1.59.1 - # via - # flyteidl - # flytekit - # google-api-core - # grpcio-status -grpcio==1.56.0 - # via - # flytekit - # grpcio-status -grpcio-status==1.56.0 - # via flytekit -htmlmin==0.1.12 - # via ydata-profiling -idna==3.4 - # via - # requests - # yarl -imagehash==4.3.1 - # via - # visions - # ydata-profiling -importlib-metadata==6.7.0 - # via - # flytekit - # keyring -ipykernel==6.23.3 - # via ipywidgets -ipython==8.12.2 - # via - # ipykernel - # ipywidgets -ipywidgets==8.0.6 - # via flytekitplugins-deck-standard -isodate==0.6.1 - # via azure-storage-blob -jaraco-classes==3.2.3 - # via keyring -jedi==0.18.2 - # via ipython -jeepney==0.8.0 - # via - # keyring - # secretstorage -jinja2==3.1.2 - # via - # cookiecutter - # jinja2-time - # torch - # ydata-profiling -jinja2-time==0.2.0 - # via cookiecutter -jmespath==1.0.1 - # via botocore -joblib==1.2.0 - # via - # flytekit - # phik - # scikit-learn -jupyter-client==8.3.0 - # via ipykernel -jupyter-core==5.3.1 - # via - # ipykernel - # jupyter-client -jupyterlab-widgets==3.0.7 - # via ipywidgets -keyring==24.1.0 - # via flytekit -kiwisolver==1.4.4 - # via matplotlib -kubernetes==26.1.0 - # via flytekit -lit==17.0.2 - # via triton -markdown==3.4.3 - # via flytekitplugins-deck-standard -markdown-it-py==3.0.0 - # via rich -markupsafe==2.1.3 - # via jinja2 -marshmallow==3.19.0 - # via - # dataclasses-json - # marshmallow-enum - # marshmallow-jsonschema -marshmallow-enum==1.5.1 - # via dataclasses-json -marshmallow-jsonschema==0.13.0 - # via flytekit -matplotlib==3.7.1 - # via - # -r requirements.in - # phik - # seaborn - # wordcloud - # ydata-profiling -matplotlib-inline==0.1.6 - # via - # ipykernel - # ipython -mdurl==0.1.2 - # via markdown-it-py -more-itertools==9.1.0 - # via jaraco-classes -mpmath==1.3.0 - # via sympy -msal==1.22.0 - # via - # azure-datalake-store - # azure-identity - # msal-extensions -msal-extensions==1.0.0 - # via azure-identity -multidict==6.0.4 - # via - # aiohttp - # yarl -multimethod==1.9.1 - # via - # visions - # ydata-profiling -mypy-extensions==1.0.0 - # via typing-inspect -natsort==8.4.0 - # via flytekit -nest-asyncio==1.5.6 - # via ipykernel -networkx==3.1 - # via - # torch - # visions -numpy==1.23.5 - # via - # contourpy - # flytekit - # imagehash - # matplotlib - # pandas - # patsy - # phik - # pyarrow - # pywavelets - # scikit-learn - # scipy - # seaborn - # statsmodels - # visions - # wordcloud - # ydata-profiling -nvidia-cublas-cu11==11.10.3.66 - # via - # nvidia-cudnn-cu11 - # nvidia-cusolver-cu11 - # torch -nvidia-cuda-cupti-cu11==11.7.101 - # via torch -nvidia-cuda-nvrtc-cu11==11.7.99 - # via torch -nvidia-cuda-runtime-cu11==11.7.99 - # via torch -nvidia-cudnn-cu11==8.5.0.96 - # via torch -nvidia-cufft-cu11==10.9.0.58 - # via torch -nvidia-curand-cu11==10.2.10.91 - # via torch 
-nvidia-cusolver-cu11==11.4.0.1 - # via torch -nvidia-cusparse-cu11==11.7.4.91 - # via torch -nvidia-nccl-cu11==2.14.3 - # via torch -nvidia-nvtx-cu11==11.7.91 - # via torch -oauthlib==3.2.2 - # via requests-oauthlib -packaging==23.1 - # via - # docker - # ipykernel - # marshmallow - # matplotlib - # plotly - # statsmodels -pandas==1.5.3 - # via - # flytekit - # phik - # seaborn - # statsmodels - # visions - # ydata-profiling -parso==0.8.3 - # via jedi -patsy==0.5.3 - # via statsmodels -pexpect==4.8.0 - # via ipython -phik==0.12.3 - # via ydata-profiling -pickleshare==0.7.5 - # via ipython -pillow==10.0.1 - # via - # imagehash - # matplotlib - # visions - # wordcloud -platformdirs==3.8.0 - # via jupyter-core -plotly==5.15.0 - # via flytekitplugins-deck-standard -portalocker==2.7.0 - # via msal-extensions -prompt-toolkit==3.0.38 - # via ipython -protobuf==4.23.3 - # via - # flyteidl - # google-api-core - # googleapis-common-protos - # grpcio-status - # protoc-gen-swagger -protoc-gen-swagger==0.1.0 - # via flyteidl -psutil==5.9.5 - # via ipykernel -ptyprocess==0.7.0 - # via pexpect -pure-eval==0.2.2 - # via stack-data -pyarrow==10.0.1 - # via flytekit -pyasn1==0.5.0 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.3.0 - # via google-auth -pycparser==2.21 - # via cffi -pydantic==1.10.9 - # via ydata-profiling -pygments==2.15.1 - # via - # ipython - # rich -pyjwt[crypto]==2.7.0 - # via msal -pyopenssl==23.2.0 - # via flytekit -pyparsing==3.1.0 - # via matplotlib -python-dateutil==2.8.2 - # via - # arrow - # botocore - # croniter - # flytekit - # jupyter-client - # kubernetes - # matplotlib - # pandas -python-json-logger==2.0.7 - # via flytekit -python-slugify==8.0.1 - # via cookiecutter -pytimeparse==1.1.8 - # via flytekit -pytz==2023.3 - # via - # flytekit - # pandas -pywavelets==1.4.1 - # via imagehash -pyyaml==6.0 - # via - # cookiecutter - # flytekit - # kubernetes - # responses - # ydata-profiling -pyzmq==25.1.0 - # via - # ipykernel - # jupyter-client -regex==2023.6.3 - # via docker-image-py -requests==2.31.0 - # via - # azure-core - # azure-datalake-store - # cookiecutter - # docker - # flytekit - # gcsfs - # google-api-core - # google-cloud-storage - # kubernetes - # msal - # requests-oauthlib - # responses - # ydata-profiling -requests-oauthlib==1.3.1 - # via - # google-auth-oauthlib - # kubernetes -responses==0.23.1 - # via flytekit -rich==13.4.2 - # via - # flytekit - # rich-click -rich-click==1.6.1 - # via flytekit -rsa==4.9 - # via google-auth -s3fs==2023.6.0 - # via flytekit -scikit-learn==1.2.2 - # via -r requirements.in -scipy==1.10.1 - # via - # imagehash - # phik - # scikit-learn - # statsmodels - # ydata-profiling -seaborn==0.12.2 - # via ydata-profiling -secretstorage==3.3.3 - # via keyring -six==1.16.0 - # via - # asttokens - # azure-core - # azure-identity - # google-auth - # isodate - # kubernetes - # patsy - # python-dateutil -smmap==5.0.0 - # via gitdb -sortedcontainers==2.4.0 - # via flytekit -stack-data==0.6.2 - # via ipython -statsd==3.3.0 - # via flytekit -statsmodels==0.14.0 - # via ydata-profiling -sympy==1.12 - # via torch -tangled-up-in-unicode==0.2.0 - # via visions -tenacity==8.2.2 - # via plotly -text-unidecode==1.3 - # via python-slugify -threadpoolctl==3.1.0 - # via scikit-learn -torch==2.0.1 - # via - # -r requirements.in - # triton -tornado==6.3.2 - # via - # ipykernel - # jupyter-client -tqdm==4.65.0 - # via ydata-profiling -traitlets==5.9.0 - # via - # comm - # ipykernel - # ipython - # ipywidgets - # jupyter-client - # jupyter-core - # 
matplotlib-inline -triton==2.0.0 - # via torch -typeguard==2.13.3 - # via ydata-profiling -types-pyyaml==6.0.12.10 - # via responses -typing-extensions==4.6.3 - # via - # azure-core - # azure-storage-blob - # flytekit - # pydantic - # torch - # typing-inspect -typing-inspect==0.9.0 - # via dataclasses-json -urllib3==1.26.16 - # via - # botocore - # docker - # flytekit - # google-auth - # kubernetes - # requests - # responses -visions[type_image_path]==0.7.5 - # via ydata-profiling -wcwidth==0.2.6 - # via prompt-toolkit -websocket-client==1.6.1 - # via - # docker - # kubernetes -wheel==0.40.0 - # via - # -r requirements.in - # flytekit - # nvidia-cublas-cu11 - # nvidia-cuda-cupti-cu11 - # nvidia-cuda-runtime-cu11 - # nvidia-curand-cu11 - # nvidia-cusparse-cu11 - # nvidia-nvtx-cu11 -widgetsnbextension==4.0.7 - # via ipywidgets -wordcloud==1.9.2 - # via ydata-profiling -wrapt==1.15.0 - # via - # aiobotocore - # deprecated - # flytekit -yarl==1.9.2 - # via aiohttp -ydata-profiling==4.3.1 - # via flytekitplugins-deck-standard -zipp==3.15.0 - # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/examples/development_lifecycle/development_lifecycle/decks.py b/examples/development_lifecycle/development_lifecycle/decks.py index 563e17d9e..1a8f7fd49 100644 --- a/examples/development_lifecycle/development_lifecycle/decks.py +++ b/examples/development_lifecycle/development_lifecycle/decks.py @@ -45,7 +45,8 @@ # %% [markdown] -# :::{note} +# :::{important} +# Replace `ghcr.io/flyteorg` with a container registry you've access to publish to. # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`. # ::: # diff --git a/examples/extending/README.md b/examples/extending/README.md index 8a5bdc82f..ad3a619c7 100644 --- a/examples/extending/README.md +++ b/examples/extending/README.md @@ -14,7 +14,7 @@ For example, Flyte supports adding support for a dataframe type from a new libra grouping of images in a specific encoding. Flytekit natively supports structured data like {py:func}`~dataclasses.dataclass` using JSON as the -representation format (see {ref}`Using Custom Python Objects `). +representation format (see {ref}`Using Custom Python Objects `). Flytekit allows users to extend Flyte's type system and implement types in Python that are not representable as JSON documents. The user has to implement a {py:class}`~flytekit.extend.TypeTransformer` class to enable the translation of type from user type to Flyte-understood type. 
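To make the `TypeTransformer` extension point described above more concrete, here is a minimal, illustrative sketch (not an official flytesnacks example) of a transformer for a hypothetical `Coordinates` class that is stored as a plain string literal. The imports from `flytekit.extend` and `flytekit.models` reflect the usual extension entry points, but treat the exact class and method signatures as assumptions to verify against the flytekit version you are using.

```python
# Illustrative sketch: a custom TypeTransformer for a hypothetical Coordinates class.
# Assumes the standard flytekit extension hooks (flytekit.extend.TypeTransformer / TypeEngine).
import json
from typing import Type

from flytekit import FlyteContext, task
from flytekit.extend import TypeEngine, TypeTransformer
from flytekit.models.literals import Literal, Primitive, Scalar
from flytekit.models.types import LiteralType, SimpleType


class Coordinates:
    """A user-defined type that Flyte does not understand natively."""

    def __init__(self, lat: float, lon: float):
        self.lat = lat
        self.lon = lon


class CoordinatesTransformer(TypeTransformer[Coordinates]):
    def __init__(self):
        super().__init__(name="coordinates-transformer", t=Coordinates)

    def get_literal_type(self, t: Type[Coordinates]) -> LiteralType:
        # Represent Coordinates as a simple string literal on the Flyte side
        return LiteralType(simple=SimpleType.STRING)

    def to_literal(
        self,
        ctx: FlyteContext,
        python_val: Coordinates,
        python_type: Type[Coordinates],
        expected: LiteralType,
    ) -> Literal:
        # Python object -> Flyte literal (here, a JSON-encoded string)
        payload = json.dumps({"lat": python_val.lat, "lon": python_val.lon})
        return Literal(scalar=Scalar(primitive=Primitive(string_value=payload)))

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[Coordinates]
    ) -> Coordinates:
        # Flyte literal -> Python object
        payload = json.loads(lv.scalar.primitive.string_value)
        return Coordinates(lat=payload["lat"], lon=payload["lon"])


# Register the transformer so the type engine uses it for Coordinates
TypeEngine.register(CoordinatesTransformer())


@task
def bangalore() -> Coordinates:
    return Coordinates(lat=12.97, lon=77.59)
```

Once registered, the hypothetical `Coordinates` type can appear directly in task and workflow signatures, and Flytekit routes values through this transformer instead of falling back to the pickle transformer.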
diff --git a/examples/k8s_dask_plugin/README.md b/examples/k8s_dask_plugin/README.md
index 616bd13a4..713e810c2 100644
--- a/examples/k8s_dask_plugin/README.md
+++ b/examples/k8s_dask_plugin/README.md
@@ -231,9 +231,8 @@ It's important to be aware that neither the job runner nor the scheduler will be
 To run the provided example on the Flyte cluster, use the following command:
 
 ```
-pyflyte run --remote \
-  https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/k8s_dask_plugin/k8s_dask_plugin/dask_example.py \
-  hello_dask --size 1000
+pyflyte run --remote dask_example.py \
+  hello_dask --size 1000
 ```
 
 ```{auto-examples-toc}
diff --git a/examples/k8s_dask_plugin/k8s_dask_plugin/dask_example.py b/examples/k8s_dask_plugin/k8s_dask_plugin/dask_example.py
index 2dcfb0786..6e824b9e0 100644
--- a/examples/k8s_dask_plugin/k8s_dask_plugin/dask_example.py
+++ b/examples/k8s_dask_plugin/k8s_dask_plugin/dask_example.py
@@ -15,7 +15,8 @@
 custom_image = ImageSpec(name="flyte-dask-plugin", registry="ghcr.io/flyteorg", packages=["flytekitplugins-dask"])
 
 # %% [markdown]
-# :::{note}
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry you have access to publish to.
 # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
 # :::
 #
diff --git a/examples/k8s_spark_plugin/README.md b/examples/k8s_spark_plugin/README.md
index 5a757ff5c..3bdbf721c 100644
--- a/examples/k8s_spark_plugin/README.md
+++ b/examples/k8s_spark_plugin/README.md
@@ -173,14 +173,11 @@ For more comprehensive information, please consult the [configuration structure]
 To run the provided examples on the Flyte cluster, use any of the following commands:
 
 ```
-pyflyte run --remote \
-  https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/k8s_spark_plugin/k8s_spark_plugin/pyspark_pi.py \
-  my_spark
+pyflyte run --remote pyspark_pi.py my_spark
 ```
 
 ```
-pyflyte run --remote \
-  https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/k8s_spark_plugin/k8s_spark_plugin/dataframe_passing.py \
+pyflyte run --remote dataframe_passing.py \
   my_smart_structured_dataset
 ```
diff --git a/examples/k8s_spark_plugin/k8s_spark_plugin/dataframe_passing.py b/examples/k8s_spark_plugin/k8s_spark_plugin/dataframe_passing.py
index 368d5558a..47d73f163 100644
--- a/examples/k8s_spark_plugin/k8s_spark_plugin/dataframe_passing.py
+++ b/examples/k8s_spark_plugin/k8s_spark_plugin/dataframe_passing.py
@@ -26,7 +26,8 @@
 custom_image = ImageSpec(name="flyte-spark-plugin", registry="ghcr.io/flyteorg")
 
 # %% [markdown]
-# :::{note}
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry you have access to publish to.
 # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
 # :::
 #
@@ -79,7 +80,7 @@ def spark_df() -> Annotated[StructuredDataset, columns]:
 # `spark_df` represents a Spark task executed within a Spark context, leveraging an active Spark cluster.
 #
 # This task yields a `pyspark.DataFrame` object, even though the return type is specified as
-# {ref}`StructuredDataset `.
+# {ref}`StructuredDataset `.
 # The Flytekit type system handles the automatic conversion of the `pyspark.DataFrame` into a `StructuredDataset` object.
 # The `StructuredDataset` object serves as an abstract representation of a DataFrame, adaptable to various DataFrame formats.
 
@@ -117,7 +118,7 @@ def spark_to_pandas_wf() -> int:
 
 # %% [markdown]
 # New DataFrames can be dynamically loaded through the type engine.
-# To register a custom DataFrame type, you can define an encoder and decoder for `StructuredDataset` as outlined in the {ref}`structured_dataset_example` example.
+# To register a custom DataFrame type, you can define an encoder and decoder for `StructuredDataset` as outlined in the {ref}`structured_dataset` example.
 #
 # Existing DataFrame plugins include:
 #
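The `dataframe_passing.py` hunks above lean on the `Annotated[StructuredDataset, columns]` pattern. A self-contained sketch of the same pattern with pandas, so it runs without a Spark cluster, is shown below; the column names and values are made up for illustration.

```python
# A hedged sketch of passing dataframes as StructuredDataset with column metadata.
import pandas as pd
from flytekit import StructuredDataset, kwtypes, task, workflow
from typing_extensions import Annotated

# Column metadata the type engine can use to validate and convert the dataframe.
columns = kwtypes(name=str, age=int)


@task
def make_df() -> Annotated[StructuredDataset, columns]:
    # Wrapping the pandas DataFrame lets Flyte serialize it to a common format
    # (e.g. Parquet) behind the scenes.
    return StructuredDataset(dataframe=pd.DataFrame({"name": ["Ada", "Grace"], "age": [36, 45]}))


@task
def consume_df(sd: Annotated[StructuredDataset, columns]) -> int:
    # open() declares which in-memory DataFrame type to materialize.
    df = sd.open(pd.DataFrame).all()
    return len(df)


@workflow
def wf() -> int:
    return consume_df(sd=make_df())
```
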
diff --git a/examples/k8s_spark_plugin/k8s_spark_plugin/pyspark_pi.py b/examples/k8s_spark_plugin/k8s_spark_plugin/pyspark_pi.py
index c50e51f7c..0dd500a68 100644
--- a/examples/k8s_spark_plugin/k8s_spark_plugin/pyspark_pi.py
+++ b/examples/k8s_spark_plugin/k8s_spark_plugin/pyspark_pi.py
@@ -20,7 +20,8 @@
 
 # %% [markdown]
-# :::{note}
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry you have access to publish to.
 # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
 # :::
 #
diff --git a/examples/kfpytorch_plugin/README.md b/examples/kfpytorch_plugin/README.md
index cbaa60002..c24490ac6 100644
--- a/examples/kfpytorch_plugin/README.md
+++ b/examples/kfpytorch_plugin/README.md
@@ -24,8 +24,7 @@ To enable the plugin in the backend, follow instructions outlined in the {std:re
 To run the provided example on the Flyte cluster, use the following command:
 
 ```
-pyflyte run --remote \
-  https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/kfpytorch_plugin/kfpytorch_plugin/pytorch_mnist.py \
+pyflyte run --remote pytorch_mnist.py \
   pytorch_training_wf
 ```
 
diff --git a/examples/kfpytorch_plugin/kfpytorch_plugin/pytorch_mnist.py b/examples/kfpytorch_plugin/kfpytorch_plugin/pytorch_mnist.py
index 9fd59461e..cd59e72c6 100644
--- a/examples/kfpytorch_plugin/kfpytorch_plugin/pytorch_mnist.py
+++ b/examples/kfpytorch_plugin/kfpytorch_plugin/pytorch_mnist.py
@@ -29,7 +29,8 @@
 )
 
 # %% [markdown]
-# :::{note}
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry you have access to publish to.
 # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
 # :::
 #
diff --git a/examples/kftensorflow_plugin/README.md b/examples/kftensorflow_plugin/README.md
index a137c40db..636d0514d 100644
--- a/examples/kftensorflow_plugin/README.md
+++ b/examples/kftensorflow_plugin/README.md
@@ -24,8 +24,7 @@ To enable the plugin in the backend, follow instructions outlined in the {std:re
 To run the provided example on the Flyte cluster, use the following command:
 
 ```
-pyflyte run --remote \
-  https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/kftensorflow_plugin/kftensorflow_plugin/tf_mnist.py \
+pyflyte run --remote tf_mnist.py \
   mnist_tensorflow_workflow
 ```
 
diff --git a/examples/kftensorflow_plugin/kftensorflow_plugin/tf_mnist.py b/examples/kftensorflow_plugin/kftensorflow_plugin/tf_mnist.py
index 3b5f69783..1749f7916 100644
--- a/examples/kftensorflow_plugin/kftensorflow_plugin/tf_mnist.py
+++ b/examples/kftensorflow_plugin/kftensorflow_plugin/tf_mnist.py
@@ -30,7 +30,8 @@
 )
 
 # %% [markdown]
-# :::{note}
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry you have access to publish to.
 # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
 # :::
 #
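The `:::{important}` note repeated in the hunks above amounts to a one-line change in the `ImageSpec` call. A hedged sketch of that swap for the local demo cluster follows; the image name and package list are placeholders rather than values taken from this patch.

```python
# A minimal ImageSpec sketch for the demo-cluster registry mentioned in the notes above.
from flytekit import ImageSpec, task

custom_image = ImageSpec(
    name="flyte-example",            # illustrative image name
    registry="localhost:30000",      # demo-cluster registry; otherwise use a registry you can publish to
    packages=["flytekitplugins-kfpytorch"],  # placeholder dependency list
)


@task(container_image=custom_image)
def hello() -> str:
    # The task runs in the image built and pushed from the spec above.
    return "built and pushed with ImageSpec"
```
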
diff --git a/examples/mnist_classifier/Dockerfile b/examples/mnist_classifier/Dockerfile
index 30b0501c7..c4845818e 100644
--- a/examples/mnist_classifier/Dockerfile
+++ b/examples/mnist_classifier/Dockerfile
@@ -16,12 +16,6 @@ RUN pip install awscli
 # Install gcloud for GCP
 RUN apt-get update && apt-get install -y make build-essential libssl-dev curl
 
-WORKDIR /opt
-RUN curl https://sdk.cloud.google.com > install.sh
-RUN bash /opt/install.sh --install-dir=/opt
-ENV PATH $PATH:/opt/google-cloud-sdk/bin
-WORKDIR /root
-
 # Virtual environment
 ENV VENV /opt/venv
 RUN python3 -m venv ${VENV}
diff --git a/examples/pandera_plugin/README.md b/examples/pandera_plugin/README.md
index c79e9bf8d..6fd022e21 100644
--- a/examples/pandera_plugin/README.md
+++ b/examples/pandera_plugin/README.md
@@ -4,8 +4,8 @@
 .. tags:: Integration, DataFrame, Data, Intermediate
 ```
 
-Flytekit python natively supports {ref}`many data types `,
-including a {ref}`FlyteSchema ` type for
+Flytekit python natively supports {ref}`many data types `,
+including a `FlyteSchema` type for
 type-annotating pandas dataframes. The flytekit pandera plugin provides an alternative for
 defining dataframe schemas by integrating with [pandera](https://pandera.readthedocs.io/en/stable/), which is a runtime
 data validation tool for pandas dataframes.
diff --git a/examples/ray_plugin/README.md b/examples/ray_plugin/README.md
index 3402ca58d..3ad2a65a8 100644
--- a/examples/ray_plugin/README.md
+++ b/examples/ray_plugin/README.md
@@ -72,9 +72,8 @@ To enable the plugin in the backend, refer to the instructions provided in the {
 To run the provided example on the Flyte cluster, use the following command:
 
 ```
-pyflyte run --remote \
-  https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/ray_plugin/ray_plugin/ray_example.py \
-  ray_workflow --n 10
+pyflyte run --remote ray_example.py \
+  ray_workflow --n 10
 ```
 
 ```{auto-examples-toc}
diff --git a/examples/ray_plugin/ray_plugin/ray_example.py b/examples/ray_plugin/ray_plugin/ray_example.py
index 9f3a2fc41..fa6bc10fe 100644
--- a/examples/ray_plugin/ray_plugin/ray_example.py
+++ b/examples/ray_plugin/ray_plugin/ray_example.py
@@ -36,7 +36,8 @@
 )
 
 # %% [markdown]
-# :::{note}
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry you have access to publish to.
 # To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
 # :::
 #
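
Since the pandera README hunk above only alludes to the plugin's schema-based alternative, here is a rough sketch of what a pandera-validated task can look like, assuming `flytekitplugins-pandera` is installed and a pandera version that provides `DataFrameModel`; the `Passengers` schema and its columns are invented for illustration.

```python
# A hedged sketch of dataframe validation via the pandera plugin; names are illustrative.
import flytekitplugins.pandera  # noqa: F401  (importing registers the pandera type transformer)
import pandas as pd
import pandera as pa
from flytekit import task, workflow
from pandera.typing import DataFrame, Series


class Passengers(pa.DataFrameModel):
    name: Series[str]
    age: Series[int] = pa.Field(ge=0)


@task
def load() -> DataFrame[Passengers]:
    # The returned frame is validated against the schema at the task boundary.
    return pd.DataFrame({"name": ["Ada", "Grace"], "age": [36, 45]})


@workflow
def wf() -> DataFrame[Passengers]:
    return load()
```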