From eafdc93ab1053c3d772585a044eab50483154141 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Wed, 18 Dec 2024 10:32:53 +0000 Subject: [PATCH] feat!: Exclude views from output of find tables (#1355) * tests: Add view dvt_core_type_vw for Oracle and PostgreSQL * feat: Exclude views from Ibis sa.list_tables method * tests: Add more find_tables with views tests * feat: Exclude views from Teradata list_tables method * Update third_party/ibis/ibis_addon/operations.py Co-authored-by: Helen Cristina * Update tests/system/data_sources/test_postgres.py Co-authored-by: Helen Cristina * chore: PR comments * feat: Prepare list_tables to support listing only tables or both tables and views * feat: find-tables supports --include-views * feat: Avoid dvt_list_tables when using pandas (for unit tests) * feat: Add pass-through dvt_list_tables on Hive backend --------- Co-authored-by: Helen Cristina --- data_validation/cli_tools.py | 13 ++- data_validation/clients.py | 17 ++-- data_validation/find_tables.py | 17 +++- tests/resources/bigquery_test_tables.sql | 3 + tests/resources/hive_test_tables.sql | 3 + tests/resources/mysql_test_tables.sql | 4 + tests/resources/oracle_test_tables.sql | 3 + tests/resources/postgresql_test_tables.sql | 3 + tests/resources/snowflake_test_tables.sql | 3 + tests/resources/sqlserver_test_tables.sql | 4 + tests/resources/teradata_test_tables.sql | 4 + tests/system/data_sources/common_functions.py | 85 +++++++++++++++++-- tests/system/data_sources/test_hive.py | 11 +++ tests/system/data_sources/test_mysql.py | 50 ++++++++++- tests/system/data_sources/test_oracle.py | 73 ++++++++++++---- tests/system/data_sources/test_postgres.py | 67 ++++++++++++--- tests/system/data_sources/test_snowflake.py | 11 +++ tests/system/data_sources/test_spanner.py | 6 ++ tests/system/data_sources/test_sql_server.py | 16 +--- tests/system/data_sources/test_teradata.py | 44 +++++++++- tests/unit/test_cli_tools.py | 8 ++ third_party/ibis/ibis_addon/operations.py | 11 +++ third_party/ibis/ibis_biquery/api.py | 5 ++ .../ibis/ibis_cloud_spanner/__init__.py | 3 + .../ibis/ibis_cloud_spanner/tests/ddl.sql | 9 ++ third_party/ibis/ibis_impala/api.py | 6 ++ third_party/ibis/ibis_teradata/__init__.py | 9 +- 27 files changed, 420 insertions(+), 68 deletions(-) diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index 06ca15172..3ac24c7e5 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -354,17 +354,24 @@ def _configure_deploy(subparsers): def _configure_find_tables(subparsers): """Configure arguments for text search table matching.""" find_tables_parser = subparsers.add_parser( - "find-tables", help="Build tables list using approx string matching" + "find-tables", help="Build tables list using approx string matching." ) find_tables_parser.add_argument( - "--source-conn", "-sc", help="Source connection name" + "--source-conn", "-sc", help="Source connection name." ) find_tables_parser.add_argument( - "--target-conn", "-tc", help="Target connection name" + "--target-conn", "-tc", help="Target connection name." ) find_tables_parser.add_argument( "--allowed-schemas", "-as", help="List of source schemas to match." ) + find_tables_parser.add_argument( + "--include-views", + "-iv", + default=False, + action="store_true", + help="Include views in results.", + ) find_tables_parser.add_argument( "--score-cutoff", "-score", diff --git a/data_validation/clients.py b/data_validation/clients.py index fd5ff4c2a..546b48260 100644 --- a/data_validation/clients.py +++ b/data_validation/clients.py @@ -233,14 +233,19 @@ def list_schemas(client): return [None] -def list_tables(client, schema_name): +def list_tables(client, schema_name, tables_only=True): """Return a list of tables in the DB schema.""" - if client.name in ["db2", "mssql", "redshift", "snowflake"]: - return client.list_tables() - return client.list_tables(database=schema_name) + fn = ( + client.dvt_list_tables + if tables_only and client.name != "pandas" + else client.list_tables + ) + if client.name in ["db2", "mssql", "redshift", "snowflake", "pandas"]: + return fn() + return fn(database=schema_name) -def get_all_tables(client, allowed_schemas=None): +def get_all_tables(client, allowed_schemas=None, tables_only=True): """Return a list of tuples with database and table names. client (IbisClient): Client to use for tables @@ -252,7 +257,7 @@ def get_all_tables(client, allowed_schemas=None): if allowed_schemas and schema_name not in allowed_schemas: continue try: - tables = list_tables(client, schema_name) + tables = list_tables(client, schema_name, tables_only=tables_only) except Exception as e: logging.warning(f"List Tables Error: {schema_name} -> {e}") continue diff --git a/data_validation/find_tables.py b/data_validation/find_tables.py index 7d850e8ba..3477ff48e 100644 --- a/data_validation/find_tables.py +++ b/data_validation/find_tables.py @@ -60,10 +60,14 @@ def _compare_match_tables(source_table_map, target_table_map, score_cutoff=0.8) return table_configs -def _get_table_map(client: "ibis.backends.base.BaseBackend", allowed_schemas=None): +def _get_table_map( + client: "ibis.backends.base.BaseBackend", allowed_schemas=None, include_views=False +): """Return dict with searchable keys for table matching.""" table_map = {} - table_objs = clients.get_all_tables(client, allowed_schemas=allowed_schemas) + table_objs = clients.get_all_tables( + client, allowed_schemas=allowed_schemas, tables_only=(not include_views) + ) for table_obj in table_objs: table_key = ".".join([t for t in table_obj if t]) @@ -79,11 +83,14 @@ def get_mapped_table_configs( source_client: "ibis.backends.base.BaseBackend", target_client: "ibis.backends.base.BaseBackend", allowed_schemas: list = None, + include_views: bool = False, score_cutoff: int = 1, ) -> list: """Get table list from each client and match them together into a single list of dicts.""" - source_table_map = _get_table_map(source_client, allowed_schemas=allowed_schemas) - target_table_map = _get_table_map(target_client) + source_table_map = _get_table_map( + source_client, allowed_schemas=allowed_schemas, include_views=include_views + ) + target_table_map = _get_table_map(target_client, include_views=include_views) return _compare_match_tables( source_table_map, target_table_map, score_cutoff=score_cutoff ) @@ -102,6 +109,7 @@ def find_tables_using_string_matching(args) -> str: source_client, target_client, allowed_schemas=allowed_schemas, + include_views=args.include_views, score_cutoff=score_cutoff, ) return json.dumps(table_configs) @@ -141,6 +149,7 @@ def expand_tables_of_asterisk( source_client, target_client, allowed_schemas=[mapping[consts.CONFIG_SCHEMA_NAME]], + include_views=False, ) new_list.extend(expanded_tables) else: diff --git a/tests/resources/bigquery_test_tables.sql b/tests/resources/bigquery_test_tables.sql index 1b0ead689..9d8daf140 100644 --- a/tests/resources/bigquery_test_tables.sql +++ b/tests/resources/bigquery_test_tables.sql @@ -52,6 +52,9 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES ,DATE '1970-01-03',DATETIME '1970-01-03 00:00:03' ,TIMESTAMP '1970-01-03 00:00:03-03:00'); +CREATE VIEW `pso_data_validator`.`dvt_core_types_vw` AS +SELECT * FROM `pso_data_validator`.`dvt_core_types`; + DROP TABLE `pso_data_validator`.`dvt_null_not_null`; CREATE TABLE `pso_data_validator`.`dvt_null_not_null` ( col_nn DATETIME NOT NULL diff --git a/tests/resources/hive_test_tables.sql b/tests/resources/hive_test_tables.sql index 39886d645..7cc032cff 100644 --- a/tests/resources/hive_test_tables.sql +++ b/tests/resources/hive_test_tables.sql @@ -49,6 +49,9 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES ,'Hello DVT','C ','Hello DVT' ,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03'); +CREATE VIEW `pso_data_validator`.`dvt_core_types_vw` AS +SELECT * FROM `pso_data_validator`.`dvt_core_types`; + DROP TABLE `pso_data_validator`.`dvt_null_not_null`; CREATE TABLE `pso_data_validator`.`dvt_null_not_null` diff --git a/tests/resources/mysql_test_tables.sql b/tests/resources/mysql_test_tables.sql index a1f987315..2c7a45dcb 100644 --- a/tests/resources/mysql_test_tables.sql +++ b/tests/resources/mysql_test_tables.sql @@ -90,6 +90,10 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES ,'Hello DVT','C ','Hello DVT' ,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03'); +CREATE VIEW `pso_data_validator`.`dvt_core_types_vw` AS +SELECT * FROM `pso_data_validator`.`dvt_core_types`; + + DROP TABLE IF EXISTS `pso_data_validator`.`dvt_null_not_null`; CREATE TABLE `pso_data_validator`.`dvt_null_not_null` ( col_nn datetime(0) NOT NULL diff --git a/tests/resources/oracle_test_tables.sql b/tests/resources/oracle_test_tables.sql index 2287333df..937837d24 100644 --- a/tests/resources/oracle_test_tables.sql +++ b/tests/resources/oracle_test_tables.sql @@ -68,6 +68,9 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES ,to_timestamp_tz('1970-01-03 00:00:03 -03:00','YYYY-MM-DD HH24:MI:SS TZH:TZM')); COMMIT; +CREATE VIEW pso_data_validator.dvt_core_types_vw AS +SELECT * FROM pso_data_validator.dvt_core_types; + DROP TABLE pso_data_validator.dvt_null_not_null; CREATE TABLE pso_data_validator.dvt_null_not_null ( col_nn TIMESTAMP(0) NOT NULL diff --git a/tests/resources/postgresql_test_tables.sql b/tests/resources/postgresql_test_tables.sql index d3073bbb1..f7be4ef4d 100644 --- a/tests/resources/postgresql_test_tables.sql +++ b/tests/resources/postgresql_test_tables.sql @@ -51,6 +51,9 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES ,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03' ,TIMESTAMP WITH TIME ZONE'1970-01-03 00:00:03 -03:00'); +CREATE VIEW pso_data_validator.dvt_core_types_vw AS +SELECT * FROM pso_data_validator.dvt_core_types; + DROP TABLE IF EXISTS pso_data_validator.dvt_ora2pg_types; CREATE TABLE pso_data_validator.dvt_ora2pg_types ( id int NOT NULL PRIMARY KEY diff --git a/tests/resources/snowflake_test_tables.sql b/tests/resources/snowflake_test_tables.sql index a122a948f..a1e6d129f 100644 --- a/tests/resources/snowflake_test_tables.sql +++ b/tests/resources/snowflake_test_tables.sql @@ -48,6 +48,9 @@ INSERT INTO PSO_DATA_VALIDATOR.PUBLIC.DVT_CORE_TYPES VALUES ,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03' ,'1970-01-03 00:00:03 -03:00'); +CREATE OR REPLACE VIEW PSO_DATA_VALIDATOR.PUBLIC.DVT_CORE_TYPES_VW AS +SELECT * FROM PSO_DATA_VALIDATOR.PUBLIC.DVT_CORE_TYPES; + CREATE OR REPLACE TABLE PSO_DATA_VALIDATOR.PUBLIC.TEST_GENERATE_PARTITIONS ( COURSE_ID VARCHAR(6), diff --git a/tests/resources/sqlserver_test_tables.sql b/tests/resources/sqlserver_test_tables.sql index 0759de3f0..954f6326a 100644 --- a/tests/resources/sqlserver_test_tables.sql +++ b/tests/resources/sqlserver_test_tables.sql @@ -55,6 +55,10 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES ,'1970-01-03','1970-01-03 00:00:03' ,cast('1970-01-03 00:00:03 -03:00' as datetimeoffset(3))); +CREATE VIEW pso_data_validator.dvt_core_types_vw AS +SELECT * FROM pso_data_validator.dvt_core_types; + + DROP TABLE pso_data_validator.dvt_null_not_null; CREATE TABLE pso_data_validator.dvt_null_not_null ( col_nn datetime2(0) NOT NULL diff --git a/tests/resources/teradata_test_tables.sql b/tests/resources/teradata_test_tables.sql index f7f6da5a1..51b94ba8f 100644 --- a/tests/resources/teradata_test_tables.sql +++ b/tests/resources/teradata_test_tables.sql @@ -52,6 +52,10 @@ INSERT INTO udf.dvt_core_types VALUES ,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03' ,CAST('1970-01-03 00:00:03.000-03:00' AS TIMESTAMP(3) WITH TIME ZONE)); +CREATE VIEW udf.dvt_core_types_vw AS +SELECT * FROM udf.dvt_core_types; + + DROP TABLE udf.dvt_null_not_null; CREATE TABLE udf.dvt_null_not_null ( col_nn TIMESTAMP(0) NOT NULL diff --git a/tests/system/data_sources/common_functions.py b/tests/system/data_sources/common_functions.py index a96c8b5d3..e8e4a37dd 100644 --- a/tests/system/data_sources/common_functions.py +++ b/tests/system/data_sources/common_functions.py @@ -18,11 +18,14 @@ import pathlib from data_validation import __main__ as main -from data_validation import consts, data_validation, raw_query - from data_validation import ( cli_tools, + consts, + data_validation, + find_tables, + raw_query, ) + from data_validation.partition_builder import PartitionBuilder if TYPE_CHECKING: @@ -30,6 +33,26 @@ from pandas import DataFrame +DVT_CORE_TYPES_COLUMNS = [ + "id", + "col_int8", + "col_int16", + "col_int32", + "col_int64", + "col_dec_20", + "col_dec_38", + "col_dec_10_2", + "col_float32", + "col_float64", + "col_varchar_30", + "col_char_2", + "col_string", + "col_date", + "col_datetime", + "col_tstz", +] + + def id_type_test_assertions(df, expected_rows=5): """Standard assertions for assorted primary key type integration tests.""" # Should be expected_rows rows in the df all with status success. @@ -147,7 +170,13 @@ def row_validation_many_columns_test( assert len(df) == 0 -def find_tables_assertions(command_output: str): +def find_tables_assertions( + command_output: str, + expected_source_schema: str = "pso_data_validator", + expected_target_schema: str = "pso_data_validator", + include_views: bool = False, + check_for_view: bool = True, +): assert isinstance(command_output, str) assert command_output output_dict = json.loads(command_output) @@ -155,11 +184,57 @@ def find_tables_assertions(command_output: str): assert isinstance(output_dict, list) assert isinstance(output_dict[0], dict) assert len(output_dict) > 1 - assert all(_["schema_name"] == "pso_data_validator" for _ in output_dict) - assert all(_["target_schema_name"] == "pso_data_validator" for _ in output_dict) + assert all(_["schema_name"] == expected_source_schema for _ in output_dict) + assert all(_["target_schema_name"] == expected_target_schema for _ in output_dict) # Assert that a couple of known tables are in the map. assert "dvt_core_types" in [_["table_name"] for _ in output_dict] assert "dvt_core_types" in [_["target_table_name"] for _ in output_dict] + if check_for_view: + source_names = [_["table_name"] for _ in output_dict] + target_names = [_["target_table_name"] for _ in output_dict] + if include_views: + # Assert that known view is IN the map. + assert ( + "dvt_core_types_vw" in source_names + ), f"dvt_core_types_vw should be in command source output: {source_names}" + assert ( + "dvt_core_types_vw" in target_names + ), f"dvt_core_types_vw should be in command target output: {target_names}" + else: + # Assert that known view is NOT IN the map. + assert ( + "dvt_core_types_vw" not in source_names + ), f"dvt_core_types_vw should NOT be in command source output: {source_names}" + assert ( + "dvt_core_types_vw" not in target_names + ), f"dvt_core_types_vw should NOT be in command target output: {target_names}" + + +def find_tables_test( + tc: str = "bq-conn", + allowed_schema: str = "pso_data_validator", + include_views: bool = False, + check_for_view: bool = True, +): + """Generic find-tables test""" + parser = cli_tools.configure_arg_parser() + cli_arg_list = [ + "find-tables", + "-sc=mock-conn", + f"-tc={tc}", + f"--allowed-schemas={allowed_schema}", + "--include-views" if include_views else None, + ] + cli_arg_list = [_ for _ in cli_arg_list if _] + args = parser.parse_args(cli_arg_list) + output = find_tables.find_tables_using_string_matching(args) + find_tables_assertions( + output, + include_views=include_views, + check_for_view=check_for_view, + expected_source_schema=allowed_schema, + expected_target_schema=allowed_schema, + ) def schema_validation_test( diff --git a/tests/system/data_sources/test_hive.py b/tests/system/data_sources/test_hive.py index dfcb877de..272230f86 100644 --- a/tests/system/data_sources/test_hive.py +++ b/tests/system/data_sources/test_hive.py @@ -21,6 +21,7 @@ from data_validation import cli_tools, data_validation, consts from tests.system.data_sources.common_functions import ( binary_key_assertions, + find_tables_test, id_type_test_assertions, null_not_null_assertions, row_validation_many_columns_test, @@ -371,3 +372,13 @@ def test_row_validation_hash_bool_to_bigquery(): tc="bq-conn", hash="*", ) + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_find_tables(): + """Hive to BigQuery test of find-tables command.""" + # check_for_view=False because there is no practical way to exclude views on Hive. + find_tables_test(check_for_view=False) diff --git a/tests/system/data_sources/test_mysql.py b/tests/system/data_sources/test_mysql.py index 008bca7cc..17c793a90 100644 --- a/tests/system/data_sources/test_mysql.py +++ b/tests/system/data_sources/test_mysql.py @@ -21,6 +21,7 @@ from data_validation import cli_tools, data_validation, consts, exceptions from tests.system.data_sources.common_functions import ( binary_key_assertions, + find_tables_test, id_type_test_assertions, null_not_null_assertions, raw_query_test, @@ -33,6 +34,7 @@ ) from tests.system.data_sources.test_bigquery import BQ_CONN from tests.system.data_sources.common_functions import ( + DVT_CORE_TYPES_COLUMNS, partition_table_test, partition_query_test, ) @@ -191,6 +193,18 @@ def test_schema_validation_core_types_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_schema_validation_view_core_types_vw(): + """MySQL to MySQL view dvt_core_types_vw schema validation""" + schema_validation_test( + tables="pso_data_validator.dvt_core_types_vw", + tc="mock-conn", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -235,7 +249,13 @@ def test_column_validation_core_types_to_bigquery(): """MySQL to BigQuery dvt_core_types column validation""" # TODO Change --sum, --min and --max options to include col_char_2 when issue-842 is complete. # We've excluded col_float32 because BigQuery does not have an exact same type and float32/64 are lossy and cannot be compared. - cols = "col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz" + cols = ",".join( + [ + _ + for _ in DVT_CORE_TYPES_COLUMNS + if _ not in ("id", "col_float32", "col_char_2") + ] + ) column_validation_test( tc="bq-conn", filters="id>0 AND col_int8>0", @@ -246,6 +266,25 @@ def test_column_validation_core_types_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_column_validation_view_core_types_vw(): + """MySQL to MySQL view dvt_core_types_vw column validation""" + cols = ",".join([_ for _ in DVT_CORE_TYPES_COLUMNS if _ not in ("id")]) + column_validation_test( + tc="mock-conn", + tables="pso_data_validator.dvt_core_types_vw", + count_cols=cols, + sum_cols=cols, + min_cols=cols, + max_cols=cols, + filters="id>0 AND col_int8>0", + grouped_columns="col_varchar_30", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -411,3 +450,12 @@ def test_custom_query_row_validation_many_columns(): def test_raw_query_dvt_row_types(capsys): """Test data-validation query command.""" raw_query_test(capsys) + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_find_tables(): + """MySQL to MySQL test of find-tables command.""" + find_tables_test(tc="mock-conn") diff --git a/tests/system/data_sources/test_oracle.py b/tests/system/data_sources/test_oracle.py index 4e8e42e76..f1fbd2c2f 100644 --- a/tests/system/data_sources/test_oracle.py +++ b/tests/system/data_sources/test_oracle.py @@ -18,12 +18,12 @@ import pytest import pathlib -from data_validation import cli_tools, data_validation, consts, find_tables +from data_validation import cli_tools, data_validation, consts from tests.system.data_sources.common_functions import ( binary_key_assertions, column_validation_test, column_validation_test_config_managers, - find_tables_assertions, + find_tables_test, id_type_test_assertions, null_not_null_assertions, raw_query_test, @@ -36,6 +36,7 @@ from tests.system.data_sources.test_bigquery import BQ_CONN from tests.system.data_sources.test_postgres import CONN as PG_CONN from tests.system.data_sources.common_functions import ( + DVT_CORE_TYPES_COLUMNS, partition_table_test, partition_query_test, ) @@ -214,6 +215,18 @@ def test_schema_validation_core_types_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_schema_validation_view_core_types_vw(): + """Oracle to Oracle view dvt_core_types_vw schema validation""" + schema_validation_test( + tables="pso_data_validator.dvt_core_types_vw", + tc="mock-conn", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -253,7 +266,7 @@ def test_schema_validation_oracle_to_postgres(): ) def test_column_validation_core_types(): """Oracle to Oracle dvt_core_types column validation""" - cols = "col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz" + cols = ",".join([_ for _ in DVT_CORE_TYPES_COLUMNS if _ not in ("id")]) column_validation_test( tc="mock-conn", tables="pso_data_validator.dvt_core_types", @@ -274,7 +287,9 @@ def test_column_validation_core_types_to_bigquery(): """Oracle to BigQuery dvt_core_types column validation""" # Excluded col_float32 because BigQuery does not have an exact same type and # float32/64 are lossy and cannot be compared. - cols = "col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz" + cols = ",".join( + [_ for _ in DVT_CORE_TYPES_COLUMNS if _ not in ("id", "col_float32")] + ) column_validation_test( tc="bq-conn", tables="pso_data_validator.dvt_core_types", @@ -316,6 +331,25 @@ def test_column_validation_oracle_to_postgres(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_column_validation_view_core_types_vw(): + """Oracle to Oracle view dvt_core_types_vw column validation""" + cols = ",".join([_ for _ in DVT_CORE_TYPES_COLUMNS if _ not in ("id")]) + column_validation_test( + tc="mock-conn", + tables="pso_data_validator.dvt_core_types_vw", + count_cols=cols, + sum_cols=cols, + min_cols=cols, + max_cols=cols, + filters="id>0 AND col_int8>0", + grouped_columns="col_varchar_30", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -349,10 +383,14 @@ def test_row_validation_core_types_auto_pks(): def test_row_validation_core_types_to_bigquery(): """Oracle to BigQuery dvt_core_types row validation""" # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE. - row_validation_test( - tc="bq-conn", - hash="col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", + cols = ",".join( + [ + _ + for _ in DVT_CORE_TYPES_COLUMNS + if _ not in ("id", "col_float32", "col_float64") + ] ) + row_validation_test(tc="bq-conn", hash=cols) @mock.patch( @@ -636,17 +674,16 @@ def test_custom_query_row_validation_oracle_to_postgres(): ) def test_find_tables(): """Oracle to BigQuery test of find-tables command.""" - parser = cli_tools.configure_arg_parser() - args = parser.parse_args( - [ - "find-tables", - "-sc=mock-conn", - "-tc=bq-conn", - "--allowed-schemas=pso_data_validator", - ] - ) - output = find_tables.find_tables_using_string_matching(args) - find_tables_assertions(output) + find_tables_test() + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_find_views_and_tables(): + """Oracle to BigQuery test of find-tables command.""" + find_tables_test(include_views=True) @mock.patch( diff --git a/tests/system/data_sources/test_postgres.py b/tests/system/data_sources/test_postgres.py index dc48ad1c0..ce9d8a13e 100644 --- a/tests/system/data_sources/test_postgres.py +++ b/tests/system/data_sources/test_postgres.py @@ -22,14 +22,14 @@ cli_tools, data_validation, consts, - find_tables, ) from tests.system.data_sources.deploy_cloudsql.cloudsql_resource_manager import ( CloudSQLResourceManager, ) from tests.system.data_sources.common_functions import ( + DVT_CORE_TYPES_COLUMNS, binary_key_assertions, - find_tables_assertions, + find_tables_test, id_type_test_assertions, null_not_null_assertions, raw_query_test, @@ -591,6 +591,18 @@ def test_schema_validation_core_types_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_schema_validation_view_core_types_vw(): + """PostgreSQL to PostgreSQL view dvt_core_types_vw schema validation""" + schema_validation_test( + tables="pso_data_validator.dvt_core_types_vw", + tc="mock-conn", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -653,7 +665,13 @@ def test_column_validation_pg_types(): def test_column_validation_core_types_to_bigquery(): # We've excluded col_float32 because BigQuery does not have an exact same type and float32/64 are lossy and cannot be compared. # TODO Change --sum and --max options to include col_char_2 when issue-842 is complete. - cols = "col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz" + cols = ",".join( + [ + _ + for _ in DVT_CORE_TYPES_COLUMNS + if _ not in ("id", "col_float32", "col_char_2") + ] + ) column_validation_test( tc="bq-conn", tables="pso_data_validator.dvt_core_types", @@ -663,6 +681,28 @@ def test_column_validation_core_types_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_column_validation_view_core_types_vw(): + """PostgreSQL to PostgreSQL view dvt_core_types_vw column validation""" + # TODO Change --sum and --max options to include col_char_2 when issue-842 is complete. + cols = ",".join( + [_ for _ in DVT_CORE_TYPES_COLUMNS if _ not in ("id", "col_char_2")] + ) + column_validation_test( + tc="mock-conn", + tables="pso_data_validator.dvt_core_types_vw", + count_cols=cols, + sum_cols=cols, + min_cols=cols, + max_cols=cols, + filters="id>0 AND col_int8>0", + grouped_columns="col_varchar_30", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -819,17 +859,16 @@ def test_custom_query_validation_core_types(): ) def test_find_tables(): """PostgreSQL to BigQuery test of find-tables command.""" - parser = cli_tools.configure_arg_parser() - args = parser.parse_args( - [ - "find-tables", - "-sc=mock-conn", - "-tc=bq-conn", - "--allowed-schemas=pso_data_validator", - ] - ) - output = find_tables.find_tables_using_string_matching(args) - find_tables_assertions(output) + find_tables_test() + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_find_views_and_tables(): + """PostgreSQL to BigQuery test of find-tables command.""" + find_tables_test(include_views=True) @mock.patch( diff --git a/tests/system/data_sources/test_snowflake.py b/tests/system/data_sources/test_snowflake.py index 3545983b1..a51a0f024 100644 --- a/tests/system/data_sources/test_snowflake.py +++ b/tests/system/data_sources/test_snowflake.py @@ -21,6 +21,7 @@ from data_validation import cli_tools, data_validation, consts from tests.system.data_sources.common_functions import ( binary_key_assertions, + find_tables_test, id_type_test_assertions, null_not_null_assertions, raw_query_test, @@ -393,6 +394,16 @@ def test_custom_query_validation_core_types(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_find_tables(): + """Snowflake to Snowflake test of find-tables command.""" + pytest.skip("Skipping test_find_tables until issue 1198 has been resolved.") + find_tables_test(tc="mock-conn", allowed_schema="PSO_DATA_VALIDATOR.PUBLIC") + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, diff --git a/tests/system/data_sources/test_spanner.py b/tests/system/data_sources/test_spanner.py index e5f96debc..3594bb097 100644 --- a/tests/system/data_sources/test_spanner.py +++ b/tests/system/data_sources/test_spanner.py @@ -224,6 +224,12 @@ def test_cli_find_tables(): "target_schema_name": "pso_data_validator", "target_table_name": "students_pointer", } in tables + assert { + "schema_name": "pso_data_validator", + "table_name": "dvt_core_types_vw", + "target_schema_name": "pso_data_validator", + "target_table_name": "dvt_core_types_vw", + } not in tables @mock.patch( diff --git a/tests/system/data_sources/test_sql_server.py b/tests/system/data_sources/test_sql_server.py index 1db7335c0..8dd0429c3 100644 --- a/tests/system/data_sources/test_sql_server.py +++ b/tests/system/data_sources/test_sql_server.py @@ -21,10 +21,10 @@ from tests.system.data_sources.deploy_cloudsql.cloudsql_resource_manager import ( CloudSQLResourceManager, ) -from data_validation import cli_tools, data_validation, consts, find_tables +from data_validation import cli_tools, data_validation, consts from tests.system.data_sources.common_functions import ( binary_key_assertions, - find_tables_assertions, + find_tables_test, id_type_test_assertions, null_not_null_assertions, row_validation_many_columns_test, @@ -495,17 +495,7 @@ def test_custom_query_row_hash_validation_core_types_to_bigquery(): def test_find_tables(): """SQL Server to BigQuery test of find-tables command.""" pytest.skip("Skipping test_find_tables until issue 1198 has been resolved.") - parser = cli_tools.configure_arg_parser() - args = parser.parse_args( - [ - "find-tables", - "-sc=mock-conn", - "-tc=bq-conn", - "--allowed-schemas=pso_data_validator", - ] - ) - output = find_tables.find_tables_using_string_matching(args) - find_tables_assertions(output) + find_tables_test() @mock.patch( diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py index 7abcf3f11..a22d5b1c5 100644 --- a/tests/system/data_sources/test_teradata.py +++ b/tests/system/data_sources/test_teradata.py @@ -20,17 +20,19 @@ from data_validation import cli_tools, data_validation, consts from tests.system.data_sources.common_functions import ( + DVT_CORE_TYPES_COLUMNS, binary_key_assertions, column_validation_test, custom_query_validation_test, + find_tables_test, id_type_test_assertions, null_not_null_assertions, partition_table_test, partition_query_test, raw_query_test, row_validation_many_columns_test, - run_test_from_cli_args, row_validation_test, + run_test_from_cli_args, schema_validation_test, ) from tests.system.data_sources.test_bigquery import BQ_CONN @@ -286,7 +288,18 @@ def test_column_validation_core_types(): ) def test_column_validation_core_types_to_bigquery(): """Teradata to BigQuery dvt_core_types column validation""" - cols = "col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime" + cols = ",".join( + [ + _ + for _ in DVT_CORE_TYPES_COLUMNS + if _ + not in ( + "id", + "col_float32", + "col_tstz", + ) + ] + ) column_validation_test( tc="bq-conn", tables="udf.dvt_core_types=pso_data_validator.dvt_core_types", @@ -311,6 +324,24 @@ def test_column_validation_time_table_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_column_validation_view_core_types_vw(): + """Teradata to Teradata view dvt_core_types_vw column validation""" + column_validation_test( + tc="mock-conn", + tables="udf.dvt_core_types_vw", + count_cols="*", + sum_cols="*", + min_cols="*", + max_cols="*", + filters="id>0 AND col_int8>0", + grouped_columns="col_varchar_30", + ) + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, @@ -617,6 +648,15 @@ def test_custom_query_row_hash_validation_core_types_to_bigquery(): ) +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_find_tables(): + """Teradata to Teradata test of find-tables command.""" + find_tables_test(tc="mock-conn", allowed_schema="udf") + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, diff --git a/tests/unit/test_cli_tools.py b/tests/unit/test_cli_tools.py index e6cec969d..60b73e5cc 100644 --- a/tests/unit/test_cli_tools.py +++ b/tests/unit/test_cli_tools.py @@ -254,6 +254,14 @@ def test_find_tables_config(): assert allowed_schemas[0] == "my_schema" +def test_find_tables_incl_views(): + parser = cli_tools.configure_arg_parser() + args = parser.parse_args(CLI_FIND_TABLES_ARGS) + assert not args.include_views + args = parser.parse_args(CLI_FIND_TABLES_ARGS + ["--include-views"]) + assert args.include_views + + @pytest.mark.parametrize( "test_input,expected", [ diff --git a/third_party/ibis/ibis_addon/operations.py b/third_party/ibis/ibis_addon/operations.py index 733fd1da2..453a895f6 100644 --- a/third_party/ibis/ibis_addon/operations.py +++ b/third_party/ibis/ibis_addon/operations.py @@ -31,6 +31,7 @@ import ibis.expr.rules as rlz import pandas as pd import sqlalchemy as sa +from ibis.backends.base.sql.alchemy import BaseAlchemyBackend from ibis.backends.base.sql.alchemy.registry import _cast as sa_fixed_cast from ibis.backends.base.sql.alchemy.registry import fixed_arity as sa_fixed_arity from ibis.backends.base.sql.alchemy.translator import AlchemyExprTranslator @@ -618,6 +619,12 @@ def execute_epoch_seconds_new(op, data, **kwargs): return epoch_series +def _dvt_list_tables(self, like=None, database=None) -> list: + """Alternative to BaseAlchemyBackend.list_tables that does not include views in the result.""" + tables = self.inspector.get_table_names(schema=database) + return self._filter_with_like(tables, like) + + execute_epoch_seconds = execute_epoch_seconds_new BinaryValue.byte_length = compile_binary_length @@ -625,6 +632,10 @@ def execute_epoch_seconds_new(op, data, **kwargs): NumericValue.to_char = compile_to_char TemporalValue.to_char = compile_to_char +# This is an additional DVT only method. We tag this onto BaseAlchemyBackend +# so we can piggy back Ibis code rather than writing metadata queries for all engines. +BaseAlchemyBackend.dvt_list_tables = _dvt_list_tables + BigQueryExprTranslator._registry[HashBytes] = format_hashbytes_bigquery BigQueryExprTranslator._registry[RawSQL] = format_raw_sql BigQueryExprTranslator._registry[Strftime] = strftime_bigquery diff --git a/third_party/ibis/ibis_biquery/api.py b/third_party/ibis/ibis_biquery/api.py index bf9b48c14..94b5c726c 100644 --- a/third_party/ibis/ibis_biquery/api.py +++ b/third_party/ibis/ibis_biquery/api.py @@ -7,4 +7,9 @@ def _list_primary_key_columns(self, database: str, table: str) -> list: return None +def _dvt_list_tables(self, like=None, database=None): + return self.list_tables(like=like, database=database) + + BigQueryBackend.list_primary_key_columns = _list_primary_key_columns +BigQueryBackend.dvt_list_tables = _dvt_list_tables diff --git a/third_party/ibis/ibis_cloud_spanner/__init__.py b/third_party/ibis/ibis_cloud_spanner/__init__.py index c3bb18308..960f7facf 100644 --- a/third_party/ibis/ibis_cloud_spanner/__init__.py +++ b/third_party/ibis/ibis_cloud_spanner/__init__.py @@ -133,6 +133,9 @@ def list_tables(self, like=None, database=None): ] return tables + def dvt_list_tables(self, like=None, database=None): + return self.list_tables(like=like, database=database) + def exists_table(self, name, database=None): if database is None: diff --git a/third_party/ibis/ibis_cloud_spanner/tests/ddl.sql b/third_party/ibis/ibis_cloud_spanner/tests/ddl.sql index 703a5e3b9..c032a9c9f 100644 --- a/third_party/ibis/ibis_cloud_spanner/tests/ddl.sql +++ b/third_party/ibis/ibis_cloud_spanner/tests/ddl.sql @@ -72,6 +72,15 @@ CREATE TABLE dvt_core_types ( , col_tstz TIMESTAMP ) PRIMARY KEY (id); +CREATE VIEW dvt_core_types_vw +SQL SECURITY DEFINER +AS +SELECT t.id,t.col_int8,t.col_int16,t.col_int32,t.col_int64 +,t.col_dec_20,t.col_dec_38,t.col_dec_10_2,t.col_float32 +,t.col_float64,t.col_varchar_30,t.col_char_2,t.col_string +,t.col_date,t.col_datetime,t.col_tstz +FROM dvt_core_types AS t; + --Integration test table used to test both binary pk matching and binary hash/concat comparisons. CREATE TABLE dvt_binary ( binary_id BYTES(MAX) NOT NULL diff --git a/third_party/ibis/ibis_impala/api.py b/third_party/ibis/ibis_impala/api.py index e432a3968..66e8d9a5f 100644 --- a/third_party/ibis/ibis_impala/api.py +++ b/third_party/ibis/ibis_impala/api.py @@ -235,12 +235,18 @@ def _list_primary_key_columns(self, database: str, table: str) -> list: return None +def _dvt_list_tables(self, like=None, database=None): + """dvt_list_tables is a pass through on Hadoop due to "show tables" behaviour.""" + return self.list_tables(like=like, database=database) + + udf.parse_type = parse_type ibis.backends.impala._chunks_to_pandas_array = _chunks_to_pandas_array ImpalaBackend.get_schema = get_schema ImpalaBackend._get_schema_using_query = _get_schema_using_query ImpalaBackend.do_connect = do_connect ImpalaBackend.list_primary_key_columns = _list_primary_key_columns +ImpalaBackend.dvt_list_tables = _dvt_list_tables def impala_connect( diff --git a/third_party/ibis/ibis_teradata/__init__.py b/third_party/ibis/ibis_teradata/__init__.py index b37c6e3cf..5ec90410f 100644 --- a/third_party/ibis/ibis_teradata/__init__.py +++ b/third_party/ibis/ibis_teradata/__init__.py @@ -94,18 +94,23 @@ def list_databases(self, like=None): SELECT * FROM DBC.Tables WHERE DatabaseName LIKE '%{database_like}%' AND TableName LIKE '%{table_like}%' + AND TableKind LIKE '{kind_like}' """ - def list_tables(self, like=None, database=None): + def list_tables(self, like=None, database=None, kind_like: str = "%") -> list: database = database or "" table = like or "" list_table_sql = self.LIST_TABLE_SQL.format( - database_like=database, table_like=table + database_like=database, table_like=table, kind_like=kind_like ) tables_df = self._execute(list_table_sql, results=True) return list(tables_df.TableName.str.rstrip()) + def dvt_list_tables(self, like=None, database=None) -> list: + """Duplicate of list_tables() but only returning tables in the output.""" + return self.list_tables(like=like, database=database, kind_like="T") + def _fully_qualified_name(self, name, database): if database: return "{}.{}".format(database, name)