Skip to content

Commit

Permalink
feat!: Exclude views from output of find tables (#1355)
Browse files Browse the repository at this point in the history
* tests: Add view dvt_core_type_vw for Oracle and PostgreSQL

* feat: Exclude views from Ibis sa.list_tables method

* tests: Add more find_tables with views tests

* feat: Exclude views from Teradata list_tables method

* Update third_party/ibis/ibis_addon/operations.py

Co-authored-by: Helen Cristina <[email protected]>

* Update tests/system/data_sources/test_postgres.py

Co-authored-by: Helen Cristina <[email protected]>

* chore: PR comments

* feat: Prepare list_tables to support listing only tables or both tables and views

* feat: find-tables supports --include-views

* feat: Avoid dvt_list_tables when using pandas (for unit tests)

* feat: Add pass-through dvt_list_tables on Hive backend

---------

Co-authored-by: Helen Cristina <[email protected]>
  • Loading branch information
nj1973 and helensilva14 authored Dec 18, 2024
1 parent 4646757 commit eafdc93
Show file tree
Hide file tree
Showing 27 changed files with 420 additions and 68 deletions.
13 changes: 10 additions & 3 deletions data_validation/cli_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,17 +354,24 @@ def _configure_deploy(subparsers):
def _configure_find_tables(subparsers):
"""Configure arguments for text search table matching."""
find_tables_parser = subparsers.add_parser(
"find-tables", help="Build tables list using approx string matching"
"find-tables", help="Build tables list using approx string matching."
)
find_tables_parser.add_argument(
"--source-conn", "-sc", help="Source connection name"
"--source-conn", "-sc", help="Source connection name."
)
find_tables_parser.add_argument(
"--target-conn", "-tc", help="Target connection name"
"--target-conn", "-tc", help="Target connection name."
)
find_tables_parser.add_argument(
"--allowed-schemas", "-as", help="List of source schemas to match."
)
find_tables_parser.add_argument(
"--include-views",
"-iv",
default=False,
action="store_true",
help="Include views in results.",
)
find_tables_parser.add_argument(
"--score-cutoff",
"-score",
Expand Down
17 changes: 11 additions & 6 deletions data_validation/clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,19 @@ def list_schemas(client):
return [None]


def list_tables(client, schema_name, tables_only=True):
    """Return a list of table names in the DB schema.

    Args:
        client: Ibis backend client to query.
        schema_name: Schema/database to list tables from. Ignored for
            backends that do not accept a database argument.
        tables_only: When True (default), exclude views by calling the
            backend's dvt_list_tables method where available.

    Returns:
        list: Table (and, when tables_only=False, view) names.
    """
    # The pandas backend (used by unit tests) has no dvt_list_tables
    # method, so it always falls back to the standard list_tables.
    fn = (
        client.dvt_list_tables
        if tables_only and client.name != "pandas"
        else client.list_tables
    )
    # These backends do not take a database/schema keyword argument.
    if client.name in ["db2", "mssql", "redshift", "snowflake", "pandas"]:
        return fn()
    return fn(database=schema_name)


def get_all_tables(client, allowed_schemas=None):
def get_all_tables(client, allowed_schemas=None, tables_only=True):
"""Return a list of tuples with database and table names.
client (IbisClient): Client to use for tables
Expand All @@ -252,7 +257,7 @@ def get_all_tables(client, allowed_schemas=None):
if allowed_schemas and schema_name not in allowed_schemas:
continue
try:
tables = list_tables(client, schema_name)
tables = list_tables(client, schema_name, tables_only=tables_only)
except Exception as e:
logging.warning(f"List Tables Error: {schema_name} -> {e}")
continue
Expand Down
17 changes: 13 additions & 4 deletions data_validation/find_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,14 @@ def _compare_match_tables(source_table_map, target_table_map, score_cutoff=0.8)
return table_configs


def _get_table_map(client: "ibis.backends.base.BaseBackend", allowed_schemas=None):
def _get_table_map(
client: "ibis.backends.base.BaseBackend", allowed_schemas=None, include_views=False
):
"""Return dict with searchable keys for table matching."""
table_map = {}
table_objs = clients.get_all_tables(client, allowed_schemas=allowed_schemas)
table_objs = clients.get_all_tables(
client, allowed_schemas=allowed_schemas, tables_only=(not include_views)
)

for table_obj in table_objs:
table_key = ".".join([t for t in table_obj if t])
def get_mapped_table_configs(
    source_client: "ibis.backends.base.BaseBackend",
    target_client: "ibis.backends.base.BaseBackend",
    allowed_schemas: list = None,
    include_views: bool = False,
    score_cutoff: int = 1,
) -> list:
    """Get table list from each client and match them together into a single list of dicts.

    Args:
        source_client: Ibis backend for the source system.
        target_client: Ibis backend for the target system.
        allowed_schemas: Optional list of source schemas to restrict matching to.
        include_views: When True, include views alongside tables in the maps.
        score_cutoff: Minimum fuzzy-match score for a source/target pairing.

    Returns:
        list: Matched table config dicts.
    """
    source_table_map = _get_table_map(
        source_client, allowed_schemas=allowed_schemas, include_views=include_views
    )
    # NOTE: allowed_schemas applies only to the source side; the target map
    # is built over all schemas and matched by name.
    target_table_map = _get_table_map(target_client, include_views=include_views)
    return _compare_match_tables(
        source_table_map, target_table_map, score_cutoff=score_cutoff
    )
Expand All @@ -102,6 +109,7 @@ def find_tables_using_string_matching(args) -> str:
source_client,
target_client,
allowed_schemas=allowed_schemas,
include_views=args.include_views,
score_cutoff=score_cutoff,
)
return json.dumps(table_configs)
Expand Down Expand Up @@ -141,6 +149,7 @@ def expand_tables_of_asterisk(
source_client,
target_client,
allowed_schemas=[mapping[consts.CONFIG_SCHEMA_NAME]],
include_views=False,
)
new_list.extend(expanded_tables)
else:
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/bigquery_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES
,DATE '1970-01-03',DATETIME '1970-01-03 00:00:03'
,TIMESTAMP '1970-01-03 00:00:03-03:00');

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW `pso_data_validator`.`dvt_core_types_vw` AS
SELECT * FROM `pso_data_validator`.`dvt_core_types`;

DROP TABLE `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn DATETIME NOT NULL
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/hive_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03');

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW `pso_data_validator`.`dvt_core_types_vw` AS
SELECT * FROM `pso_data_validator`.`dvt_core_types`;


DROP TABLE `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
Expand Down
4 changes: 4 additions & 0 deletions tests/resources/mysql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03');

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW `pso_data_validator`.`dvt_core_types_vw` AS
SELECT * FROM `pso_data_validator`.`dvt_core_types`;


DROP TABLE IF EXISTS `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn datetime(0) NOT NULL
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/oracle_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,to_timestamp_tz('1970-01-03 00:00:03 -03:00','YYYY-MM-DD HH24:MI:SS TZH:TZM'));
COMMIT;

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW pso_data_validator.dvt_core_types_vw AS
SELECT * FROM pso_data_validator.dvt_core_types;

DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/postgresql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,TIMESTAMP WITH TIME ZONE'1970-01-03 00:00:03 -03:00');

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW pso_data_validator.dvt_core_types_vw AS
SELECT * FROM pso_data_validator.dvt_core_types;

DROP TABLE IF EXISTS pso_data_validator.dvt_ora2pg_types;
CREATE TABLE pso_data_validator.dvt_ora2pg_types
( id int NOT NULL PRIMARY KEY
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/snowflake_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ INSERT INTO PSO_DATA_VALIDATOR.PUBLIC.DVT_CORE_TYPES VALUES
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,'1970-01-03 00:00:03 -03:00');

-- View over DVT_CORE_TYPES used to test view exclusion/inclusion in find-tables.
CREATE OR REPLACE VIEW PSO_DATA_VALIDATOR.PUBLIC.DVT_CORE_TYPES_VW AS
SELECT * FROM PSO_DATA_VALIDATOR.PUBLIC.DVT_CORE_TYPES;


CREATE OR REPLACE TABLE PSO_DATA_VALIDATOR.PUBLIC.TEST_GENERATE_PARTITIONS (
COURSE_ID VARCHAR(6),
Expand Down
4 changes: 4 additions & 0 deletions tests/resources/sqlserver_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,'1970-01-03','1970-01-03 00:00:03'
,cast('1970-01-03 00:00:03 -03:00' as datetimeoffset(3)));

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW pso_data_validator.dvt_core_types_vw AS
SELECT * FROM pso_data_validator.dvt_core_types;


DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn datetime2(0) NOT NULL
Expand Down
4 changes: 4 additions & 0 deletions tests/resources/teradata_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ INSERT INTO udf.dvt_core_types VALUES
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,CAST('1970-01-03 00:00:03.000-03:00' AS TIMESTAMP(3) WITH TIME ZONE));

-- View over dvt_core_types used to test view exclusion/inclusion in find-tables.
CREATE VIEW udf.dvt_core_types_vw AS
SELECT * FROM udf.dvt_core_types;


DROP TABLE udf.dvt_null_not_null;
CREATE TABLE udf.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
Expand Down
85 changes: 80 additions & 5 deletions tests/system/data_sources/common_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,41 @@
import pathlib

from data_validation import __main__ as main
from data_validation import consts, data_validation, raw_query

from data_validation import (
cli_tools,
consts,
data_validation,
find_tables,
raw_query,
)

from data_validation.partition_builder import PartitionBuilder

if TYPE_CHECKING:
from argparse import Namespace
from pandas import DataFrame


# Column names of the shared dvt_core_types test table, used by the
# per-backend integration tests.
# NOTE(review): assumes the order matches the CREATE TABLE scripts under
# tests/resources/ — confirm against those scripts.
DVT_CORE_TYPES_COLUMNS = [
    "id",
    "col_int8",
    "col_int16",
    "col_int32",
    "col_int64",
    "col_dec_20",
    "col_dec_38",
    "col_dec_10_2",
    "col_float32",
    "col_float64",
    "col_varchar_30",
    "col_char_2",
    "col_string",
    "col_date",
    "col_datetime",
    "col_tstz",
]


def id_type_test_assertions(df, expected_rows=5):
"""Standard assertions for assorted primary key type integration tests."""
# Should be expected_rows rows in the df all with status success.
Expand Down Expand Up @@ -147,19 +170,71 @@ def row_validation_many_columns_test(
assert len(df) == 0


def find_tables_assertions(
    command_output: str,
    expected_source_schema: str = "pso_data_validator",
    expected_target_schema: str = "pso_data_validator",
    include_views: bool = False,
    check_for_view: bool = True,
):
    """Assert that find-tables JSON output is well formed and contains expected entries.

    Args:
        command_output: JSON string produced by the find-tables command.
        expected_source_schema: Value every entry's schema_name must equal.
        expected_target_schema: Value every entry's target_schema_name must equal.
        include_views: When True, expect the known view to be present in the
            output; when False, expect it to be absent.
        check_for_view: When False, skip the view presence/absence checks
            entirely (e.g. for backends that cannot exclude views).
    """
    assert isinstance(command_output, str)
    assert command_output
    output_dict = json.loads(command_output)
    assert output_dict
    assert isinstance(output_dict, list)
    assert isinstance(output_dict[0], dict)
    assert len(output_dict) > 1
    assert all(_["schema_name"] == expected_source_schema for _ in output_dict)
    assert all(_["target_schema_name"] == expected_target_schema for _ in output_dict)
    # Assert that a couple of known tables are in the map.
    assert "dvt_core_types" in [_["table_name"] for _ in output_dict]
    assert "dvt_core_types" in [_["target_table_name"] for _ in output_dict]
    if check_for_view:
        source_names = [_["table_name"] for _ in output_dict]
        target_names = [_["target_table_name"] for _ in output_dict]
        if include_views:
            # Assert that known view is IN the map.
            assert (
                "dvt_core_types_vw" in source_names
            ), f"dvt_core_types_vw should be in command source output: {source_names}"
            assert (
                "dvt_core_types_vw" in target_names
            ), f"dvt_core_types_vw should be in command target output: {target_names}"
        else:
            # Assert that known view is NOT IN the map.
            assert (
                "dvt_core_types_vw" not in source_names
            ), f"dvt_core_types_vw should NOT be in command source output: {source_names}"
            assert (
                "dvt_core_types_vw" not in target_names
            ), f"dvt_core_types_vw should NOT be in command target output: {target_names}"


def find_tables_test(
    tc: str = "bq-conn",
    allowed_schema: str = "pso_data_validator",
    include_views: bool = False,
    check_for_view: bool = True,
):
    """Run the find-tables command end to end and verify its JSON output.

    Builds a CLI argument list, parses it with the real argument parser,
    invokes find-tables string matching and delegates all result checks
    to find_tables_assertions.
    """
    arg_parser = cli_tools.configure_arg_parser()
    cli_args = [
        "find-tables",
        "-sc=mock-conn",
        f"-tc={tc}",
        f"--allowed-schemas={allowed_schema}",
    ]
    if include_views:
        cli_args.append("--include-views")
    parsed_args = arg_parser.parse_args(cli_args)
    command_output = find_tables.find_tables_using_string_matching(parsed_args)
    find_tables_assertions(
        command_output,
        include_views=include_views,
        check_for_view=check_for_view,
        expected_source_schema=allowed_schema,
        expected_target_schema=allowed_schema,
    )


def schema_validation_test(
Expand Down
11 changes: 11 additions & 0 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from data_validation import cli_tools, data_validation, consts
from tests.system.data_sources.common_functions import (
binary_key_assertions,
find_tables_test,
id_type_test_assertions,
null_not_null_assertions,
row_validation_many_columns_test,
Expand Down Expand Up @@ -371,3 +372,13 @@ def test_row_validation_hash_bool_to_bigquery():
tc="bq-conn",
hash="*",
)


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_find_tables():
    """Hive to BigQuery test of find-tables command."""
    # There is no practical way to exclude views on Hive, so the view
    # presence/absence checks are skipped for this backend.
    find_tables_test(tc="bq-conn", check_for_view=False)
Loading

0 comments on commit eafdc93

Please sign in to comment.