You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
The test tests/test_realtime.py::test_realtime_cache_different_settings[duckdb] failed on Python 3.9 in this run. Re-running the job allowed the test to pass, which is odd, as I can't see where any non-determinism could occur.
Error was: Binder Error: Values list "l" does not have a column named "tf_first_name".
Full test output
________________ test_realtime_cache_different_settings[duckdb] ________________
self = <splink.internals.duckdb.database_api.DuckDBAPI object at 0x7fb7780ef4c0>
final_sql = 'CREATE TABLE __splink__realtime_compare_records_aj2njrcn AS\nWITH __splink__compare_records_left AS (\n SELECT\n ..._adj_city,\n "email_l",\n "email_r",\n gamma_email,\n bf_email,\n match_key\nFROM __splink__df_match_weight_parts'
templated_name = '__splink__realtime_compare_records'
physical_name = '__splink__realtime_compare_records_aj2njrcn'
@final
def _log_and_run_sql_execution(
self, final_sql: str, templated_name: str, physical_name: str
) -> TablishType:
"""
Log some sql, then call _run_sql_execution()
Any errors will be converted to SplinkException with more detail
names are only relevant for logging, not execution
"""
logger.debug(execute_sql_logging_message_info(templated_name, physical_name))
logger.log(5, log_sql(final_sql))
try:
> return self._execute_sql_against_backend(final_sql)
splink/internals/database_api.py:63:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <splink.internals.duckdb.database_api.DuckDBAPI object at 0x7fb7780ef4c0>
final_sql = 'CREATE TABLE __splink__realtime_compare_records_aj2njrcn AS \nWITH\n\n__splink__compare_records_left as (\nselect * f..._tf_adj_city,"email_l","email_r",gamma_email,bf_email,match_key \n from __splink__df_match_weight_parts\n \n '
def _execute_sql_against_backend(self, final_sql: str) -> duckdb.DuckDBPyRelation:
> return self._con.sql(final_sql)
E duckdb.duckdb.BinderException: Binder Error: Values list "l" does not have a column named "tf_first_name"
E LINE 11: ...from __splink__compare_records_left_aj2njrcn),
E
E __splink__compare_records_rig...
E ^
splink/internals/duckdb/database_api.py:102: BinderException
The above exception was the direct cause of the following exception:
test_helpers = {'duckdb': (<class 'tests.helpers.DuckDBTestHelper'>, []), 'spark': (<class 'tests.helpers.SparkTestHelper'>, [<functi...ite': (<class 'tests.helpers.SQLiteTestHelper'>, []), 'postgres': (<class 'tests.helpers.PostgresTestHelper'>, [None])}
dialect = 'duckdb'
@mark_with_dialects_excluding()
def test_realtime_cache_different_settings(test_helpers, dialect):
helper = test_helpers[dialect]
db_api = helper.extra_linker_args()["db_api"]
df1 = pd.DataFrame(
[
{
"unique_id": 0,
"first_name": "Julia",
"surname": "Taylor",
"city": "London",
"email": "[email protected]",
}
]
)
df2 = pd.DataFrame(
[
{
"unique_id": 1,
"first_name": "Julia",
"surname": "Taylor",
"city": "London",
"email": "[email protected]",
}
]
)
settings_1 = SettingsCreator(
link_type="dedupe_only",
comparisons=[
cl.ExactMatch("first_name"),
cl.ExactMatch("surname"),
cl.ExactMatch("city"),
],
blocking_rules_to_generate_predictions=[block_on("first_name")],
)
settings_2 = SettingsCreator(
link_type="dedupe_only",
comparisons=[
cl.ExactMatch("first_name"),
cl.ExactMatch("surname"),
cl.ExactMatch("email"),
],
blocking_rules_to_generate_predictions=[block_on("first_name")],
)
> res1 = compare_records(
df1, df2, settings_1, db_api, use_sql_from_cache=True
).as_record_dict()[0]["match_weight"]
tests/test_realtime.py:343:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
splink/internals/realtime.py:88: in compare_records
return db_api._sql_to_splink_dataframe(
splink/internals/database_api.py:93: in _sql_to_splink_dataframe
spark_df = self._log_and_run_sql_execution(sql, templated_name, physical_name)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <splink.internals.duckdb.database_api.DuckDBAPI object at 0x7fb7780ef4c0>
final_sql = 'CREATE TABLE __splink__realtime_compare_records_aj2njrcn AS\nWITH __splink__compare_records_left AS (\n SELECT\n ..._adj_city,\n "email_l",\n "email_r",\n gamma_email,\n bf_email,\n match_key\nFROM __splink__df_match_weight_parts'
templated_name = '__splink__realtime_compare_records'
physical_name = '__splink__realtime_compare_records_aj2njrcn'
@final
def _log_and_run_sql_execution(
self, final_sql: str, templated_name: str, physical_name: str
) -> TablishType:
"""
Log some sql, then call _run_sql_execution()
Any errors will be converted to SplinkException with more detail
names are only relevant for logging, not execution
"""
logger.debug(execute_sql_logging_message_info(templated_name, physical_name))
logger.log(5, log_sql(final_sql))
try:
return self._execute_sql_against_backend(final_sql)
except Exception as e:
# Parse our SQL through sqlglot to pretty print
try:
final_sql = sqlglot.parse_one(
final_sql,
read=self.sql_dialect.sqlglot_dialect,
).sql(pretty=True)
# if sqlglot produces any errors, just report the raw SQL
except Exception:
pass
> raise SplinkException(
f"Error executing the following sql for table "
f"`{templated_name}`({physical_name}):\n{final_sql}"
f"\n\nError was: {e}"
) from e
E splink.internals.exceptions.SplinkException: Error executing the following sql for table `__splink__realtime_compare_records`(__splink__realtime_compare_records_aj2njrcn):
E CREATE TABLE __splink__realtime_compare_records_aj2njrcn AS
E WITH __splink__compare_records_left AS (
E SELECT
E *
E FROM __splink__compare_records_left_aj2njrcn
E ), __splink__compare_records_right AS (
E SELECT
E *
E FROM __splink__compare_records_right_aj2njrcn
E ), __splink__compare_two_records_blocked AS (
E SELECT
E "l"."unique_id" AS "unique_id_l",
E "r"."unique_id" AS "unique_id_r",
E "l"."first_name" AS "first_name_l",
E "r"."first_name" AS "first_name_r",
E "l"."tf_first_name" AS "tf_first_name_l",
E "r"."tf_first_name" AS "tf_first_name_r",
E "l"."surname" AS "surname_l",
E "r"."surname" AS "surname_r",
E "l"."city" AS "city_l",
E "r"."city" AS "city_r",
E "l"."tf_city" AS "tf_city_l",
E "r"."tf_city" AS "tf_city_r",
E "l"."email" AS "email_l",
E "r"."email" AS "email_r",
E 0 AS match_key
E FROM __splink__compare_records_left AS l
E CROSS JOIN __splink__compare_records_right AS r
E ), __splink__df_comparison_vectors AS (
E SELECT
E "unique_id_l",
E "unique_id_r",
E "first_name_l",
E "first_name_r",
E CASE
E WHEN "first_name_l" IS NULL OR "first_name_r" IS NULL
E THEN -1
E WHEN "first_name_l" = "first_name_r"
E THEN 1
E ELSE 0
E END AS gamma_first_name,
E "tf_first_name_l",
E "tf_first_name_r",
E "surname_l",
E "surname_r",
E CASE
E WHEN "surname_l" IS NULL OR "surname_r" IS NULL
E THEN -1
E WHEN "surname_l" = "surname_r"
E THEN 1
E ELSE 0
E END AS gamma_surname,
E "city_l",
E "city_r",
E CASE
E WHEN "city_l" IS NULL OR "city_r" IS NULL
E THEN -1
E WHEN "city_l" = "city_r"
E THEN 1
E ELSE 0
E END AS gamma_city,
E "tf_city_l",
E "tf_city_r",
E "email_l",
E "email_r",
E CASE
E WHEN "email_l" IS NULL OR "email_r" IS NULL
E THEN -1
E WHEN "email_l" = "email_r"
E THEN 1
E ELSE 0
E END AS gamma_email,
E match_key
E FROM __splink__compare_two_records_blocked
E ), __splink__df_match_weight_parts AS (
E SELECT
E "unique_id_l",
E "unique_id_r",
E "first_name_l",
E "first_name_r",
E gamma_first_name,
E "tf_first_name_l",
E "tf_first_name_r",
E CASE
E WHEN gamma_first_name = -1
E THEN CAST(1.0 AS DOUBLE)
E WHEN gamma_first_name = 1
E THEN CAST(1024.0 AS DOUBLE)
E WHEN gamma_first_name = 0
E THEN CAST(0.03125 AS DOUBLE)
E END AS bf_first_name,
E CASE
E WHEN gamma_first_name = -1
E THEN CAST(1 AS DOUBLE)
E WHEN gamma_first_name = 1
E THEN (
E CASE
E WHEN NOT COALESCE("tf_first_name_l", "tf_first_name_r") IS NULL
E THEN POWER(
E CAST(0.000927734375 AS DOUBLE) / NULLIF(
E (
E CASE
E WHEN COALESCE("tf_first_name_l", "tf_first_name_r") >= COALESCE("tf_first_name_r", "tf_first_name_l")
E THEN COALESCE("tf_first_name_l", "tf_first_name_r")
E ELSE COALESCE("tf_first_name_r", "tf_first_name_l")
E END
E ),
E 0
E ),
E CAST(1.0 AS DOUBLE)
E )
E ELSE CAST(1 AS DOUBLE)
E END
E )
E WHEN gamma_first_name = 0
E THEN CAST(1 AS DOUBLE)
E END AS bf_tf_adj_first_name,
E "surname_l",
E "surname_r",
E gamma_surname,
E CASE
E WHEN gamma_surname = -1
E THEN CAST(1.0 AS DOUBLE)
E WHEN gamma_surname = 1
E THEN CAST(1024.0 AS DOUBLE)
E WHEN gamma_surname = 0
E THEN CAST(0.03125 AS DOUBLE)
E END AS bf_surname,
E "city_l",
E "city_r",
E gamma_city,
E "tf_city_l",
E "tf_city_r",
E CASE
E WHEN gamma_city = -1
E THEN CAST(1.0 AS DOUBLE)
E WHEN gamma_city = 1
E THEN CAST(1024.0 AS DOUBLE)
E WHEN gamma_city = 0
E THEN CAST(0.03125 AS DOUBLE)
E END AS bf_city,
E CASE
E WHEN gamma_city = -1
E THEN CAST(1 AS DOUBLE)
E WHEN gamma_city = 1
E THEN (
E CASE
E WHEN NOT COALESCE("tf_city_l", "tf_city_r") IS NULL
E THEN POWER(
E CAST(0.000927734375 AS DOUBLE) / NULLIF(
E (
E CASE
E WHEN COALESCE("tf_city_l", "tf_city_r") >= COALESCE("tf_city_r", "tf_city_l")
E THEN COALESCE("tf_city_l", "tf_city_r")
E ELSE COALESCE("tf_city_r", "tf_city_l")
E END
E ),
E 0
E ),
E CAST(1.0 AS DOUBLE)
E )
E ELSE CAST(1 AS DOUBLE)
E END
E )
E WHEN gamma_city = 0
E THEN CAST(1 AS DOUBLE)
E END AS bf_tf_adj_city,
E "email_l",
E "email_r",
E gamma_email,
E CASE
E WHEN gamma_email = -1
E THEN CAST(1.0 AS DOUBLE)
E WHEN gamma_email = 1
E THEN CAST(1024.0 AS DOUBLE)
E WHEN gamma_email = 0
E THEN CAST(0.03125 AS DOUBLE)
E END AS bf_email,
E match_key
E FROM __splink__df_comparison_vectors
E )
E SELECT
E LOG(
E 2,
E CAST(0.00010001000100010001 AS DOUBLE) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_city * bf_tf_adj_city * bf_email
E ) AS match_weight,
E CASE
E WHEN bf_first_name = CAST('infinity' AS DOUBLE)
E OR bf_tf_adj_first_name = CAST('infinity' AS DOUBLE)
E OR bf_surname = CAST('infinity' AS DOUBLE)
E OR bf_city = CAST('infinity' AS DOUBLE)
E OR bf_tf_adj_city = CAST('infinity' AS DOUBLE)
E OR bf_email = CAST('infinity' AS DOUBLE)
E THEN 1.0
E ELSE (
E CAST(0.00010001000100010001 AS DOUBLE) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_city * bf_tf_adj_city * bf_email
E ) / NULLIF(
E (
E 1 + (
E CAST(0.00010001000100010001 AS DOUBLE) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_city * bf_tf_adj_city * bf_email
E )
E ),
E 0
E )
E END AS match_probability,
E "unique_id_l",
E "unique_id_r",
E "first_name_l",
E "first_name_r",
E gamma_first_name,
E "tf_first_name_l",
E "tf_first_name_r",
E bf_first_name,
E bf_tf_adj_first_name,
E "surname_l",
E "surname_r",
E gamma_surname,
E bf_surname,
E "city_l",
E "city_r",
E gamma_city,
E "tf_city_l",
E "tf_city_r",
E bf_city,
E bf_tf_adj_city,
E "email_l",
E "email_r",
E gamma_email,
E bf_email,
E match_key
E FROM __splink__df_match_weight_parts
E
E Error was: Binder Error: Values list "l" does not have a column named "tf_first_name"
E LINE 11: ...from __splink__compare_records_left_aj2njrcn),
E
E __splink__compare_records_rig...
E ^
splink/internals/database_api.py:75: SplinkException
Test summary output
=========================== short test summary info ============================
FAILED tests/test_realtime.py::test_realtime_cache_different_settings[duckdb] - splink.internals.exceptions.SplinkException: Error executing the following sql for table `__splink__realtime_compare_records`(__splink__realtime_compare_records_aj2njrcn):
CREATE TABLE __splink__realtime_compare_records_aj2njrcn AS
WITH __splink__compare_records_left AS (
SELECT
*
FROM __splink__compare_records_left_aj2njrcn
), __splink__compare_records_right AS (
SELECT
*
FROM __splink__compare_records_right_aj2njrcn
), __splink__compare_two_records_blocked AS (
SELECT
"l"."unique_id" AS "unique_id_l",
"r"."unique_id" AS "unique_id_r",
"l"."first_name" AS "first_name_l",
"r"."first_name" AS "first_name_r",
"l"."tf_first_name" AS "tf_first_name_l",
"r"."tf_first_name" AS "tf_first_name_r",
"l"."surname" AS "surname_l",
"r"."surname" AS "surname_r",
"l"."city" AS "city_l",
"r"."city" AS "city_r",
"l"."tf_city" AS "tf_city_l",
"r"."tf_city" AS "tf_city_r",
"l"."email" AS "email_l",
"r"."email" AS "email_r",
0 AS match_key
FROM __splink__compare_records_left AS l
CROSS JOIN __splink__compare_records_right AS r
), __splink__df_comparison_vectors AS (
SELECT
"unique_id_l",
"unique_id_r",
"first_name_l",
"first_name_r",
CASE
WHEN "first_name_l" IS NULL OR "first_name_r" IS NULL
THEN -1
WHEN "first_name_l" = "first_name_r"
THEN 1
ELSE 0
END AS gamma_first_name,
"tf_first_name_l",
"tf_first_name_r",
"surname_l",
"surname_r",
CASE
WHEN "surname_l" IS NULL OR "surname_r" IS NULL
THEN -1
WHEN "surname_l" = "surname_r"
THEN 1
ELSE 0
END AS gamma_surname,
"city_l",
"city_r",
CASE
WHEN "city_l" IS NULL OR "city_r" IS NULL
THEN -1
WHEN "city_l" = "city_r"
THEN 1
ELSE 0
END AS gamma_city,
"tf_city_l",
"tf_city_r",
"email_l",
"email_r",
CASE
WHEN "email_l" IS NULL OR "email_r" IS NULL
THEN -1
WHEN "email_l" = "email_r"
THEN 1
ELSE 0
END AS gamma_email,
match_key
FROM __splink__compare_two_records_blocked
), __splink__df_match_weight_parts AS (
SELECT
"unique_id_l",
"unique_id_r",
"first_name_l",
"first_name_r",
gamma_first_name,
"tf_first_name_l",
"tf_first_name_r",
CASE
WHEN gamma_first_name = -1
THEN CAST(1.0 AS DOUBLE)
WHEN gamma_first_name = 1
THEN CAST(1024.0 AS DOUBLE)
WHEN gamma_first_name = 0
THEN CAST(0.03125 AS DOUBLE)
END AS bf_first_name,
CASE
WHEN gamma_first_name = -1
THEN CAST(1 AS DOUBLE)
WHEN gamma_first_name = 1
THEN (
CASE
WHEN NOT COALESCE("tf_first_name_l", "tf_first_name_r") IS NULL
THEN POWER(
CAST(0.000927734375 AS DOUBLE) / NULLIF(
(
CASE
WHEN COALESCE("tf_first_name_l", "tf_first_name_r") >= COALESCE("tf_first_name_r", "tf_first_name_l")
THEN COALESCE("tf_first_name_l", "tf_first_name_r")
ELSE COALESCE("tf_first_name_r", "tf_first_name_l")
END
),
0
),
CAST(1.0 AS DOUBLE)
)
ELSE CAST(1 AS DOUBLE)
END
)
WHEN gamma_first_name = 0
THEN CAST(1 AS DOUBLE)
END AS bf_tf_adj_first_name,
"surname_l",
"surname_r",
gamma_surname,
CASE
WHEN gamma_surname = -1
THEN CAST(1.0 AS DOUBLE)
WHEN gamma_surname = 1
THEN CAST(1024.0 AS DOUBLE)
WHEN gamma_surname = 0
THEN CAST(0.03125 AS DOUBLE)
END AS bf_surname,
"city_l",
"city_r",
gamma_city,
"tf_city_l",
"tf_city_r",
CASE
WHEN gamma_city = -1
THEN CAST(1.0 AS DOUBLE)
WHEN gamma_city = 1
THEN CAST(1024.0 AS DOUBLE)
WHEN gamma_city = 0
THEN CAST(0.03125 AS DOUBLE)
END AS bf_city,
CASE
WHEN gamma_city = -1
THEN CAST(1 AS DOUBLE)
WHEN gamma_city = 1
THEN (
CASE
WHEN NOT COALESCE("tf_city_l", "tf_city_r") IS NULL
THEN POWER(
CAST(0.000927734375 AS DOUBLE) / NULLIF(
(
CASE
WHEN COALESCE("tf_city_l", "tf_city_r") >= COALESCE("tf_city_r", "tf_city_l")
THEN COALESCE("tf_city_l", "tf_city_r")
ELSE COALESCE("tf_city_r", "tf_city_l")
END
),
0
),
CAST(1.0 AS DOUBLE)
)
ELSE CAST(1 AS DOUBLE)
END
)
WHEN gamma_city = 0
THEN CAST(1 AS DOUBLE)
END AS bf_tf_adj_city,
"email_l",
"email_r",
gamma_email,
CASE
WHEN gamma_email = -1
THEN CAST(1.0 AS DOUBLE)
WHEN gamma_email = 1
THEN CAST(1024.0 AS DOUBLE)
WHEN gamma_email = 0
THEN CAST(0.03125 AS DOUBLE)
END AS bf_email,
match_key
FROM __splink__df_comparison_vectors
)
SELECT
LOG(
2,
CAST(0.00010001000100010001 AS DOUBLE) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_city * bf_tf_adj_city * bf_email
) AS match_weight,
CASE
WHEN bf_first_name = CAST('infinity' AS DOUBLE)
OR bf_tf_adj_first_name = CAST('infinity' AS DOUBLE)
OR bf_surname = CAST('infinity' AS DOUBLE)
OR bf_city = CAST('infinity' AS DOUBLE)
OR bf_tf_adj_city = CAST('infinity' AS DOUBLE)
OR bf_email = CAST('infinity' AS DOUBLE)
THEN 1.0
ELSE (
CAST(0.00010001000100010001 AS DOUBLE) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_city * bf_tf_adj_city * bf_email
) / NULLIF(
(
1 + (
CAST(0.00010001000100010001 AS DOUBLE) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_city * bf_tf_adj_city * bf_email
)
),
0
)
END AS match_probability,
"unique_id_l",
"unique_id_r",
"first_name_l",
"first_name_r",
gamma_first_name,
"tf_first_name_l",
"tf_first_name_r",
bf_first_name,
bf_tf_adj_first_name,
"surname_l",
"surname_r",
gamma_surname,
bf_surname,
"city_l",
"city_r",
gamma_city,
"tf_city_l",
"tf_city_r",
bf_city,
bf_tf_adj_city,
"email_l",
"email_r",
gamma_email,
bf_email,
match_key
FROM __splink__df_match_weight_parts
Error was: Binder Error: Values list "l" does not have a column named "tf_first_name"
LINE 11: ...from __splink__compare_records_left_aj2njrcn),
__splink__compare_records_rig...
^
= 1 failed, 321 passed, 12 skipped, 395 deselected, 16 warnings in 116.40s (0:01:56) =
The test
tests/test_realtime.py::test_realtime_cache_different_settings[duckdb]
failed on Python 3.9 in this run. Re-running the job allowed the test to pass, which is odd, as I can't see where any non-determinism could occur. Error was: Binder Error: Values list "l" does not have a column named "tf_first_name".
Full test output
Test summary output
Environment info (3.9.20)
The text was updated successfully, but these errors were encountered: