From 9f53aecec2e167c303c171228b5b62aa9abda6b3 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Wed, 6 Apr 2022 15:55:32 +0200 Subject: [PATCH 01/12] add test for long where clauses - slightly update test test_with_invalid_where_parameter_closing_unclosed__400 --- .../test-long-where-clause-1k.txt | 1 + .../charts/data/api_tests.py | 33 ++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 tests/example_data/test-long-where-clause-1k.txt diff --git a/tests/example_data/test-long-where-clause-1k.txt b/tests/example_data/test-long-where-clause-1k.txt new file mode 100644 index 0000000000000..78b51f3fb3cd0 --- /dev/null +++ b/tests/example_data/test-long-where-clause-1k.txt @@ -0,0 +1 @@ +Liam,James,Noah,Wyatt,Gabriel,Lucas,Ethan,Alexander,Joseph,Benjamin,William,Logan,Mason,Jack,John,Asher,Elijah,Daniel,Henry,Jacob,Jaxon,Michael,Oliver,Hunter,David,Levi,Matthew,Landon,Aiden,Isaac,Jackson,Caleb,Ryan,Elias,Connor,Evan,Joshua,Samuel,Christian,Jayden,Jeremiah,Cooper,Eli,Robert,Ryder,Christopher,Colton,Josiah,Andrew,Austin,Carson,Jaxson,Jonathan,Luke,Malachi,Nathan,Owen,Blake,Lincoln,Ezra,Gavin,Thomas,Dylan,Grayson,Kai,Ryker,Zachary,Anthony,Isaiah,Jase,Jason,Micah,Sebastian,Silas,Titus,Bentley,Brody,Cameron,Carter,Chase,Gideon,Jace,Sawyer,Tristan,Tyler,Weston,Adam,Charles,Everett,Wesley,Xander,Brandon,Brayden,Nathaniel,Theodore,Xavier,Ashton,Avery,Dominic,Easton,Finn,George,Hudson,Ian,Jasper,Kayden,Marshall,Max,Maxwell,Miles,Orion,Richard,Timothy,Abel,Drake,Garrett,Jameson,Jayce,Joel,Kenneth,Maximus,Nicholas,Parker,Travis,Cody,Dean,Declan,Elliot,Ezekiel,Karter,Nolan,Patrick,Riley,Seth,Solomon,Steven,Victor,Waylon,Aaron,August,Bradley,Braxton,Bryce,Calvin,Camden,Cayden,Charlie,Cole,Damian,Dawson,Eric,Greyson,Jake,Jeffrey,Jesse,Jonah,Julian,Kaiden,Killian,Kingston,Maddox,Matthias,Maverick,Odin,Paul,Peter,Roman,Trevor,Zane,Alex,Archer,Caden,Collin,Colt,Edward,Gage,Gunner,Harrison,Ivan,Jax,Leo,Lukas,Marcus,Paxton,Soren,Sullivan,Tanner,Trenton,Troy,Tucker,Vincent,Walter,Warren,Adrian,Augustus,Axel,Beckett,Cade,Clayton,Dante,Emmett,Felix,Grant,Hatcher,Jordan,Jude,Julius,Kaleb,Kevin,Remington,Rowan,Russell,Simon,Skyler,Stephen,Sterling,Talon,Westin,Aidan,Alden,Angelo,Arthur,Atticus,Barrett,Brantley,Brooks,Bruce,Cash,Cyrus,Derek,Desmond,Erik,Griffin,Hayden,Jeremy,Justice,Kaden,Kane,Kyle,Leif,Mark,Mateo,Miguel,Myles,Nikolai,Paxson,Peyton,Raymond,Reid,River,Rodney,Roland,Ronin,Rylan,Shawn,Spencer,Thaddeus,Tobias,Tyson,Walker,William,James,John,Mason,Elijah,Noah,Jackson,Aiden,Jacob,Joshua,Samuel,Michael,Ethan,Carter,Liam,Grayson,Christopher,Jayden,Logan,Hunter,Andrew,Landon,Caleb,Gabriel,Joseph,Charles,David,Jaxon,Colton,Luke,Ayden,Levi,Cameron,Matthew,Christian,Carson,Wyatt,Alexander,Jordan,Benjamin,Robert,Daniel,Kayden,Lucas,Easton,Bentley,Eli,Brantley,Braxton,Brayden,Jonathan,Henry,Anthony,Thomas,Jeremiah,Jace,Connor,Sawyer,Austin,Bryson,Oliver,Parker,Cooper,Hudson,Dylan,Isaac,Tyler,Isaiah,Josiah,Nathan,Karter,Brandon,Jase,Timothy,Jack,Kingston,Nicholas,Gavin,Tucker,Ashton,Tristan,Asher,Justin,Kaleb,Chase,Brody,Jayceon,Owen,Kaiden,Jaxson,Kevin,Ryan,Aaron,Cayden,Ian,Jason,Jayce,Micah,Ryder,Evan,Greyson,Hayden,Kaden,Harrison,Karson,Camden,Braylen,Rylan,Zachary,Zayden,Adam,Jose,Kameron,Wesley,Barrett,Marcus,Nolan,Blake,Dallas,Gunner,Kenneth,Malachi,Preston,Silas,Bryce,Collin,Antonio,Caden,Jonah,King,Peyton,Adrian,Maddox,Messiah,Brooks,Dawson,Trenton,Avery,Bennett,Lincoln,Steven,Alex,Richard,Sebastian,Weston,Braylon,Brian,Dalton,Ezra,Jaden,Jaiden,Riley,Tanner,Zion,Corbin,Walker,Xavier,Amari,Judah,Lawson,Maxwell,River,Jude,Angel,Carlos,Eric,Kason,Griffin,Juan,Princeton,Ryker,Abel,Bryan,Cason,Derrick,George,Roman,Aidan,Conner,Everett,Garrett,Jaylen,Miles,Stephen,Dominic,Emmett,Jameson,Johnny,Julian,Kyler,Zane,Bradley,Cole,Edward,Elias,Graham,Jesse,Kai,Nathaniel,Paul,Bryant,Caiden,Clayton,Gregory,Jeremy,Judson,Kylan,Patrick,Walter,Alan,Charlie,Colin,Landen,Chandler,Luis,Titus,Waylon,Anderson,Axel,Cody,Ezekiel,Grant,Jax,Kayson,Miguel,Tyson,Chance,Declan,Jett,Kyle,Vincent,Amir,Braydon,Johnathan,Seth,Travis,Brentley,Case,Devin,Jasper,Jeffery,Joel,Keegan,Marshall,Skyler,Cade,Fisher,Jensen,Max,Paxton,Rhett,Spencer,Trevor,Calvin,Cohen,Colt,Kendrick,Major,Taylor,Xander,Zander,Beau,Braden,Brycen,Curtis,Finn,Iker,Ivan,Jamari,Jesus,Mark,Maverick,Myles,Reid,Troy,Victor,Zaiden,Casen,Cash,Colby,Damien,Ellis,Grady,Houston,Kamden,Kolton,Landyn,Malcolm,Phillip,Warren,Brodie,Corey,Cristian,Dustin,Elliott,Jeffrey,Kasen,Raylan,Rowan,Wilson,Aden,Cedric,Coleman,Diego,Edwin,Gage,Jake,Kash,Prince,Ronald,Russell,Ryland,Theodore,Abram,Ace,Damian,Demarcus,Hayes,Holden,Jamarcus,Javier,Mateo,Maurice,Nicolas,Remington,Terry,Tony,Abraham,Brantlee,Carmelo,Demetrius,Dillon,Emmanuel,Israel,Jamichael,Jerry,Kobe,Kylen,Lane,Phoenix,Willie,August,Baylor,Channing,Daxton,Desmond,Donald,Donovan,Eduardo,Emerson,Jimmy,Kamari,Kendall,Porter,Skylar,Alejandro,Asa,Beckett,Bentlee,Bo,Brady,Brendan,Brock,Cannon,Casey,Courtney,Danny,Deacon,Dean,Elliot,Gannon,Jakobe,Jamarion,Jaxton,Jermaine,Jon,Leonardo,Luca,Morgan,Nehemiah,Pedro,Sylas,Tripp,Triston,Turner,Alexis,Andre,Brennan,Bruce,Cayson,Cornelius,Crimson,Davion,Davis,Deandre,Dennis,Franklin,Frederick,Gary,Harley,Jalen,Justice,Kade,Kaison,Kyson,Layton,Legend,Malik,Manuel,Micheal,Miller,Oakley,Omar,Reed,Ricardo,Ricky,Simon,Terrance,Trent,Zechariah,Archer,Dakota,Derek,Dexter,Erick,Felix,Jagger,Jasiah,Jorge,Keaton,Khalil,Kyrie,Ladarius,Leland,Leo,London,Lorenzo,Lukas,Maximus,Omari,Oscar,Quinton,Reese,Reginald,Rodney,Samson,Sean,Shane,Trace,Zachariah,Zaylen,Adan,Atticus,Branson,Braylin,Damon,Darius,Dominick,Dorian,Fletcher,Francisco,Issac,Jamison,Jared,Javion,Jay,Joe,Jordyn,Josue,Kamryn,Knox,Kolten,Kristian,Lance,Lathan,Louis,Lyric,Maison,Mitchell,Neymar,Peter,Randall,Reece,Scott,Shawn,Solomon,Tatum,Terrence,Tylan,Ahmad,Alijah,Andres,Andy,Beckham,Benton,Billy,Blaine,Bowen,Brenton,Byron,Camdyn,Cullen,Deangelo,Devon,Drake,Drew,Emory,Finley,Gael,Gerald,Gideon,Gunnar,Jacoby,Jakaiden,Jaydon,Jayson,Karsyn,Keagan,Keelan,Keith,Kristopher,Landan,Leon,Macon,Mario,Markell,Martin,Marvin,Raymond,Roderick,Sutton,Uriah,Allen,Amarion,Amos,Brayan,Brayson,Brenden,Callen,Camron,Chevy,Colten,Cortez,Dante,Deshawn,Eddie,Emanuel,Fernando,Foster,Giovanni,Hector,Izaiah,Jabari,Jakob,Jamir,Jaxen,Jayveon,Jayvion,Jorden,Kane,Kaysen,Keandre,Keller,Kenny,Kody,Konnor,Lamar,Langston,Larry,Lewis,Malakai,Mayson,Mohammed,Payton,Philip,Pierce,Quentin,Ridge,Ronan,Ronnie,Roy,Rylee,Santiago,Shepherd,Sullivan,Talon,Thaddeus,Toby,Tristen,Tristin,Will,Xzavier,Zackary,Zackery,Zayne,Zyon,Aiyden,Albert,Ali,Alton,Anakin,Angelo,Armani,Arthur,Aston,Barron,Ben,Bishop,Brent,Briar,Brylon,Camren,Canaan,Carl,Cashton,Caysen,Clark,Clinton,Damion,Darren,Daylen,Dayton,Douglas,Elisha,Enoch,Erik,Felipe,Forrest,Frank,Gatlin,Graysen,Isiah,Jacen,Jaceyon,Jaiceon,Jakaden,Jakari,Jakayden,Jamie,Jaquan,Jarvis,Jasen,Jaylon,Jerome,Justus,Kaidyn,Kalel,Kamron,Kashton,Kellan,Kelvin,Kemari,Kendarius,Kharter,Kohen,Kooper,Kymani,Lamarcus,Lawrence,Layne,Leonel,Marquis,Memphis,Nash,Nigel,Omarion,Pablo,Pierson,Randy,Ray,Rayden,Roland,Ruben,Stanley,Stephon,Sterling,Taylon,Tommy,Tristian,Wade,Wallace,Wells,Zamarion,Zayvion,Adrien,Aidyn,Alden,Alec,Alvin,Antwan,Armando,Aubrey,Aydon,Benson,Braylan,Brentlee,Brylan,Carsen,Chris,Christon,Collins,Conor,Corbyn,Coy,Crew,Dakari,Darrell,Darrion,Darryl,Dax,Deanthony,Demari,Deon,Dewayne,Dontavious,Draven,Eden,Emery,Ezequiel,Finnegan,Francis,Gauge,Gentry,Hank,Harlem,Harper,Harris,Hollis,Hosea,Jessie,Johan,Johnathon,Julien,Julio,Julius,Junior,Kace,Kaisen,Karsen,Kaydon, diff --git a/tests/integration_tests/charts/data/api_tests.py b/tests/integration_tests/charts/data/api_tests.py index 67586b33f86b5..fdecfa17b6e18 100644 --- a/tests/integration_tests/charts/data/api_tests.py +++ b/tests/integration_tests/charts/data/api_tests.py @@ -17,8 +17,10 @@ # isort:skip_file """Unit tests for Superset""" import json +import time import unittest import copy +import numpy as np from datetime import datetime from io import BytesIO from typing import Any, Optional @@ -589,12 +591,41 @@ def test_with_invalid_where_parameter_closing_unclosed__400(self): self.query_context_payload["queries"][0]["filters"] = [] self.query_context_payload["queries"][0]["extras"][ "where" - ] = "state = 'CA') OR (state = 'NY'" + ] = "state = 'CA') OR (stata = 'NY'" rv = self.post_assert_metric(CHART_DATA_URI, self.query_context_payload, "data") assert rv.status_code == 400 + @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices") + def test_with_long_where_parameter(self): + long_clause = np.loadtxt("tests/example_data/test-long-where-clause-1k.txt", delimiter=",", dtype=str).tolist() + self.query_context_payload["queries"][0]["filters"][2]["val"] = long_clause + start_a = time.time() + rv = self.post_assert_metric(CHART_DATA_URI, self.query_context_payload, "data") + compute_time_a = time.time() - start_a + table = self.get_table_by_id(1) + del self.query_context_payload["queries"][0]["time_range"] + sqla_query = table.get_sqla_query(**{ + "granularity": None, + "from_dttm": None, + "to_dttm": None, + "groupby": ["gender"], + "metrics": ["count"], + "is_timeseries": False, + "filter": self.query_context_payload["queries"][0]["filters"], + "extras": {}, + }) + start_b = time.time() + sql = table.database.compile_sqla_query(sqla_query.sqla_query) + result = table.database.get_df(sql, schema=table.schema) + compute_time_b = time.time() - start_b + + buffer_time = 2 + self.assertLess(compute_time_a, + compute_time_b+buffer_time, + f"computation of post query payload: {compute_time_a} sec is higher than unparsed sqla querying time: {compute_time_b} sec plus {buffer_time} sec buffer time") + @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices") def test_with_where_parameter_including_comment___200(self): self.query_context_payload["queries"][0]["filters"] = [] From d5ce5d6bb5e806290c4ae9dcc5463afe2d01c0c6 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Fri, 1 Apr 2022 17:04:20 +0200 Subject: [PATCH 02/12] disable sqlparse.parse in db_engine_specs.base and sqla.models --- superset/common/query_object.py | 4 +++- superset/connectors/sqla/models.py | 4 +--- superset/db_engine_specs/base.py | 3 +-- superset/models/core.py | 32 +++++++----------------------- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/superset/common/query_object.py b/superset/common/query_object.py index 5109c465e0abe..a45ffc560eaf5 100644 --- a/superset/common/query_object.py +++ b/superset/common/query_object.py @@ -272,7 +272,9 @@ def validate( try: self._validate_there_are_no_missing_series() self._validate_no_have_duplicate_labels() - self._sanitize_filters() + # problematic line where calling sqlparser.parse() causes quadratic + # performance for WHERE ... IN (...) clauses + # self._sanitize_filters() return None except QueryObjectValidationError as ex: if raise_exceptions: diff --git a/superset/connectors/sqla/models.py b/superset/connectors/sqla/models.py index 089b9c2f28960..a652a57b59b1c 100644 --- a/superset/connectors/sqla/models.py +++ b/superset/connectors/sqla/models.py @@ -1421,9 +1421,7 @@ def get_query_str_extended( sqlaq = self.get_sqla_query(**query_obj) sql = self.database.compile_sqla_query(sqlaq.sqla_query) sql = self._apply_cte(sql, sqlaq.cte) - sql = sqlparse.format(sql, reindent=True) - if mutate: - sql = self.mutate_query_from_config(sql) + sql = self.mutate_query_from_config(sql) return QueryStringExtended( applied_template_filters=sqlaq.applied_template_filters, applied_filter_columns=sqlaq.applied_filter_columns, diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 3b8bb2bd33292..dbd93cfeb0e35 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -915,8 +915,7 @@ def apply_top_to_sql(cls, sql: str, limit: int) -> str: cte = None sql_remainder = None - sql = sql.strip(" \t\n;") - sql_statement = sqlparse.format(sql, strip_comments=True) + sql_statement = sql.strip(" \t\n;") query_limit: int | None = sql_parse.extract_top_from_query( sql_statement, cls.top_keywords ) diff --git a/superset/models/core.py b/superset/models/core.py index 71a6e9d04237b..3d64c97b42b9c 100755 --- a/superset/models/core.py +++ b/superset/models/core.py @@ -555,11 +555,12 @@ def get_df( # pylint: disable=too-many-locals schema: str | None = None, mutator: Callable[[pd.DataFrame], None] | None = None, ) -> pd.DataFrame: - sqls = self.db_engine_spec.parse_sql(sql) + # before we split sqls using sql parse, however this core code is only reachable + # with single sql queries. Thus, we removed the engine spec parser + # sqls = self.db_engine_spec.parse_sql(sql) + with self.get_sqla_engine_with_context(schema) as engine: engine_url = engine.url - mutate_after_split = config["MUTATE_AFTER_SPLIT"] - sql_query_mutator = config["SQL_QUERY_MUTATOR"] def needs_conversion(df_series: pd.Series) -> bool: return ( @@ -580,28 +581,9 @@ def _log_query(sql: str) -> None: with self.get_raw_connection(schema=schema) as conn: cursor = conn.cursor() - for sql_ in sqls[:-1]: - if mutate_after_split: - sql_ = sql_query_mutator( - sql_, - security_manager=security_manager, - database=None, - ) - _log_query(sql_) - self.db_engine_spec.execute(cursor, sql_) - cursor.fetchall() - - if mutate_after_split: - last_sql = sql_query_mutator( - sqls[-1], - security_manager=security_manager, - database=None, - ) - _log_query(last_sql) - self.db_engine_spec.execute(cursor, last_sql) - else: - _log_query(sqls[-1]) - self.db_engine_spec.execute(cursor, sqls[-1]) + + _log_query(sql) + self.db_engine_spec.execute(cursor, sql) data = self.db_engine_spec.fetch_data(cursor) result_set = SupersetResultSet( From b991917973ace264224a7a6bc939ce85c2277a1a Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Wed, 6 Apr 2022 17:16:39 +0200 Subject: [PATCH 03/12] remove commented out code, document toggle comments --- superset/common/query_object.py | 2 ++ superset/models/core.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/superset/common/query_object.py b/superset/common/query_object.py index a45ffc560eaf5..235061a96d095 100644 --- a/superset/common/query_object.py +++ b/superset/common/query_object.py @@ -274,6 +274,8 @@ def validate( self._validate_no_have_duplicate_labels() # problematic line where calling sqlparser.parse() causes quadratic # performance for WHERE ... IN (...) clauses + # Clauses are anyway checked for their validity in + # e.g., connectors/sqla/models/get_query_str_extended # self._sanitize_filters() return None except QueryObjectValidationError as ex: diff --git a/superset/models/core.py b/superset/models/core.py index 3d64c97b42b9c..3b08e66eed6d1 100755 --- a/superset/models/core.py +++ b/superset/models/core.py @@ -556,7 +556,7 @@ def get_df( # pylint: disable=too-many-locals mutator: Callable[[pd.DataFrame], None] | None = None, ) -> pd.DataFrame: # before we split sqls using sql parse, however this core code is only reachable - # with single sql queries. Thus, we removed the engine spec parser + # with single sql queries. Thus, we remove the engine spec parser here # sqls = self.db_engine_spec.parse_sql(sql) with self.get_sqla_engine_with_context(schema) as engine: From 41581e912fa13e860f6dda624dbfae804fa56837 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Thu, 22 Feb 2024 11:58:29 +0100 Subject: [PATCH 04/12] omit strip_comments from `ParsedQuery` --- superset/config.py | 4 ++-- superset/models/helpers.py | 2 +- superset/sql_lab.py | 1 - superset/sql_parse.py | 5 +---- superset/sqllab/query_render.py | 1 - tests/unit_tests/sql_parse_tests.py | 16 ++-------------- 6 files changed, 6 insertions(+), 23 deletions(-) diff --git a/superset/config.py b/superset/config.py index 197e4bac4296e..f4e655b07a000 100644 --- a/superset/config.py +++ b/superset/config.py @@ -1564,7 +1564,7 @@ def EMAIL_HEADER_MUTATOR( # pylint: disable=invalid-name,unused-argument # def DATASET_HEALTH_CHECK(datasource: SqlaTable) -> Optional[str]: # if ( # datasource.sql and -# len(sql_parse.ParsedQuery(datasource.sql, strip_comments=True).tables) == 1 +# len(sql_parse.ParsedQuery(datasource.sql).tables) == 1 # ): # return ( # "This virtual dataset queries only one table and therefore could be " @@ -1702,7 +1702,7 @@ class ExtraDynamicQueryFilters(TypedDict, total=False): try: # pylint: disable=import-error,wildcard-import,unused-wildcard-import import superset_config - from superset_config import * # type: ignore + from superset_config import * print(f"Loaded your LOCAL configuration at [{superset_config.__file__}]") except Exception: diff --git a/superset/models/helpers.py b/superset/models/helpers.py index 8d3ed36c465f7..dde9fdc02a3cc 100644 --- a/superset/models/helpers.py +++ b/superset/models/helpers.py @@ -906,7 +906,6 @@ def get_query_str_extended( sqlaq = self.get_sqla_query(**query_obj) sql = self.database.compile_sqla_query(sqlaq.sqla_query) sql = self._apply_cte(sql, sqlaq.cte) - sql = sqlparse.format(sql, reindent=True) if mutate: sql = self.mutate_query_from_config(sql) return QueryStringExtended( @@ -1071,6 +1070,7 @@ def get_rendered_sql( msg=ex.message, ) ) from ex + logger.info(f"{__name__} - get_rendered_sql: Stripping comments from SQL") sql = sqlparse.format(sql.strip("\t\r\n; "), strip_comments=True) if not sql: raise QueryObjectValidationError(_("Virtual dataset query cannot be empty")) diff --git a/superset/sql_lab.py b/superset/sql_lab.py index 1b883a77cfbbc..f9eb2b0d00c95 100644 --- a/superset/sql_lab.py +++ b/superset/sql_lab.py @@ -412,7 +412,6 @@ def execute_sql_statements( # Breaking down into multiple statements parsed_query = ParsedQuery( rendered_query, - strip_comments=True, engine=db_engine_spec.engine, ) if not db_engine_spec.run_multiple_statements_as_one: diff --git a/superset/sql_parse.py b/superset/sql_parse.py index 7b89ab8f0e2cb..44014cff0c4f7 100644 --- a/superset/sql_parse.py +++ b/superset/sql_parse.py @@ -256,12 +256,8 @@ class ParsedQuery: def __init__( self, sql_statement: str, - strip_comments: bool = False, engine: Optional[str] = None, ): - if strip_comments: - sql_statement = sqlparse.format(sql_statement, strip_comments=True) - self.sql: str = sql_statement self._dialect = SQLGLOT_DIALECTS.get(engine) if engine else None self._tables: set[Table] = set() @@ -604,6 +600,7 @@ def set_or_update_query_limit(self, new_limit: int, force: bool = False) -> str: def sanitize_clause(clause: str) -> str: + logger.info(f"{__name__}: sanitize") # clause = sqlparse.format(clause, strip_comments=True) statements = sqlparse.parse(clause) if len(statements) != 1: diff --git a/superset/sqllab/query_render.py b/superset/sqllab/query_render.py index 5597bcb086d7c..080a7c5f6619e 100644 --- a/superset/sqllab/query_render.py +++ b/superset/sqllab/query_render.py @@ -60,7 +60,6 @@ def render(self, execution_context: SqlJsonExecutionContext) -> str: parsed_query = ParsedQuery( query_model.sql, - strip_comments=True, engine=query_model.database.db_engine_spec.engine, ) rendered_query = sql_template_processor.process_template( diff --git a/tests/unit_tests/sql_parse_tests.py b/tests/unit_tests/sql_parse_tests.py index f05e16ae85fd0..f630467c24791 100644 --- a/tests/unit_tests/sql_parse_tests.py +++ b/tests/unit_tests/sql_parse_tests.py @@ -863,9 +863,7 @@ def test_is_valid_ctas() -> None: A valid CTAS has a ``SELECT`` as its last statement. """ - assert ( - ParsedQuery("SELECT * FROM table", strip_comments=True).is_valid_ctas() is True - ) + assert ParsedQuery("SELECT * FROM table").is_valid_ctas() is True assert ( ParsedQuery( @@ -874,7 +872,6 @@ def test_is_valid_ctas() -> None: SELECT * FROM table -- comment 2 """, - strip_comments=True, ).is_valid_ctas() is True ) @@ -887,7 +884,6 @@ def test_is_valid_ctas() -> None: SELECT @value as foo; -- comment 2 """, - strip_comments=True, ).is_valid_ctas() is True ) @@ -899,7 +895,6 @@ def test_is_valid_ctas() -> None: EXPLAIN SELECT * FROM table -- comment 2 """, - strip_comments=True, ).is_valid_ctas() is False ) @@ -910,7 +905,6 @@ def test_is_valid_ctas() -> None: SELECT * FROM table; INSERT INTO TABLE (foo) VALUES (42); """, - strip_comments=True, ).is_valid_ctas() is False ) @@ -922,9 +916,7 @@ def test_is_valid_cvas() -> None: A valid CVAS has a single ``SELECT`` statement. """ - assert ( - ParsedQuery("SELECT * FROM table", strip_comments=True).is_valid_cvas() is True - ) + assert ParsedQuery("SELECT * FROM table").is_valid_cvas() is True assert ( ParsedQuery( @@ -933,7 +925,6 @@ def test_is_valid_cvas() -> None: SELECT * FROM table -- comment 2 """, - strip_comments=True, ).is_valid_cvas() is True ) @@ -946,7 +937,6 @@ def test_is_valid_cvas() -> None: SELECT @value as foo; -- comment 2 """, - strip_comments=True, ).is_valid_cvas() is False ) @@ -958,7 +948,6 @@ def test_is_valid_cvas() -> None: EXPLAIN SELECT * FROM table -- comment 2 """, - strip_comments=True, ).is_valid_cvas() is False ) @@ -969,7 +958,6 @@ def test_is_valid_cvas() -> None: SELECT * FROM table; INSERT INTO TABLE (foo) VALUES (42); """, - strip_comments=True, ).is_valid_cvas() is False ) From ac00d6419b517cca299ba0293048ebb451255398 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Thu, 22 Feb 2024 12:43:00 +0100 Subject: [PATCH 05/12] add check for mutate --- superset/connectors/sqla/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/superset/connectors/sqla/models.py b/superset/connectors/sqla/models.py index a652a57b59b1c..16e82c5072ab1 100644 --- a/superset/connectors/sqla/models.py +++ b/superset/connectors/sqla/models.py @@ -1421,7 +1421,8 @@ def get_query_str_extended( sqlaq = self.get_sqla_query(**query_obj) sql = self.database.compile_sqla_query(sqlaq.sqla_query) sql = self._apply_cte(sql, sqlaq.cte) - sql = self.mutate_query_from_config(sql) + if mutate: + sql = self.mutate_query_from_config(sql) return QueryStringExtended( applied_template_filters=sqlaq.applied_template_filters, applied_filter_columns=sqlaq.applied_filter_columns, From a536a97481bb177ad2edc318a7656faac3148d52 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Thu, 22 Feb 2024 13:14:33 +0100 Subject: [PATCH 06/12] fix typo --- .../charts/data/api_tests.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/integration_tests/charts/data/api_tests.py b/tests/integration_tests/charts/data/api_tests.py index fdecfa17b6e18..76e916d71b60e 100644 --- a/tests/integration_tests/charts/data/api_tests.py +++ b/tests/integration_tests/charts/data/api_tests.py @@ -591,7 +591,7 @@ def test_with_invalid_where_parameter_closing_unclosed__400(self): self.query_context_payload["queries"][0]["filters"] = [] self.query_context_payload["queries"][0]["extras"][ "where" - ] = "state = 'CA') OR (stata = 'NY'" + ] = "state = 'CA') OR (state = 'NY'" rv = self.post_assert_metric(CHART_DATA_URI, self.query_context_payload, "data") @@ -599,32 +599,38 @@ def test_with_invalid_where_parameter_closing_unclosed__400(self): @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices") def test_with_long_where_parameter(self): - long_clause = np.loadtxt("tests/example_data/test-long-where-clause-1k.txt", delimiter=",", dtype=str).tolist() + long_clause = np.loadtxt( + "tests/example_data/test-long-where-clause-1k.txt", delimiter=",", dtype=str + ).tolist() self.query_context_payload["queries"][0]["filters"][2]["val"] = long_clause start_a = time.time() rv = self.post_assert_metric(CHART_DATA_URI, self.query_context_payload, "data") compute_time_a = time.time() - start_a table = self.get_table_by_id(1) del self.query_context_payload["queries"][0]["time_range"] - sqla_query = table.get_sqla_query(**{ - "granularity": None, - "from_dttm": None, - "to_dttm": None, - "groupby": ["gender"], - "metrics": ["count"], - "is_timeseries": False, - "filter": self.query_context_payload["queries"][0]["filters"], - "extras": {}, - }) + sqla_query = table.get_sqla_query( + **{ + "granularity": None, + "from_dttm": None, + "to_dttm": None, + "groupby": ["gender"], + "metrics": ["count"], + "is_timeseries": False, + "filter": self.query_context_payload["queries"][0]["filters"], + "extras": {}, + } + ) start_b = time.time() sql = table.database.compile_sqla_query(sqla_query.sqla_query) result = table.database.get_df(sql, schema=table.schema) compute_time_b = time.time() - start_b buffer_time = 2 - self.assertLess(compute_time_a, - compute_time_b+buffer_time, - f"computation of post query payload: {compute_time_a} sec is higher than unparsed sqla querying time: {compute_time_b} sec plus {buffer_time} sec buffer time") + self.assertLess( + compute_time_a, + compute_time_b + buffer_time, + f"computation of post query payload: {compute_time_a} sec is higher than unparsed sqla querying time: {compute_time_b} sec plus {buffer_time} sec buffer time", + ) @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices") def test_with_where_parameter_including_comment___200(self): From f23300198d857bac2e9830f317802eb15fcd40a9 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Fri, 23 Feb 2024 08:54:32 +0100 Subject: [PATCH 07/12] lint: trailing whitespace --- superset/models/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superset/models/core.py b/superset/models/core.py index 3b08e66eed6d1..7fb4391f61055 100755 --- a/superset/models/core.py +++ b/superset/models/core.py @@ -558,7 +558,7 @@ def get_df( # pylint: disable=too-many-locals # before we split sqls using sql parse, however this core code is only reachable # with single sql queries. Thus, we remove the engine spec parser here # sqls = self.db_engine_spec.parse_sql(sql) - + with self.get_sqla_engine_with_context(schema) as engine: engine_url = engine.url From 8674647f450e521848193b0541cd48d800f23275 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Fri, 23 Feb 2024 09:35:14 +0100 Subject: [PATCH 08/12] restore `type: ignore` --- superset/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superset/config.py b/superset/config.py index f4e655b07a000..2392660e5a701 100644 --- a/superset/config.py +++ b/superset/config.py @@ -1702,7 +1702,7 @@ class ExtraDynamicQueryFilters(TypedDict, total=False): try: # pylint: disable=import-error,wildcard-import,unused-wildcard-import import superset_config - from superset_config import * + from superset_config import * # type: ignore print(f"Loaded your LOCAL configuration at [{superset_config.__file__}]") except Exception: From 965751f01c519502e6d7fdeefffdf8724871cb60 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Fri, 23 Feb 2024 09:44:15 +0100 Subject: [PATCH 09/12] omit debug log --- superset/sql_parse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/superset/sql_parse.py b/superset/sql_parse.py index 44014cff0c4f7..347d62b125d61 100644 --- a/superset/sql_parse.py +++ b/superset/sql_parse.py @@ -600,7 +600,6 @@ def set_or_update_query_limit(self, new_limit: int, force: bool = False) -> str: def sanitize_clause(clause: str) -> str: - logger.info(f"{__name__}: sanitize") # clause = sqlparse.format(clause, strip_comments=True) statements = sqlparse.parse(clause) if len(statements) != 1: From 2fa6df688c6b3cc334e62fb171a06b2831d7c240 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Tue, 5 Mar 2024 08:42:05 +0100 Subject: [PATCH 10/12] use sqlglot to parse multiple sql statements --- superset/models/core.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/superset/models/core.py b/superset/models/core.py index 7fb4391f61055..860c7d1ad90c4 100755 --- a/superset/models/core.py +++ b/superset/models/core.py @@ -57,6 +57,7 @@ from sqlalchemy.pool import NullPool from sqlalchemy.schema import UniqueConstraint from sqlalchemy.sql import ColumnElement, expression, Select +from sqlglot import parse from superset import app, db_engine_specs from superset.commands.database.exceptions import DatabaseInvalidError @@ -558,9 +559,12 @@ def get_df( # pylint: disable=too-many-locals # before we split sqls using sql parse, however this core code is only reachable # with single sql queries. Thus, we remove the engine spec parser here # sqls = self.db_engine_spec.parse_sql(sql) + sqls = parse(sql) with self.get_sqla_engine_with_context(schema) as engine: engine_url = engine.url + mutate_after_split = config["MUTATE_AFTER_SPLIT"] + sql_query_mutator = config["SQL_QUERY_MUTATOR"] def needs_conversion(df_series: pd.Series) -> bool: return ( @@ -582,8 +586,29 @@ def _log_query(sql: str) -> None: with self.get_raw_connection(schema=schema) as conn: cursor = conn.cursor() - _log_query(sql) - self.db_engine_spec.execute(cursor, sql) + for sql_ in sqls[:-1]: + sql_ = str(sql_) + if mutate_after_split: + sql_ = sql_query_mutator( + sql_, + security_manager=security_manager, + database=None, + ) + _log_query(sql_) + self.db_engine_spec.execute(cursor, sql_) + cursor.fetchall() + + if mutate_after_split: + last_sql = sql_query_mutator( + str(sqls[-1]), + security_manager=security_manager, + database=None, + ) + _log_query(last_sql) + self.db_engine_spec.execute(cursor, last_sql) + else: + _log_query(str(sqls[-1])) + self.db_engine_spec.execute(cursor, str(sqls[-1])) data = self.db_engine_spec.fetch_data(cursor) result_set = SupersetResultSet( From bd6f5deaddeee5cf0acaec26c812ed6c34f0c6e6 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Tue, 5 Mar 2024 12:48:32 +0100 Subject: [PATCH 11/12] remove debug comment --- superset/models/helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/superset/models/helpers.py b/superset/models/helpers.py index dde9fdc02a3cc..6b7a75fda1d0d 100644 --- a/superset/models/helpers.py +++ b/superset/models/helpers.py @@ -1070,7 +1070,6 @@ def get_rendered_sql( msg=ex.message, ) ) from ex - logger.info(f"{__name__} - get_rendered_sql: Stripping comments from SQL") sql = sqlparse.format(sql.strip("\t\r\n; "), strip_comments=True) if not sql: raise QueryObjectValidationError(_("Virtual dataset query cannot be empty")) From 482dc3a03c110f1a40d1e19def23eb503309f078 Mon Sep 17 00:00:00 2001 From: Christian Bors Date: Mon, 7 Oct 2024 14:12:41 +0200 Subject: [PATCH 12/12] replace `sqlparse.format` with `sqlglot.transpile` --- superset/common/query_object.py | 2 +- superset/config.py | 2 +- superset/db_engine_specs/base.py | 7 ++-- superset/models/core.py | 7 +--- superset/sql_parse.py | 53 ++++++++++++++++------------- tests/unit_tests/sql_parse_tests.py | 13 +++---- 6 files changed, 44 insertions(+), 40 deletions(-) diff --git a/superset/common/query_object.py b/superset/common/query_object.py index 21290dbff7b57..e721017ad668a 100644 --- a/superset/common/query_object.py +++ b/superset/common/query_object.py @@ -276,7 +276,7 @@ def validate( # performance for WHERE ... IN (...) clauses # Clauses are anyway checked for their validity in # e.g., connectors/sqla/models/get_query_str_extended - # self._sanitize_filters() + self._sanitize_filters() return None except QueryObjectValidationError as ex: if raise_exceptions: diff --git a/superset/config.py b/superset/config.py index a033b61e1a70f..ca8b64a4d94d9 100644 --- a/superset/config.py +++ b/superset/config.py @@ -1900,7 +1900,7 @@ class ExtraDynamicQueryFilters(TypedDict, total=False): elif importlib.util.find_spec("superset_config") and not is_test(): try: # pylint: disable=import-error,wildcard-import,unused-wildcard-import - import superset_config + import superset_config as superset_config from superset_config import * # noqa: F403, F401 click.secho( diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index dcdfff6c3f242..45c8130c1c543 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -38,7 +38,7 @@ import pandas as pd import requests -import sqlparse +import sqlglot from apispec import APISpec from apispec.ext.marshmallow import MarshmallowPlugin from deprecation import deprecated @@ -1236,7 +1236,7 @@ def get_cte_query(cls, sql: str) -> str | None: """ if not cls.allows_cte_in_subquery: - stmt = sqlparse.parse(sql)[0] + stmt = sqlglot.tokenize(sql) # The first meaningful token for CTE will be with WITH idx, token = stmt.token_next(-1, skip_ws=True, skip_cm=True) @@ -2158,7 +2158,8 @@ def cancel_query( # pylint: disable=unused-argument @classmethod def parse_sql(cls, sql: str) -> list[str]: - return [str(s).strip(" ;") for s in sqlparse.parse(sql)] + return sqlglot.transpile(sql) + # return [str(s).strip(" ;") for s in sqlparse.parse(sql)] @classmethod def get_impersonation_key(cls, user: User | None) -> Any: diff --git a/superset/models/core.py b/superset/models/core.py index 016c0957be6c5..5d3a6ea74ddab 100755 --- a/superset/models/core.py +++ b/superset/models/core.py @@ -58,7 +58,6 @@ from sqlalchemy.pool import NullPool from sqlalchemy.schema import UniqueConstraint from sqlalchemy.sql import ColumnElement, expression, Select -from sqlglot import parse from superset import app, db_engine_specs, is_feature_enabled from superset.commands.database.exceptions import DatabaseInvalidError @@ -654,11 +653,7 @@ def get_df( # pylint: disable=too-many-locals schema: str | None = None, mutator: Callable[[pd.DataFrame], None] | None = None, ) -> pd.DataFrame: - # before we split sqls using sql parse, however this core code is only reachable - # with single sql queries. Thus, we remove the engine spec parser here - # sqls = self.db_engine_spec.parse_sql(sql) - sqls = parse(sql) - + sqls = self.db_engine_spec.parse_sql(sql) with self.get_sqla_engine(catalog=catalog, schema=schema) as engine: engine_url = engine.url diff --git a/superset/sql_parse.py b/superset/sql_parse.py index ce874e89ed0c0..7ee82b508424d 100644 --- a/superset/sql_parse.py +++ b/superset/sql_parse.py @@ -24,11 +24,13 @@ from collections.abc import Iterator from typing import Any, cast, TYPE_CHECKING +import sqlglot import sqlparse from flask_babel import gettext as __ from jinja2 import nodes from sqlalchemy import and_ from sqlglot.dialects.dialect import Dialects +from sqlglot.errors import ParseError from sqlparse import keywords from sqlparse.lexer import Lexer from sqlparse.sql import ( @@ -42,7 +44,6 @@ Where, ) from sqlparse.tokens import ( - Comment, CTE, DDL, DML, @@ -257,6 +258,7 @@ def __init__( sql_statement: str, engine: str = "base", ): + sql_statement = sqlglot.transpile(sql_statement) self.sql: str = sql_statement self._engine = engine self._dialect = SQLGLOT_DIALECTS.get(engine) if engine else None @@ -579,30 +581,35 @@ def set_or_update_query_limit(self, new_limit: int, force: bool = False) -> str: def sanitize_clause(clause: str) -> str: # clause = sqlparse.format(clause, strip_comments=True) - statements = sqlparse.parse(clause) + try: + statements = sqlglot.transpile(clause, pretty=True) + except Exception as p_err: + if isinstance(p_err, ParseError): + raise QueryClauseValidationException(str(p_err)) from p_err + raise ValueError(str(p_err)) from None if len(statements) != 1: raise QueryClauseValidationException("Clause contains multiple statements") - open_parens = 0 - - previous_token = None - for token in statements[0]: - if token.value == "/" and previous_token and previous_token.value == "*": - raise QueryClauseValidationException("Closing unopened multiline comment") - if token.value == "*" and previous_token and previous_token.value == "/": - raise QueryClauseValidationException("Unclosed multiline comment") - if token.value in (")", "("): - open_parens += 1 if token.value == "(" else -1 - if open_parens < 0: - raise QueryClauseValidationException( - "Closing unclosed parenthesis in filter clause" - ) - previous_token = token - if open_parens > 0: - raise QueryClauseValidationException("Unclosed parenthesis in filter clause") - - if previous_token and previous_token.ttype in Comment: - if previous_token.value[-1] != "\n": - clause = f"{clause}\n" + # open_parens = 0 + + # previous_token = None + # for token in statements[0]: + # if token.value == "/" and previous_token and previous_token.value == "*": + # raise QueryClauseValidationException("Closing unopened multiline comment") + # if token.value == "*" and previous_token and previous_token.value == "/": + # raise QueryClauseValidationException("Unclosed multiline comment") + # if token.value in (")", "("): + # open_parens += 1 if token.value == "(" else -1 + # if open_parens < 0: + # raise QueryClauseValidationException( + # "Closing unclosed parenthesis in filter clause" + # ) + # previous_token = token + # if open_parens > 0: + # raise QueryClauseValidationException("Unclosed parenthesis in filter clause") + + # if previous_token and previous_token.ttype in Comment: + # if previous_token.value[-1] != "\n": + # clause = f"{clause}\n" return clause diff --git a/tests/unit_tests/sql_parse_tests.py b/tests/unit_tests/sql_parse_tests.py index 4557e220b8b4a..2054ccd8d4d67 100644 --- a/tests/unit_tests/sql_parse_tests.py +++ b/tests/unit_tests/sql_parse_tests.py @@ -20,6 +20,7 @@ from unittest.mock import Mock import pytest +import sqlglot import sqlparse from pytest_mock import MockerFixture from sqlalchemy import text @@ -1168,17 +1169,17 @@ def test_messy_breakdown_statements() -> None: ] -def test_sqlparse_formatting(): +def test_sqlglot_formatting(): """ Test that ``from_unixtime`` is formatted correctly. """ - assert sqlparse.format( + assert sqlglot.transpile( "SELECT extract(HOUR from from_unixtime(hour_ts) " "AT TIME ZONE 'America/Los_Angeles') from table", - reindent=True, - ) == ( - "SELECT extract(HOUR\n from from_unixtime(hour_ts) " - "AT TIME ZONE 'America/Los_Angeles')\nfrom table" + pretty=True, + )[0] == ( + "SELECT\n EXTRACT(HOUR FROM FROM_UNIXTIME(hour_ts) AT TIME ZONE 'America/Los_Angeles')" + "\nFROM table" )