From c6ca239810373429ea72834132d1ca614f3fd76e Mon Sep 17 00:00:00 2001
From: Steve Russo <64294847+sjrusso8@users.noreply.github.com>
Date: Mon, 2 Jan 2023 10:54:51 -0500
Subject: [PATCH] feat(databricks): add initial databricks syntax

---
 sqlparse/filters/aligned_indent.py |  2 +-
 sqlparse/keywords.py               | 28 ++++++++++++++
 sqlparse/lexer.py                  |  1 +
 sqlparse/sql.py                    |  2 +-
 tests/test_parse.py                | 61 ++++++++++++++++++++++++++++++
 tests/test_tokenize.py             | 22 ++++++-----
 6 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/sqlparse/filters/aligned_indent.py b/sqlparse/filters/aligned_indent.py
index dc609263..ee20a5a8 100644
--- a/sqlparse/filters/aligned_indent.py
+++ b/sqlparse/filters/aligned_indent.py
@@ -13,7 +13,7 @@ class AlignedIndentFilter:
     join_words = (r'((LEFT\s+|RIGHT\s+|FULL\s+)?'
                   r'(INNER\s+|OUTER\s+|STRAIGHT\s+)?|'
                   r'(CROSS\s+|NATURAL\s+)?)?JOIN\b')
-    by_words = r'(GROUP|ORDER)\s+BY\b'
+    by_words = r'(GROUP|ORDER|ZORDER)\s+BY\b'
     split_words = ('FROM',
                    join_words, 'ON', by_words,
                    'WHERE', 'AND', 'OR',
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py
index f04f928e..ed9a4dc0 100644
--- a/sqlparse/keywords.py
+++ b/sqlparse/keywords.py
@@ -77,6 +77,14 @@
     (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
     (r'GROUP\s+BY\b', tokens.Keyword),
     (r'ORDER\s+BY\b', tokens.Keyword),
+    (r'ZORDER\s+BY\b', tokens.Keyword),
+    (r'PARTITIONED\s+BY\b', tokens.Keyword),
+    (r'SORTED\s+BY\b', tokens.Keyword),
+    (r'CLUSTERED\s+BY\b', tokens.Keyword),
+    (r'WITH\s+DBPROPERTIES\b', tokens.Keyword),
+    (r'BLOOMFILTER\s+INDEX\b', tokens.Keyword),
+    (r'(DEEP|SHALLOW)\s+CLONE\b', tokens.Keyword),
+    (r'(MSCK|FSCK)\s+REPAIR\b', tokens.Keyword),
     (r'HANDLER\s+FOR\b', tokens.Keyword),
     (r'(LATERAL\s+VIEW\s+)'
      r'(EXPLODE|INLINE|PARSE_URL_TUPLE|POSEXPLODE|STACK)\b',
@@ -959,3 +967,23 @@
 KEYWORDS_MSACCESS = {
     'DISTINCTROW': tokens.Keyword,
 }
+
+# Databricks & SparkSQL Syntax
+# see https://docs.databricks.com/sql/language-manual/index.html
+# see https://spark.apache.org/docs/latest/sql-ref-syntax.html
+KEYWORDS_DBX = {
+    'BLOOMFILTER': tokens.Keyword,
+    'BUCKETS': tokens.Keyword,
+    'DBPROPERTIES': tokens.Keyword,
+    'DETAIL': tokens.Keyword,
+    'HISTORY': tokens.Keyword,
+    'METADATA': tokens.Keyword,
+    'MSCK': tokens.Keyword,
+    'OPTIMIZE': tokens.Keyword,
+    'PARTITIONS': tokens.Keyword,
+    'REFRESH': tokens.Keyword,
+    'REPAIR': tokens.Keyword,
+    'SYNC': tokens.Keyword,
+    'VACUUM': tokens.Keyword,
+    'ZORDER': tokens.Keyword
+}
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 9d25c9e6..59873764 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -62,6 +62,7 @@ def default_initialization(self):
         self.add_keywords(keywords.KEYWORDS_PLPGSQL)
         self.add_keywords(keywords.KEYWORDS_HQL)
         self.add_keywords(keywords.KEYWORDS_MSACCESS)
+        self.add_keywords(keywords.KEYWORDS_DBX)
         self.add_keywords(keywords.KEYWORDS)
 
     def clear(self):
diff --git a/sqlparse/sql.py b/sqlparse/sql.py
index 586cd216..fb7e1934 100644
--- a/sqlparse/sql.py
+++ b/sqlparse/sql.py
@@ -550,7 +550,7 @@ class Where(TokenList):
     M_OPEN = T.Keyword, 'WHERE'
     M_CLOSE = T.Keyword, (
         'ORDER BY', 'GROUP BY', 'LIMIT', 'UNION', 'UNION ALL', 'EXCEPT',
-        'HAVING', 'RETURNING', 'INTO')
+        'HAVING', 'RETURNING', 'INTO', 'ZORDER BY')
 
 
 class Having(TokenList):
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 5feef5a7..809c2103 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -566,3 +566,64 @@ def test_configurable_regex():
         for t in tokens
         if t.ttype not in sqlparse.tokens.Whitespace
     )[4] == (sqlparse.tokens.Keyword, "zorder by")
+
+
+def test_spark_schema_create():
+    s = """CREATE SCHEMA IF NOT EXISTS database_name
+        COMMENT "my database comment"
+        LOCATION "/mnt/path/to/db"
+        WITH DBPROPERTIES (property_name=property_value) ;
+        """
+
+    tokens = sqlparse.parse(s)[0].tokens
+
+    token_values = [(t.value, t.ttype) for t in tokens if t.ttype not in (None, T.Whitespace, T.Text.Whitespace.Newline)]
+
+    assert token_values == [('CREATE', T.Keyword.DDL),
+                            ('SCHEMA', T.Keyword),
+                            ('IF', T.Keyword),
+                            ('NOT', T.Keyword),
+                            ('EXISTS', T.Keyword),
+                            ('COMMENT', T.Keyword),
+                            ('LOCATION', T.Keyword),
+                            ('WITH DBPROPERTIES', T.Keyword),
+                            (';', T.Punctuation)]
+
+
+def test_spark_table_create():
+    s = """CREATE TABLE IF NOT EXISTS database_name.table_identifier
+        (
+            col_name1 int COMMENT "woah, cool column",
+            b string
+        )
+        USING DELTA
+        OPTIONS ( key1=val1, key2=val2 )
+        PARTITIONED BY ( col_name1 )
+        CLUSTERED BY ( b )
+        SORTED BY ( col_name1 DESC )
+        INTO 4 BUCKETS
+        LOCATION "/mnt/path/to/db/tbl"
+        COMMENT "nice table"
+        TBLPROPERTIES ( key1=val1, key2=val2 )
+        """
+
+    tokens = sqlparse.parse(s)[0].tokens
+
+    token_values = [(t.value, t.ttype) for t in tokens if t.ttype not in (None, T.Whitespace, T.Text.Whitespace.Newline)]
+
+    assert token_values == [('CREATE', T.Keyword.DDL),
+                            ('TABLE', T.Keyword),
+                            ('IF', T.Keyword),
+                            ('NOT', T.Keyword),
+                            ('EXISTS', T.Keyword),
+                            ('USING', T.Keyword),
+                            ('OPTIONS', T.Keyword),
+                            ('PARTITIONED BY', T.Keyword),
+                            ('CLUSTERED BY', T.Keyword),
+                            ('SORTED BY', T.Keyword),
+                            ('INTO', T.Keyword),
+                            ('4', T.Literal.Number.Integer),
+                            ('BUCKETS', T.Keyword),
+                            ('LOCATION', T.Keyword),
+                            ('COMMENT', T.Keyword),
+                            ('TBLPROPERTIES', T.Keyword)]
\ No newline at end of file
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index af0ba163..b29e8b00 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -189,19 +189,21 @@ def test_parse_identifiers(s):
     assert str(token) == s
     assert isinstance(token, sql.Identifier)
 
-
-def test_parse_group_by():
-    p = sqlparse.parse('GROUP BY')[0]
-    assert len(p.tokens) == 1
-    assert p.tokens[0].ttype is T.Keyword
-
-
-def test_parse_order_by():
-    p = sqlparse.parse('ORDER BY')[0]
+
+@pytest.mark.parametrize('s', [
+    'GROUP BY',
+    'ORDER BY',
+    'ZORDER BY',
+    'PARTITIONED BY',
+    'SORTED BY',
+    'CLUSTERED BY'
+])
+def test_parse_by_statements(s):
+    p = sqlparse.parse(s)[0]
     assert len(p.tokens) == 1
     assert p.tokens[0].ttype is T.Keyword
-
+
 
 def test_parse_window_as():
     p = sqlparse.parse('WINDOW w AS')[0]
     assert len(p.tokens) == 5
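
A quick end-to-end smoke check of the additions above (illustrative only;
the statement and the 'events' table name are made up and are not part of
the patch or its test suite). With KEYWORDS_DBX registered in the default
lexer, OPTIMIZE should come back as a plain Keyword token, and ZORDER BY
should be matched as a single multi-word keyword by the new SQL_REGEX entry:

    # Assumes this patch is applied on top of sqlparse.
    import sqlparse
    from sqlparse import tokens as T

    # 'events' and 'event_time' are hypothetical names for illustration.
    stmt = sqlparse.parse('OPTIMIZE events ZORDER BY (event_time)')[0]

    # Collect only the keyword leaves; names and punctuation are skipped.
    kws = [t.value for t in stmt.flatten() if t.ttype is T.Keyword]
    print(kws)  # expected: ['OPTIMIZE', 'ZORDER BY']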