From c6ca239810373429ea72834132d1ca614f3fd76e Mon Sep 17 00:00:00 2001
From: Steve Russo <64294847+sjrusso8@users.noreply.github.com>
Date: Mon, 2 Jan 2023 10:54:51 -0500
Subject: [PATCH] feat(databricks): add initial databricks syntax

---
 sqlparse/filters/aligned_indent.py |  2 +-
 sqlparse/keywords.py               | 28 ++++++++++++++
 sqlparse/lexer.py                  |  1 +
 sqlparse/sql.py                    |  2 +-
 tests/test_parse.py                | 61 ++++++++++++++++++++++++++++++
 tests/test_tokenize.py             | 22 ++++++-----
 6 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/sqlparse/filters/aligned_indent.py b/sqlparse/filters/aligned_indent.py
index dc609263..ee20a5a8 100644
--- a/sqlparse/filters/aligned_indent.py
+++ b/sqlparse/filters/aligned_indent.py
@@ -13,7 +13,7 @@ class AlignedIndentFilter:
     join_words = (r'((LEFT\s+|RIGHT\s+|FULL\s+)?'
                   r'(INNER\s+|OUTER\s+|STRAIGHT\s+)?|'
                   r'(CROSS\s+|NATURAL\s+)?)?JOIN\b')
-    by_words = r'(GROUP|ORDER)\s+BY\b'
+    by_words = r'(GROUP|ORDER|ZORDER)\s+BY\b'
     split_words = ('FROM',
                    join_words, 'ON', by_words,
                    'WHERE', 'AND', 'OR',
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py
index f04f928e..ed9a4dc0 100644
--- a/sqlparse/keywords.py
+++ b/sqlparse/keywords.py
@@ -77,6 +77,14 @@
     (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
     (r'GROUP\s+BY\b', tokens.Keyword),
     (r'ORDER\s+BY\b', tokens.Keyword),
+    (r'ZORDER\s+BY\b', tokens.Keyword),
+    (r'PARTITIONED\s+BY\b', tokens.Keyword),
+    (r'SORTED\s+BY\b', tokens.Keyword),
+    (r'CLUSTERED\s+BY\b', tokens.Keyword),
+    (r'WITH\s+DBPROPERTIES\b', tokens.Keyword),
+    (r'BLOOMFILTER\s+INDEX\b', tokens.Keyword),
+    (r'(DEEP|SHALLOW)\s+CLONE\b', tokens.Keyword),
+    (r'(MSCK|FSCK)\s+REPAIR\b', tokens.Keyword),
     (r'HANDLER\s+FOR\b', tokens.Keyword),
     (r'(LATERAL\s+VIEW\s+)'
      r'(EXPLODE|INLINE|PARSE_URL_TUPLE|POSEXPLODE|STACK)\b',
@@ -959,3 +967,23 @@
 KEYWORDS_MSACCESS = {
     'DISTINCTROW': tokens.Keyword,
 }
+
+# Databricks & SparkSQL Syntax
+# see https://docs.databricks.com/sql/language-manual/index.html
+# see https://spark.apache.org/docs/latest/sql-ref-syntax.html
+KEYWORDS_DBX = {
+    'BLOOMFILTER': tokens.Keyword,
+    'BUCKETS': tokens.Keyword,
+    'DBPROPERTIES': tokens.Keyword,
+    'DETAIL': tokens.Keyword,
+    'HISTORY': tokens.Keyword,
+    'METADATA': tokens.Keyword,
+    'MSCK': tokens.Keyword,
+    'OPTIMIZE': tokens.Keyword,
+    'PARTITIONS': tokens.Keyword,
+    'REFRESH': tokens.Keyword,
+    'REPAIR': tokens.Keyword,
+    'SYNC': tokens.Keyword,
+    'VACUUM': tokens.Keyword,
+    'ZORDER': tokens.Keyword
+}
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 9d25c9e6..59873764 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -62,6 +62,7 @@ def default_initialization(self):
         self.add_keywords(keywords.KEYWORDS_PLPGSQL)
         self.add_keywords(keywords.KEYWORDS_HQL)
         self.add_keywords(keywords.KEYWORDS_MSACCESS)
+        self.add_keywords(keywords.KEYWORDS_DBX)
         self.add_keywords(keywords.KEYWORDS)
 
     def clear(self):
diff --git a/sqlparse/sql.py b/sqlparse/sql.py
index 586cd216..fb7e1934 100644
--- a/sqlparse/sql.py
+++ b/sqlparse/sql.py
@@ -550,7 +550,7 @@ class Where(TokenList):
     M_OPEN = T.Keyword, 'WHERE'
     M_CLOSE = T.Keyword, (
         'ORDER BY', 'GROUP BY', 'LIMIT', 'UNION', 'UNION ALL', 'EXCEPT',
-        'HAVING', 'RETURNING', 'INTO')
+        'HAVING', 'RETURNING', 'INTO', 'ZORDER BY')
 
 
 class Having(TokenList):
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 5feef5a7..809c2103 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -566,3 +566,64 @@ def test_configurable_regex():
         for t in tokens
         if t.ttype not in sqlparse.tokens.Whitespace
     )[4] == (sqlparse.tokens.Keyword, "zorder by")
+
+
+def test_spark_schema_create():
+    s = """CREATE SCHEMA IF NOT EXISTS database_name
+        COMMENT "my database comment"
+        LOCATION "/mnt/path/to/db"
+        WITH DBPROPERTIES (property_name=property_value) ;
+        """
+
+    tokens = sqlparse.parse(s)[0].tokens
+
+    token_values = [(t.value, t.ttype) for t in tokens if t.ttype not in (None, T.Whitespace, T.Text.Whitespace.Newline)]
+
+    assert token_values == [('CREATE', T.Keyword.DDL),
+                            ('SCHEMA', T.Keyword),
+                            ('IF', T.Keyword),
+                            ('NOT', T.Keyword),
+                            ('EXISTS', T.Keyword),
+                            ('COMMENT', T.Keyword),
+                            ('LOCATION', T.Keyword),
+                            ('WITH DBPROPERTIES', T.Keyword),
+                            (';', T.Punctuation)]
+
+
+def test_spark_table_create():
+    s = """CREATE TABLE IF NOT EXISTS database_name.table_identifier
+        (
+            col_name1 int COMMENT "woah, cool column",
+            b string
+        )
+        USING DELTA
+        OPTIONS ( key1=val1, key2=val2 )
+        PARTITIONED BY ( col_name1 )
+        CLUSTERED BY ( b )
+        SORTED BY ( col_name1 DESC )
+        INTO 4 BUCKETS
+        LOCATION "/mnt/path/to/db/tbl"
+        COMMENT "nice table"
+        TBLPROPERTIES ( key1=val1, key2=val2 )
+        """
+
+    tokens = sqlparse.parse(s)[0].tokens
+
+    token_values = [(t.value, t.ttype) for t in tokens if t.ttype not in (None, T.Whitespace, T.Text.Whitespace.Newline)]
+
+    assert token_values == [('CREATE', T.Keyword.DDL),
+                            ('TABLE', T.Keyword),
+                            ('IF', T.Keyword),
+                            ('NOT', T.Keyword),
+                            ('EXISTS', T.Keyword),
+                            ('USING', T.Keyword),
+                            ('OPTIONS', T.Keyword),
+                            ('PARTITIONED BY', T.Keyword),
+                            ('CLUSTERED BY', T.Keyword),
+                            ('SORTED BY', T.Keyword),
+                            ('INTO', T.Keyword),
+                            ('4', T.Literal.Number.Integer),
+                            ('BUCKETS', T.Keyword),
+                            ('LOCATION', T.Keyword),
+                            ('COMMENT', T.Keyword),
+                            ('TBLPROPERTIES', T.Keyword)]
\ No newline at end of file
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index af0ba163..b29e8b00 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -189,19 +189,21 @@ def test_parse_identifiers(s):
     assert str(token) == s
     assert isinstance(token, sql.Identifier)
 
-
-def test_parse_group_by():
-    p = sqlparse.parse('GROUP BY')[0]
-    assert len(p.tokens) == 1
-    assert p.tokens[0].ttype is T.Keyword
-
-
-def test_parse_order_by():
-    p = sqlparse.parse('ORDER BY')[0]
+
+@pytest.mark.parametrize('s', [
+    'GROUP BY',
+    'ORDER BY',
+    'ZORDER BY',
+    'PARTITIONED BY',
+    'SORTED BY',
+    'CLUSTERED BY'
+])
+def test_parse_by_statements(s):
+    p = sqlparse.parse(s)[0]
     assert len(p.tokens) == 1
     assert p.tokens[0].ttype is T.Keyword
-
+
 
 def test_parse_window_as():
     p = sqlparse.parse('WINDOW w AS')[0]
     assert len(p.tokens) == 5
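
A quick end-to-end smoke check of the additions above (illustrative only;
the statement and the 'events' table name are made up and are not part of
the patch or its test suite). With KEYWORDS_DBX registered in the default
lexer, OPTIMIZE should come back as a plain Keyword token, and ZORDER BY
should be matched as a single multi-word keyword by the new SQL_REGEX entry:

    # Assumes this patch is applied on top of sqlparse.
    import sqlparse
    from sqlparse import tokens as T

    # 'events' and 'event_time' are hypothetical names for illustration.
    stmt = sqlparse.parse('OPTIMIZE events ZORDER BY (event_time)')[0]

    # Collect only the keyword leaves; names and punctuation are skipped.
    kws = [t.value for t in stmt.flatten() if t.ttype is T.Keyword]
    print(kws)  # expected: ['OPTIMIZE', 'ZORDER BY']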