Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature new escape treatment #173

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
3 changes: 2 additions & 1 deletion .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@ jobs:
with:
files: ./coverage.xml
name: dbldatagen
verbose: true
verbose: true
token: ${{ secrets.CODECOV_TOKEN }}
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in

#### Changed
* Fixed use of logger in _version.py and in spark_singleton.py
* Fixed template issues

### Version 0.3.2

Expand Down
248 changes: 135 additions & 113 deletions dbldatagen/text_generators.py

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions docs/source/APIDOCS.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,11 @@ dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8,
.withSchema(table_schema))

dataspec = (dataspec
.withColumnSpec("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumnSpec("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumnSpec("serial_number", minValue=1000000, maxValue=10000000,
prefix="dr", random=True)
.withColumnSpec("email", template=r'\\w.\\w@\\w.com')
.withColumnSpec("license_plate", template=r'\\n-\\n')
.withColumnSpec("email", template=r'\w.\w@\w.com')
.withColumnSpec("license_plate", template=r'\n-\n')
)
df1 = dataspec.build()

Expand Down Expand Up @@ -472,7 +472,7 @@ data_rows = 10000000
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
random=True)
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
Expand All @@ -481,7 +481,7 @@ dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMeth
.withColumn("payment_instrument",
expr="format_number(int_payment_instrument, '**** ****** *####')",
baseColumn="int_payment_instrument")
.withColumn("email", template=r'\\w.\\w@\\w.com')
.withColumn("email", template=r'\w.\w@\w.com')
.withColumn("md5_payment_instrument",
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
baseColumn=['payment_instrument_type', 'payment_instrument'])
Expand Down Expand Up @@ -524,7 +524,7 @@ spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
dataspec = (
dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname",
randomSeed=42)
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
random=True)
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
Expand All @@ -533,7 +533,7 @@ dataspec = (
.withColumn("payment_instrument",
expr="format_number(int_payment_instrument, '**** ****** *####')",
baseColumn="int_payment_instrument")
.withColumn("email", template=r'\\w.\\w@\\w.com')
.withColumn("email", template=r'\w.\w@\w.com')
.withColumn("md5_payment_instrument",
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
baseColumn=['payment_instrument_type', 'payment_instrument'])
Expand Down
10 changes: 5 additions & 5 deletions docs/source/generating_cdc_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,18 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh

dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
.withColumn("customer_id","long", uniqueValues=uniqueCustomers)
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("alias", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
'American Express', 'discover', 'branded visa', 'branded mastercard'],
random=True, distribution="normal")
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id",
baseColumnType="hash", omit=True)
.withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
baseColumn="int_payment_instrument")
.withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
.withColumn("email2", template=r'\\w.\\w@\\w.com')
.withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
.withColumn("email", template=r'\w.\w@\w.com|\w-\w@\w')
.withColumn("email2", template=r'\w.\w@\w.com')
.withColumn("ip_address", template=r'\n.\n.\n.\n')
.withColumn("md5_payment_instrument",
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
baseColumn=['payment_instrument_type', 'payment_instrument'])
Expand Down
2 changes: 1 addition & 1 deletion docs/source/multi_table_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ when using hashed values, the range of the hashes produced can be large.
customer_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
.withColumn("customer_id","decimal(10)", minValue=CUSTOMER_MIN_VALUE,
uniqueValues=UNIQUE_CUSTOMERS)
.withColumn("customer_name", template=r"\\w \\w|\\w a. \\w")
.withColumn("customer_name", template=r"\w \w|\w a. \w")

# use the following for a simple sequence
#.withColumn("device_id","decimal(10)", minValue=DEVICE_MIN_VALUE,
Expand Down
9 changes: 8 additions & 1 deletion docs/source/textdata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,13 @@ If set to False, then the template ``r"\\dr_\\v"`` will generate the values ``"d
to the values zero to 999. This conforms to earlier implementations for backwards compatibility.

If set to True, then the template ``r"dr_\\v"`` will generate the values ``"dr_0"`` ... ``"dr_999"``
when applied to the values zero to 999. This conforms to the preferred style going forward
when applied to the values zero to 999. This conforms to the preferred style going forward. In other words, the
character ``d`` will not be treated as a special character.

.. note::
   The legacy mode of operation has a bug in which the template sequence ``r'\\a'`` produces the same result as
   ``r'\a'``. This legacy behavior can be disabled by setting the parameter ``legacyEscapeTreatment`` to ``False``
   on the ``TemplateTextGenerator`` object. The parameter defaults to ``True``.



Loading