Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature new escape treatment #173

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
3 changes: 2 additions & 1 deletion .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@ jobs:
with:
files: ./coverage.xml
name: dbldatagen
verbose: true
verbose: true
token: ${{ secrets.CODECOV_TOKEN }}
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in

#### Changed
* Fixed use of logger in _version.py and in spark_singleton.py
* Fixed template issues

### Version 0.3.2

Expand Down
248 changes: 135 additions & 113 deletions dbldatagen/text_generators.py

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions docs/source/APIDOCS.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,11 @@ dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8,
.withSchema(table_schema))

dataspec = (dataspec
.withColumnSpec("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumnSpec("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumnSpec("serial_number", minValue=1000000, maxValue=10000000,
prefix="dr", random=True)
.withColumnSpec("email", template=r'\\w.\\w@\\w.com')
.withColumnSpec("license_plate", template=r'\\n-\\n')
.withColumnSpec("email", template=r'\w.\w@\w.com')
.withColumnSpec("license_plate", template=r'\n-\n')
)
df1 = dataspec.build()

Expand Down Expand Up @@ -472,7 +472,7 @@ data_rows = 10000000
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
random=True)
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
Expand All @@ -481,7 +481,7 @@ dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMeth
.withColumn("payment_instrument",
expr="format_number(int_payment_instrument, '**** ****** *####')",
baseColumn="int_payment_instrument")
.withColumn("email", template=r'\\w.\\w@\\w.com')
.withColumn("email", template=r'\w.\w@\w.com')
.withColumn("md5_payment_instrument",
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
baseColumn=['payment_instrument_type', 'payment_instrument'])
Expand Down Expand Up @@ -524,7 +524,7 @@ spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
dataspec = (
dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname",
randomSeed=42)
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
random=True)
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
Expand All @@ -533,7 +533,7 @@ dataspec = (
.withColumn("payment_instrument",
expr="format_number(int_payment_instrument, '**** ****** *####')",
baseColumn="int_payment_instrument")
.withColumn("email", template=r'\\w.\\w@\\w.com')
.withColumn("email", template=r'\w.\w@\w.com')
.withColumn("md5_payment_instrument",
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
baseColumn=['payment_instrument_type', 'payment_instrument'])
Expand Down
10 changes: 5 additions & 5 deletions docs/source/generating_cdc_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,18 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh

dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
.withColumn("customer_id","long", uniqueValues=uniqueCustomers)
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
.withColumn("name", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("alias", percentNulls=0.01, template=r'\w \w|\w a. \w')
.withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
'American Express', 'discover', 'branded visa', 'branded mastercard'],
random=True, distribution="normal")
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id",
baseColumnType="hash", omit=True)
.withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
baseColumn="int_payment_instrument")
.withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
.withColumn("email2", template=r'\\w.\\w@\\w.com')
.withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
.withColumn("email", template=r'\w.\w@\w.com|\w-\w@\w')
.withColumn("email2", template=r'\w.\w@\w.com')
.withColumn("ip_address", template=r'\n.\n.\n.\n')
.withColumn("md5_payment_instrument",
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
baseColumn=['payment_instrument_type', 'payment_instrument'])
Expand Down
2 changes: 1 addition & 1 deletion docs/source/multi_table_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ when using hashed values, the range of the hashes produced can be large.
customer_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
.withColumn("customer_id","decimal(10)", minValue=CUSTOMER_MIN_VALUE,
uniqueValues=UNIQUE_CUSTOMERS)
.withColumn("customer_name", template=r"\\w \\w|\\w a. \\w")
.withColumn("customer_name", template=r"\w \w|\w a. \w")

# use the following for a simple sequence
#.withColumn("device_id","decimal(10)", minValue=DEVICE_MIN_VALUE,
Expand Down
9 changes: 8 additions & 1 deletion docs/source/textdata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,13 @@ If set to False, then the template ``r"\\dr_\\v"`` will generate the values ``"d
to the values zero to 999. This conforms to earlier implementations for backwards compatibility.

If set to True, then the template ``r"dr_\\v"`` will generate the values ``"dr_0"`` ... ``"dr_999"``
when applied to the values zero to 999. This conforms to the preferred style going forward
when applied to the values zero to 999. This conforms to the preferred style going forward. In other words, the
character ``d`` will not be treated as a special character.

.. note::
   The legacy mode of operation has a bug in which the template sequence ``r'\\a'`` produces the same result as
   ``r'\a'``. This legacy behavior can be disabled by setting the parameter ``legacyEscapeTreatment`` to ``False``
   on the ``TemplateTextGenerator`` object. The parameter defaults to ``True``.



Loading