Skip to content

Commit

Permalink
feat: generate hashed data and repeat X times
Browse files Browse the repository at this point in the history
  • Loading branch information
ejblanco committed Nov 19, 2024
1 parent e0f0d9d commit 414f13b
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM gradle:8.1-jdk11

RUN apt-get update && \
apt-get install -y python3.6 python3-pip jq gettext-base
apt-get install -y python3.6 python3-pip jq gettext-base uuid-runtime

RUN python3 -m pip install deepmerge

Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,21 @@ if <restart> is equal to <start>. If provided with a boolean
schema, only <start> may be specified; the resulting values will
begin with <start> and alternate from `true` to `false` and from
`false` to `true` from that point on.
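+ __hashed:__ If this option is set to `true`, each value produced by the iteration is replaced with its SHA-256 hash, written as a lowercase hexadecimal string. This option applies to string schemas.
This is useful when you want to generate IDs that can be joined with other datasets.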
+ __num_repetitions:__ Number of consecutive times each element is emitted before the iteration moves on to the next one. It is read only when `hashed` is `true`.
This is useful when you want to generate a dataset with a lot of repeated IDs.
```json
"arg.properties": {
"iteration": {
"start": "0"
},
"hashed": true,
"num_repetitions": 4
}
```
> ITERATION_STEP environment var can be used as script argument
+ __range:__ A JSON object that conforms to the following formats:
- `{"min": <min>, "max": <max>}` (at least one of "min" or "max" must be
specified). If provided, ensures that the generated number will be
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;

/**
Expand Down Expand Up @@ -93,6 +95,15 @@ public class Generator {
*/
public static final String SUFFIX_PROP = "suffix";

/**
* The name of the attribute for specifying whether each value produced by an iteration should be
* replaced with its hash. Given as a boolean.
*/
public static final String HASHED_PROP = "hashed";

/**
* The name of the attribute for specifying how many consecutive times each hashed iteration value
* is emitted before moving on to the next one. Given as an integer.
*/
public static final String REPEAT_PROP = "num_repetitions";
/**
* The name of the attribute for specifying specific values which should be randomly chosen from
* when generating values for the schema. Can be given as either an array of values or an object
Expand Down Expand Up @@ -1042,6 +1053,9 @@ private Iterator<Object> parseIterations(Schema schema, Map propertiesProp) {
case DOUBLE:
return getDoubleIterator(iterationProps);
case STRING:
if (propertiesProp.containsKey(HASHED_PROP) && (Boolean) propertiesProp.get(HASHED_PROP)) {
return createHashedStringIterator(getIntegerIterator(iterationProps), propertiesProp);
}
return createStringIterator(getIntegerIterator(iterationProps), propertiesProp);
default:
throw new UnsupportedOperationException(String.format(
Expand Down Expand Up @@ -1157,6 +1171,53 @@ public Object next() {
};
}

/**
 * Computes the SHA-256 digest of a number and renders it as a hexadecimal string.
 *
 * <p>The bytes fed to the digest are those returned by {@link BigInteger#toByteArray()}.
 *
 * @param number the value to hash
 * @return the digest as lowercase hexadecimal, two characters per digest byte
 * @throws RuntimeException if the SHA-256 algorithm is not available in this runtime
 */
private String hashNumber(BigInteger number) {
  final MessageDigest sha256;
  try {
    sha256 = MessageDigest.getInstance("SHA-256");
  } catch (NoSuchAlgorithmException e) {
    throw new RuntimeException(e);
  }

  final byte[] digest = sha256.digest(number.toByteArray());
  final StringBuilder encoded = new StringBuilder(digest.length * 2);
  for (final byte octet : digest) {
    // Emit the high nibble first, then the low nibble, so every byte yields exactly two digits.
    encoded.append(Character.forDigit((octet >> 4) & 0xf, 16));
    encoded.append(Character.forDigit(octet & 0xf, 16));
  }
  return encoded.toString();
}

/**
 * Wraps an integer iterator so that each value it yields is hashed with {@link #hashNumber} and
 * emitted a configurable number of consecutive times.
 *
 * <p>The repetition count is read from {@code propertiesProp} under {@link #REPEAT_PROP}; when the
 * lookup yields {@code null}, each hash is emitted exactly once.
 *
 * @param inner the source of values to hash; every element is cast to {@code Integer}
 * @param propertiesProp the properties map the repetition count is read from
 * @return an iterator over hashed strings; it reports more elements while repetitions of the
 *     current hash are still pending or {@code inner} has further values
 * @throws IllegalArgumentException if the configured repetition count is less than 1
 */
private Iterator<Object> createHashedStringIterator(Iterator<Object> inner, Map propertiesProp) {
  final Integer repeatValue = getIntegerNumberField(
      HASHED_PROP,
      REPEAT_PROP,
      propertiesProp
  );
  final int numRepeat = repeatValue == null ? 1 : repeatValue;
  // Zero used to fail later with a division by zero and a negative count never advanced past
  // the first value; reject both up front with a clear message.
  if (numRepeat < 1) {
    throw new IllegalArgumentException(String.format(
        "%s must be a positive integer, got %d",
        REPEAT_PROP,
        numRepeat
    ));
  }

  return new Iterator<Object>() {
    /** How many more times {@code currentValue} must still be returned. */
    private int remaining = 0;
    private String currentValue = "";

    @Override
    public boolean hasNext() {
      // Pending repetitions count as remaining elements even if the source is exhausted;
      // otherwise the last value of a finite source would be emitted only once.
      return remaining > 0 || inner.hasNext();
    }

    @Override
    public Object next() {
      if (remaining == 0) {
        currentValue = hashNumber(BigInteger.valueOf((Integer) inner.next()));
        remaining = numRepeat;
      }
      remaining--;
      return currentValue;
    }
  };
}

private Iterator<Object> getIntegerIterator(Map iterationProps) {
Integer iterationStartField = getIntegerNumberField(
ITERATION_PROP,
Expand Down
3 changes: 2 additions & 1 deletion lanuza/scripts/build-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ function run() {
python3 lib/merge-schemas.py -s "$schema_orig" -e "$schema_extension" --out "${merged_schema_path}"

export DATE_RANGE_START="${START_DATE}"
local uuid=$(uuidgen)
while [ "${DATE_RANGE_START}" != "${END_DATE}" ]; do
export DATE_RANGE_END=$(date -I -d "${DATE_RANGE_START} + 1 day")

Expand All @@ -75,7 +76,7 @@ function run() {
if [ "${SHOW}" = "true" ]; then
ARGS="-j -p"
else
out="${out_dir}/${DATASET_ID}.${FILE_PREFIX}.${counter}_${DATE_RANGE_START}.avro"
out="${out_dir}/${DATASET_ID}.${FILE_PREFIX}.${counter}_${uuid}_${DATE_RANGE_START}.avro"
touch $out
ARGS="-b -o ${out}"
fi
Expand Down

0 comments on commit 414f13b

Please sign in to comment.