Skip to content

Commit

Permalink
WIP: Experiment with binary strings
Browse files Browse the repository at this point in the history
  • Loading branch information
nicowilliams committed Jul 21, 2023
1 parent 99a77f7 commit 7db1557
Show file tree
Hide file tree
Showing 15 changed files with 604 additions and 153 deletions.
49 changes: 48 additions & 1 deletion docs/content/manual/manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,9 @@ sections:
`null` can be added to any value, and returns the other
value unchanged.
A numeric byte value between 0 and 255, inclusive, can be
added to a binary string value.
examples:
- program: '.a + 1'
input: '{"a": 7}'
Expand Down Expand Up @@ -1414,13 +1417,55 @@ sections:
input: '[1, "1", [1]]'
output: ['"1"', '"1"', '"[1]"']

- title: "`tobinary`"
body: |
The `tobinary` function is like `tostring`, but its output
is a binary string: when written to jq's output stream it
will be base64-encoded, and adding it to other strings will
produce a binary string value.
Internally the binary string may be represented efficiently,
and may not be encoded until it is output or until it is
passed to `tostring`. Adding a byte value (integer value
between 0 and 255, inclusive) to a binary string is allowed,
and will append that byte to it.
- title: "`tobinary_bytearray`"
body: |
The `tobinary_bytearray` function is like `tobinary`, but
when its result is output by jq it will be represented as an
array of byte values (integers between 0 and 255, inclusive).
- title: "`tobinary_utf8`"
body: |
The `tobinary_utf8` function is like `tobinary`, but when
its result is output by jq it will be converted to UTF-8,
with bad characters replaced.
- title: "`tobinary(bytes)`"
body: |
This function constructs a binary string value like
`tobinary` but consisting of the byte values output by
`bytes`.
- title: "`type`"
body: |
The `type` function returns the type of its argument as a
string, which is one of null, boolean, number, string, array
or object.
- title: "`stringtype`"
body: |
Strings can be UTF-8 strings or binary strings. The
`stringtype` builtin outputs `"UTF-8"` or `"binary"` when
given a string as input.
examples:
- program: 'map(type)'
input: '[0, false, [], {}, null, "hello"]'
Expand Down Expand Up @@ -2038,7 +2083,9 @@ sections:
* `@base64d`:
The inverse of `@base64`, input is decoded as specified by RFC 4648.
Note\: If the decoded string is not UTF-8, the results are undefined.
The result will be a binary string as if `tobinary_utf8`
was used, meaning that on output bad characters will be
replaced.
This syntax can be combined with string interpolation in a
useful way. You can follow a `@foo` token with a string
Expand Down
23 changes: 22 additions & 1 deletion jq.1.prebuilt

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

168 changes: 83 additions & 85 deletions src/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,14 @@ static jv f_plus(jq_state *jq, jv input, jv a, jv b) {
return jv_array_concat(a, b);
} else if (jv_get_kind(a) == JV_KIND_OBJECT && jv_get_kind(b) == JV_KIND_OBJECT) {
return jv_object_merge(a, b);
} else if (jv_get_kind(a) == JV_KIND_STRING &&
jv_get_string_kind(a) != JV_STRING_KIND_UTF8 &&
jv_get_kind(b) == JV_KIND_NUMBER) {
int c = jv_number_value(b);
if (c < 0 || c > 255)
return type_error2(a, b, "cannot be added because the latter is not a valid byte value");
unsigned char uc = c;
return jv_binary_append_buf(a, &uc, 1);
} else {
return type_error2(a, b, "cannot be added");
}
Expand Down Expand Up @@ -511,7 +519,12 @@ static jv f_length(jq_state *jq, jv input) {

static jv f_tostring(jq_state *jq, jv input) {
if (jv_get_kind(input) == JV_KIND_STRING) {
return input;
switch (jv_get_string_kind(input)) {
case JV_STRING_KIND_UTF8:
return input;
default:
return jv_string_from_binary(input);
}
} else {
return jv_dump_string(input, 0);
}
Expand All @@ -525,24 +538,6 @@ static jv f_utf8bytelength(jq_state *jq, jv input) {

#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

static const unsigned char BASE64_ENCODE_TABLE[64 + 1] = CHARS_ALPHANUM "+/";
static const unsigned char BASE64_INVALID_ENTRY = 0xFF;
static const unsigned char BASE64_DECODE_TABLE[255] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62, // +
0xFF, 0xFF, 0xFF,
63, // /
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // 0-9
0xFF, 0xFF, 0xFF,
99, // =
0xFF, 0xFF, 0xFF,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // A-Z
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // a-z
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};


static jv escape_string(jv input, const char* escapings) {

assert(jv_get_kind(input) == JV_KIND_STRING);
Expand All @@ -560,7 +555,7 @@ static jv escape_string(jv input, const char* escapings) {
const char* i = jv_string_value(input);
const char* end = i + jv_string_length_bytes(jv_copy(input));
const char* cstart;
int c = 0;
uint32_t c = 0;
while ((i = jvp_utf8_next((cstart = i), end, &c))) {
if (c < 128 && lookup[c]) {
ret = jv_string_append_str(ret, lookup[c]);
Expand Down Expand Up @@ -693,70 +688,10 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
return line;
} else if (!strcmp(fmt_s, "base64")) {
jv_free(fmt);
input = f_tostring(jq, input);
jv line = jv_string("");
const unsigned char* data = (const unsigned char*)jv_string_value(input);
int len = jv_string_length_bytes(jv_copy(input));
for (int i=0; i<len; i+=3) {
uint32_t code = 0;
int n = len - i >= 3 ? 3 : len-i;
for (int j=0; j<3; j++) {
code <<= 8;
code |= j < n ? (unsigned)data[i+j] : 0;
}
char buf[4];
for (int j=0; j<4; j++) {
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
}
if (n < 3) buf[3] = '=';
if (n < 2) buf[2] = '=';
line = jv_string_append_buf(line, buf, sizeof(buf));
}
jv_free(input);
return line;
return jv_binary_to_base64(f_tostring(jq, input));
} else if (!strcmp(fmt_s, "base64d")) {
jv_free(fmt);
input = f_tostring(jq, input);
const unsigned char* data = (const unsigned char*)jv_string_value(input);
int len = jv_string_length_bytes(jv_copy(input));
size_t decoded_len = (3 * len) / 4; // 3 usable bytes for every 4 bytes of input
char *result = jv_mem_calloc(decoded_len, sizeof(char));
memset(result, 0, decoded_len * sizeof(char));
uint32_t ri = 0;
int input_bytes_read=0;
uint32_t code = 0;
for (int i=0; i<len && data[i] != '='; i++) {
if (BASE64_DECODE_TABLE[data[i]] == BASE64_INVALID_ENTRY) {
free(result);
return type_error(input, "is not valid base64 data");
}

code <<= 6;
code |= BASE64_DECODE_TABLE[data[i]];
input_bytes_read++;

if (input_bytes_read == 4) {
result[ri++] = (code >> 16) & 0xFF;
result[ri++] = (code >> 8) & 0xFF;
result[ri++] = code & 0xFF;
input_bytes_read = 0;
code = 0;
}
}
if (input_bytes_read == 3) {
result[ri++] = (code >> 10) & 0xFF;
result[ri++] = (code >> 2) & 0xFF;
} else if (input_bytes_read == 2) {
result[ri++] = (code >> 4) & 0xFF;
} else if (input_bytes_read == 1) {
free(result);
return type_error(input, "trailing base64 byte found");
}

jv line = jv_string_sized(result, ri);
jv_free(input);
free(result);
return line;
return jv_binary_from_base64(f_tostring(jq, input));
} else {
jv_free(input);
return jv_invalid_with_msg(jv_string_concat(fmt, jv_string(" is not a valid format")));
Expand Down Expand Up @@ -1603,13 +1538,24 @@ static jv f_strftime(jq_state *jq, jv a, jv b) {

const char *fmt = jv_string_value(b);
size_t alloced = strlen(fmt) + 100;
char *buf = alloca(alloced);
char *buf;
if (alloced > 2048)
buf = jv_mem_alloc(alloced);
else
buf = alloca(alloced);
size_t n = strftime(buf, alloced, fmt, &tm);
jv_free(b);
/* POSIX doesn't provide errno values for strftime() failures; weird */
if (n == 0 || n > alloced)
if (n == 0 || n > alloced) {
if (alloced > 2048)
jv_mem_free(buf);
return jv_invalid_with_msg(jv_string("strftime/1: unknown system failure"));
return jv_string(buf);
}
if (alloced < 2048)
return jv_string(buf);
b = jv_string(buf);
jv_mem_free(buf);
return b;
}
#else
static jv f_strftime(jq_state *jq, jv a, jv b) {
Expand Down Expand Up @@ -1678,6 +1624,54 @@ static jv f_current_line(jq_state *jq, jv a) {
return jq_util_input_get_current_line(jq);
}

/*
 * tobinary/0: convert the input `a` into a binary string value.
 *
 *  - A string is re-tagged as JV_STRING_KIND_BINARY and returned.
 *  - An array must contain only integer byte values (0..255, inclusive);
 *    the bytes are gathered into a buffer and handed to jv_binary_sized().
 *  - Any other kind produces an error.
 *
 * Takes ownership of `a`.  Returns the binary string, or an invalid value
 * carrying an error message.
 */
static jv f_tobinary(jq_state *jq, jv a) {
  switch (jv_get_kind(a)) {
  case JV_KIND_STRING:
    a.subkind = JV_STRING_KIND_BINARY;
    return a;
  case JV_KIND_ARRAY: {
    /* Braces are required: before C23 a declaration may not directly follow a case label. */
    int len = jv_array_length(jv_copy(a));
    /* Never request zero bytes, so that an empty array does not depend on
     * how the allocator treats a zero-sized request. */
    unsigned char *b = jv_mem_alloc(len > 0 ? len : 1);
    jv_array_foreach(a, i, x) {
      /* -1 is simply an out-of-range marker for non-number elements. */
      double d = jv_get_kind(x) == JV_KIND_NUMBER ? jv_number_value(x) : -1;

      /* Written so that NaN fails the range test; the integer test only
       * runs once `d` is known to be within 0..255, so the cast is safe. */
      if (!(d >= 0 && d <= 255) || d != (int)d) {
        char errbuf[15];

        /* jv_dump_string_trunc() consumes `x`, so no extra copy is taken. */
        jv err = jv_invalid_with_msg(jv_string_fmt("Not a byte value at array index %d: %s", i,
                                                   jv_dump_string_trunc(x, errbuf, sizeof(errbuf))));
        jv_mem_free(b);
        jv_free(a);
        return err;
      }
      b[i] = (unsigned char)d;
      /* Each element yielded by jv_array_foreach is owned by the loop body. */
      jv_free(x);
    }
    jv_free(a);
    a = jv_binary_sized(b, len);
    jv_mem_free(b);
    return a;
  }
  default:
    return ret_error(a, jv_string("Only strings and arrays of byte values can be converted to binary"));
  }
}

/*
 * tobinary_bytearray/0: like tobinary/0, but the resulting binary string is
 * tagged JV_STRING_KIND_BINARY_BYTEARRAY.
 *
 * Takes ownership of `a`.  If f_tobinary() did not produce a string (i.e. it
 * returned an error), that value is passed through untouched rather than
 * having a string subkind written onto it.
 */
static jv f_tobinary_bytearray(jq_state *jq, jv a) {
  a = f_tobinary(jq, a);
  if (jv_get_kind(a) == JV_KIND_STRING) {
    a.subkind = JV_STRING_KIND_BINARY_BYTEARRAY;
  }
  return a;
}

/*
 * tobinary_utf8/0: like tobinary/0, but the resulting binary string is
 * tagged JV_STRING_KIND_BINARY_UTF8.
 *
 * Takes ownership of `a`.  If f_tobinary() did not produce a string (i.e. it
 * returned an error), that value is passed through untouched rather than
 * having a string subkind written onto it.
 */
static jv f_tobinary_utf8(jq_state *jq, jv a) {
  a = f_tobinary(jq, a);
  if (jv_get_kind(a) == JV_KIND_STRING) {
    a.subkind = JV_STRING_KIND_BINARY_UTF8;
  }
  return a;
}

/*
 * stringtype/0: report which kind of string the input is.
 *
 * For a string input, returns a new string holding the name that
 * jv_string_kind_name() gives for the input's string kind.  For any other
 * input, returns a type error.
 */
static jv f_stringtype(jq_state *jq, jv a) {
  if (jv_get_kind(a) == JV_KIND_STRING) {
    const char *kind_name = jv_string_kind_name(jv_get_string_kind(a));
    /* Build the result before releasing the input. */
    jv result = jv_string(kind_name);
    jv_free(a);
    return result;
  }
  return type_error(a, "is not a string");
}

#define LIBM_DD(name) \
{(cfunction_ptr)f_ ## name, #name, 1},
#define LIBM_DD_NO(name)
Expand Down Expand Up @@ -1769,6 +1763,10 @@ static const struct cfunction function_list[] = {
{(cfunction_ptr)f_now, "now", 1},
{(cfunction_ptr)f_current_filename, "input_filename", 1},
{(cfunction_ptr)f_current_line, "input_line_number", 1},
{(cfunction_ptr)f_tobinary, "tobinary", 1},
{(cfunction_ptr)f_tobinary_bytearray, "tobinary_bytearray", 1},
{(cfunction_ptr)f_tobinary_utf8, "tobinary_utf8", 1},
{(cfunction_ptr)f_stringtype, "stringtype", 1},
};
#undef LIBM_DDDD_NO
#undef LIBM_DDD_NO
Expand Down
2 changes: 2 additions & 0 deletions src/builtin.jq
Original file line number Diff line number Diff line change
Expand Up @@ -281,3 +281,5 @@ def JOIN($idx; stream; idx_expr; join_expr):
stream | [., $idx[idx_expr]] | join_expr;
def IN(s): any(s == .; .);
def IN(src; s): any(src == s; .);

# Build a binary string from the outputs of `bytes`: start from an empty
# binary string and add each output to it in turn.
def tobinary(bytes):
  reduce bytes as $b ("" | tobinary; . + $b);
Loading

0 comments on commit 7db1557

Please sign in to comment.