Skip to content

Commit

Permalink
WIP: Experiment with binary strings
Browse files Browse the repository at this point in the history
  • Loading branch information
nicowilliams committed Jul 21, 2023
1 parent 99a77f7 commit fe7661f
Show file tree
Hide file tree
Showing 10 changed files with 438 additions and 145 deletions.
140 changes: 55 additions & 85 deletions src/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,15 @@ static jv f_length(jq_state *jq, jv input) {

static jv f_tostring(jq_state *jq, jv input) {
if (jv_get_kind(input) == JV_KIND_STRING) {
return input;
switch (jv_get_string_kind(input)) {
case JV_STRING_KIND_UTF8:
return input;
default:
jv o = jv_string_sized(jv_string_value(input),
jv_string_length_bytes(jv_copy(input)));
jv_free(input);
return o;
}
} else {
return jv_dump_string(input, 0);
}
Expand All @@ -525,24 +533,6 @@ static jv f_utf8bytelength(jq_state *jq, jv input) {

#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

/* Maps a 6-bit value (0..63) to the corresponding base64 alphabet character. */
static const unsigned char BASE64_ENCODE_TABLE[64 + 1] = CHARS_ALPHANUM "+/";

/* Value stored in BASE64_DECODE_TABLE for bytes that are not base64 characters. */
static const unsigned char BASE64_INVALID_ENTRY = 0xFF;

/*
 * Maps an input byte to its 6-bit base64 value, or BASE64_INVALID_ENTRY when
 * the byte is not part of the alphabet.  '=' maps to 99 so that padding is
 * not reported as invalid.
 *
 * The table is indexed with an `unsigned char`, so it must have 256 entries:
 * with only 255, looking up the byte 0xFF would read one element past the
 * end of the array.  Laid out as 16 rows of 16 so each row starts at a
 * multiple of 0x10.
 */
static const unsigned char BASE64_DECODE_TABLE[256] = {
  /* 0x00 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0x10 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0x20 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
             62,                                                   /* + */
             0xFF, 0xFF, 0xFF,
             63,                                                   /* / */
  /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,               /* 0-9 */
             0xFF, 0xFF, 0xFF,
             99,                                                   /* = */
             0xFF, 0xFF,
  /* 0x40 */ 0xFF,
             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,     /* A-O */
  /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,           /* P-Z */
             0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0x60 */ 0xFF,
             26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, /* a-o */
  /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,           /* p-z */
             0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0x80 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0x90 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xA0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xB0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xC0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xD0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xE0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xF0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};


static jv escape_string(jv input, const char* escapings) {

assert(jv_get_kind(input) == JV_KIND_STRING);
Expand All @@ -560,7 +550,7 @@ static jv escape_string(jv input, const char* escapings) {
const char* i = jv_string_value(input);
const char* end = i + jv_string_length_bytes(jv_copy(input));
const char* cstart;
int c = 0;
uint32_t c = 0;
while ((i = jvp_utf8_next((cstart = i), end, &c))) {
if (c < 128 && lookup[c]) {
ret = jv_string_append_str(ret, lookup[c]);
Expand Down Expand Up @@ -693,70 +683,10 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
return line;
} else if (!strcmp(fmt_s, "base64")) {
jv_free(fmt);
input = f_tostring(jq, input);
jv line = jv_string("");
const unsigned char* data = (const unsigned char*)jv_string_value(input);
int len = jv_string_length_bytes(jv_copy(input));
for (int i=0; i<len; i+=3) {
uint32_t code = 0;
int n = len - i >= 3 ? 3 : len-i;
for (int j=0; j<3; j++) {
code <<= 8;
code |= j < n ? (unsigned)data[i+j] : 0;
}
char buf[4];
for (int j=0; j<4; j++) {
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
}
if (n < 3) buf[3] = '=';
if (n < 2) buf[2] = '=';
line = jv_string_append_buf(line, buf, sizeof(buf));
}
jv_free(input);
return line;
return jv_binary_to_base64(f_tostring(jq, input));
} else if (!strcmp(fmt_s, "base64d")) {
jv_free(fmt);
input = f_tostring(jq, input);
const unsigned char* data = (const unsigned char*)jv_string_value(input);
int len = jv_string_length_bytes(jv_copy(input));
size_t decoded_len = (3 * len) / 4; // 3 usable bytes for every 4 bytes of input
char *result = jv_mem_calloc(decoded_len, sizeof(char));
memset(result, 0, decoded_len * sizeof(char));
uint32_t ri = 0;
int input_bytes_read=0;
uint32_t code = 0;
for (int i=0; i<len && data[i] != '='; i++) {
if (BASE64_DECODE_TABLE[data[i]] == BASE64_INVALID_ENTRY) {
free(result);
return type_error(input, "is not valid base64 data");
}

code <<= 6;
code |= BASE64_DECODE_TABLE[data[i]];
input_bytes_read++;

if (input_bytes_read == 4) {
result[ri++] = (code >> 16) & 0xFF;
result[ri++] = (code >> 8) & 0xFF;
result[ri++] = code & 0xFF;
input_bytes_read = 0;
code = 0;
}
}
if (input_bytes_read == 3) {
result[ri++] = (code >> 10) & 0xFF;
result[ri++] = (code >> 2) & 0xFF;
} else if (input_bytes_read == 2) {
result[ri++] = (code >> 4) & 0xFF;
} else if (input_bytes_read == 1) {
free(result);
return type_error(input, "trailing base64 byte found");
}

jv line = jv_string_sized(result, ri);
jv_free(input);
free(result);
return line;
return jv_binary_from_base64(f_tostring(jq, input));
} else {
jv_free(input);
return jv_invalid_with_msg(jv_string_concat(fmt, jv_string(" is not a valid format")));
Expand Down Expand Up @@ -1603,13 +1533,24 @@ static jv f_strftime(jq_state *jq, jv a, jv b) {

const char *fmt = jv_string_value(b);
size_t alloced = strlen(fmt) + 100;
char *buf = alloca(alloced);
char *buf;
if (alloced > 2048)
buf = jv_mem_alloc(alloced);
else
buf = alloca(alloced);
size_t n = strftime(buf, alloced, fmt, &tm);
jv_free(b);
/* POSIX doesn't provide errno values for strftime() failures; weird */
if (n == 0 || n > alloced)
if (n == 0 || n > alloced) {
if (alloced > 2048)
jv_mem_free(buf);
return jv_invalid_with_msg(jv_string("strftime/1: unknown system failure"));
return jv_string(buf);
}
if (alloced > 2048)
return jv_string(buf);
b = jv_string(buf);
jv_mem_free(buf);
return b;
}
#else
static jv f_strftime(jq_state *jq, jv a, jv b) {
Expand Down Expand Up @@ -1678,6 +1619,34 @@ static jv f_current_line(jq_state *jq, jv a) {
return jq_util_input_get_current_line(jq);
}

/*
 * tobinary: convert the input into a binary string.
 *
 *  - A string is returned as-is with its sub-kind set to
 *    JV_STRING_KIND_BINARY.
 *  - An array is converted element by element into a byte buffer that is
 *    passed to jv_binary_sized().  Every element must be a number whose
 *    truncated value lies in 0..255; otherwise an error naming the offending
 *    index is returned.
 *  - Any other kind is rejected through ret_error().
 *
 * `a' is either returned, freed here, or handed over to ret_error().
 */
static jv f_tobinary(jq_state *jq, jv a) {
  switch (jv_get_kind(a)) {
  case JV_KIND_STRING:
    a.subkind = JV_STRING_KIND_BINARY;
    return a;
  case JV_KIND_ARRAY: {
    /* Braces are required: a declaration may not directly follow a label. */
    int len = jv_array_length(jv_copy(a));
    unsigned char *b = jv_mem_alloc(len);
    jv_array_foreach(a, i, x) {
      int is_number = jv_get_kind(x) == JV_KIND_NUMBER;
      double d = is_number ? jv_number_value(x) : 0;
      /*
       * Converting a double to unsigned char is only defined when the
       * truncated value fits, so reject everything outside (-1, 256).
       * Written as a negated conjunction so that NaN is rejected too.
       */
      if (!is_number || !(d > -1.0 && d < 256.0)) {
        char errbuf[15];
        /* jv_dump_string_trunc() takes over `x', so no extra copy is made. */
        jv msg = jv_string_fmt("Not a byte value at array index %d: %s", i,
                               jv_dump_string_trunc(x, errbuf, sizeof(errbuf)));
        jv_mem_free(b);
        jv_free(a);
        return jv_invalid_with_msg(msg);
      }
      /* XXX No validation that `x' is an integer; fractions are truncated. */
      b[i] = (unsigned char)d;
      jv_free(x);
    }
    jv_free(a);
    a = jv_binary_sized(b, len);
    jv_mem_free(b);
    return a;
  }
  default:
    return ret_error(a, jv_string("Only strings and arrays of byte values can be converted to binary"));
  }
}

#define LIBM_DD(name) \
{(cfunction_ptr)f_ ## name, #name, 1},
#define LIBM_DD_NO(name)
Expand Down Expand Up @@ -1769,6 +1738,7 @@ static const struct cfunction function_list[] = {
{(cfunction_ptr)f_now, "now", 1},
{(cfunction_ptr)f_current_filename, "input_filename", 1},
{(cfunction_ptr)f_current_line, "input_line_number", 1},
{(cfunction_ptr)f_tobinary, "tobinary", 1},
};
#undef LIBM_DDDD_NO
#undef LIBM_DDD_NO
Expand Down
97 changes: 81 additions & 16 deletions src/execute.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "jv_alloc.h"
#include "jq_parser.h"
#include "jv_unicode.h"
#include "locfile.h"
#include "jv.h"
#include "jq.h"
Expand Down Expand Up @@ -673,20 +674,45 @@ jv jq_next(jq_state *jq) {
case INDEX_OPT: {
jv t = stack_pop(jq);
jv k = stack_pop(jq);
// detect invalid path expression like path(reverse | .a)
if (!path_intact(jq, jv_copy(t))) {
char keybuf[15];
char objbuf[30];
jv msg = jv_string_fmt(
"Invalid path expression near attempt to access element %s of %s",
jv_dump_string_trunc(k, keybuf, sizeof(keybuf)),
jv_dump_string_trunc(t, objbuf, sizeof(objbuf)));
set_error(jq, jv_invalid_with_msg(msg));
goto do_backtrack;
jv v;
if (jv_get_kind(t) == JV_KIND_STRING && jv_get_kind(k) == JV_KIND_NUMBER) {
switch (jv_get_string_kind(t)) {
case JV_STRING_KIND_UTF8:
v = jv_string_append_codepoint(jv_string(""), jv_string_index(t, jv_number_value(k)));
break;
case JV_STRING_KIND_BINARY:
const char *s = jv_string_value(t);
int len = jv_string_length_bytes(jv_copy(t));
int idx = jv_number_value(k);

if (idx < 0)
idx += idx;
if (idx < 0 || idx >= len)
goto do_backtrack;
v = jv_number(((unsigned char *)s)[idx]);
jv_free(t);
break;
default:
set_error(jq, jv_invalid_with_msg(jv_string("Internal error: unknown string sub-type")));
goto do_backtrack;
}
} else {
// detect invalid path expression like path(reverse | .a)
if (!path_intact(jq, jv_copy(t))) {
char keybuf[15];
char objbuf[30];
jv msg = jv_string_fmt(
"Invalid path expression near attempt to access element %s of %s",
jv_dump_string_trunc(k, keybuf, sizeof(keybuf)),
jv_dump_string_trunc(t, objbuf, sizeof(objbuf)));
set_error(jq, jv_invalid_with_msg(msg));
goto do_backtrack;
}
v = jv_get(t, jv_copy(k));
if (jv_is_valid(v))
path_append(jq, k, jv_copy(v));
}
jv v = jv_get(t, jv_copy(k));
if (jv_is_valid(v)) {
path_append(jq, k, jv_copy(v));
stack_push(jq, v);
} else {
jv_free(k);
Expand Down Expand Up @@ -721,7 +747,8 @@ jv jq_next(jq_state *jq) {
case EACH_OPT: {
jv container = stack_pop(jq);
// detect invalid path expression like path(reverse | .[])
if (!path_intact(jq, jv_copy(container))) {
if (jv_get_kind(container) != JV_KIND_STRING &&
!path_intact(jq, jv_copy(container))) {
char errbuf[30];
jv msg = jv_string_fmt(
"Invalid path expression near attempt to iterate through %s",
Expand Down Expand Up @@ -758,6 +785,42 @@ jv jq_next(jq_state *jq) {
key = jv_object_iter_key(container, idx);
value = jv_object_iter_value(container, idx);
}
} else if (jv_get_kind(container) == JV_KIND_STRING) {
switch (jv_get_string_kind(container)) {
case JV_STRING_KIND_UTF8: {
const char *s = jv_string_value(container);
const char *next = s;
int len = jv_string_length_bytes(jv_copy(container));
const char *end = s + len;
int c;
if (opcode == EACH || opcode == EACH_OPT) {
idx = 0;
} else {
next = s + idx;
}
keep_going = idx < len;
next = jvp_utf8_next(next, end, &c);
idx = next - s;
value = jv_string_append_codepoint(jv_string(""), c);
is_last = jvp_utf8_next(next, end, &c) == 0;
break;
}
case JV_STRING_KIND_BINARY:
const unsigned char *s = (const unsigned char *)jv_string_value(container);
int len = jv_string_length_bytes(jv_copy(container));
if (opcode == EACH || opcode == EACH_OPT) {
idx = 0;
} else {
idx++;
}
keep_going = idx < len;
value = jv_string_append_codepoint(jv_string(""), s[idx]);
is_last = idx == len -1;
break;
default:
set_error(jq, jv_invalid_with_msg(jv_string("Internal error: unknown string sub-type")));
goto do_backtrack;
}
} else {
assert(opcode == EACH || opcode == EACH_OPT);
if (opcode == EACH) {
Expand All @@ -777,15 +840,17 @@ jv jq_next(jq_state *jq) {
goto do_backtrack;
} else if (is_last) {
// we don't need to make a backtrack point
jv_free(container);
path_append(jq, key, jv_copy(value));
if (jv_get_kind(container) != JV_KIND_STRING)
path_append(jq, key, jv_copy(value));
stack_push(jq, value);
jv_free(container);
} else {
struct stack_pos spos = stack_get_pos(jq);
stack_push(jq, container);
stack_push(jq, jv_number(idx));
stack_save(jq, pc - 1, spos);
path_append(jq, key, jv_copy(value));
if (jv_get_kind(container) != JV_KIND_STRING)
path_append(jq, key, jv_copy(value));
stack_push(jq, value);
}
break;
Expand Down
8 changes: 7 additions & 1 deletion src/jq.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,15 @@ jv jq_get_attr(jq_state *, jv);
*/
typedef struct jq_util_input_state jq_util_input_state;
typedef void (*jq_util_msg_cb)(void *, const char *);
/*
 * Parser options taken by jq_util_input_set_parser().
 * Each enumerator occupies its own bit (values 1 and 2).
 */
typedef enum {
  JQ_UTIL_PARSE_SLURP  = 1 << 0,
  JQ_UTIL_PARSE_BINARY = 1 << 1,
} jq_util_parser_enum;



jq_util_input_state *jq_util_input_init(jq_util_msg_cb, void *);
void jq_util_input_set_parser(jq_util_input_state *, jv_parser *, int);
void jq_util_input_set_parser(jq_util_input_state *, jv_parser *, jq_util_parser_enum);
void jq_util_input_free(jq_util_input_state **);
void jq_util_input_add_input(jq_util_input_state *, const char *);
int jq_util_input_errors(jq_util_input_state *);
Expand Down
Loading

0 comments on commit fe7661f

Please sign in to comment.