Skip to content
This repository has been archived by the owner on Oct 12, 2022. It is now read-only.

Make encode reusable in Phobos #2404

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 106 additions & 80 deletions src/core/internal/utf.d
Original file line number Diff line number Diff line change
Expand Up @@ -425,61 +425,98 @@ dchar decode(in dchar[] s, ref size_t idx)
return c; // dummy return
}


/* =================== Encode ======================= */

/*******************************
* Encodes character c and appends it to array s[].
/**
* Encodes `c` into the static array `buf`.
*
* Params:
* buf = destination of encoded character
* c = character to encode
*
* Returns:
* The length of the encoded character (a number between `1` and `4` for
* `char[4]` buffers and a number between `1` and `2` for `wchar[2]` buffers)
* or `0` in case of failure.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea. Let's define this function for all inputs (no contracts, no assert(isValidDChar)) and have it return 0 upon bad dchar. Then higher-level functions can throw exceptions, use replacement char etc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additionally, instead of returning a length, return a slice to the buffer, or null on bad dchar, because the next thing people will need to do is slice that buffer.
Returning null and not an empty slice is important, because otherwise if (auto s = encode(...)) will always be true.
Lastly, since this piece of code is highly performance sensitive, did you inspect the assembly to make sure out does not get default-initialized on function entry ?

Copy link
Contributor

@jacob-carlborg jacob-carlborg Jan 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it better to pass buf as a regular char[] to push the allocation choice to the caller? If the caller wants to pass a static array, the array can be sliced to be able to pass it as char[].

*/
@safe pure nothrow
void encode(ref char[] s, dchar c)
in
@nogc nothrow pure @safe
size_t encode(out char[4] buf, dchar c)
in
{
assert(isValidDchar(c));
}
do
{
if (c <= 0x7F)
{
assert(isValidDchar(c));
buf[0] = cast(char) c;
return 1;
}
do
else if (c <= 0x7FF)
{
char[] r = s;

if (c <= 0x7F)
{
r ~= cast(char) c;
}
else
{
char[4] buf;
uint L;
buf[0] = cast(char)(0xC0 | (c >> 6));
buf[1] = cast(char)(0x80 | (c & 0x3F));
return 2;
}
else if (c <= 0xFFFF)
{
buf[0] = cast(char)(0xE0 | (c >> 12));
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
buf[2] = cast(char)(0x80 | (c & 0x3F));
return 3;
}
else if (c <= 0x10FFFF)
{
buf[0] = cast(char)(0xF0 | (c >> 18));
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
buf[3] = cast(char)(0x80 | (c & 0x3F));
return 4;
}
return 0;
}

if (c <= 0x7FF)
{
buf[0] = cast(char)(0xC0 | (c >> 6));
buf[1] = cast(char)(0x80 | (c & 0x3F));
L = 2;
}
else if (c <= 0xFFFF)
{
buf[0] = cast(char)(0xE0 | (c >> 12));
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
buf[2] = cast(char)(0x80 | (c & 0x3F));
L = 3;
}
else if (c <= 0x10FFFF)
{
buf[0] = cast(char)(0xF0 | (c >> 18));
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
buf[3] = cast(char)(0x80 | (c & 0x3F));
L = 4;
}
else
{
assert(0);
}
r ~= buf[0 .. L];
}
s = r;
/// ditto
@nogc nothrow pure @safe
size_t encode(out wchar[2] buf, dchar c)
in
{
assert(isValidDchar(c));
}
do
{
if (c <= 0xFFFF)
{
buf[0] = cast(wchar) c;
return 1;
}
else if (c <= 0x10FFFF)
{
buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
return 2;
}
return 0;
}

/**
* Encodes character c and appends it to array s[].
*/
nothrow pure @safe
void encode(ref char[] s, dchar c)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a fan of the following functions because:

  1. They are an overload of the previous encode functions, but do something fundamentally different
  2. They rely on the GC. We should be crafting functions that do not allocate, but instead send the characters to a sink or an output range.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exactly: #2404 (comment).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, is this the right order of arguments for UFCS? Would it be better to pass c first?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sink normally is the first argument, as in:

sink.put(c);

in
{
assert(isValidDchar(c));
}
do
{
char[4] buf;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

= void;

size_t L = encode(buf, c);
assert(L); // If L is 0, then encode has failed
s ~= buf[0 .. L];
}

///
unittest
{
debug(utf) printf("utf.encode.unittest\n");
Expand All @@ -499,43 +536,32 @@ unittest
assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}

/** ditto */
@safe pure nothrow
/// ditto
nothrow pure @safe
void encode(ref wchar[] s, dchar c)
in
{
assert(isValidDchar(c));
}
do
{
wchar[] r = s;

if (c <= 0xFFFF)
{
r ~= cast(wchar) c;
}
else
{
wchar[2] buf;

buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
r ~= buf;
}
s = r;
}
in
{
assert(isValidDchar(c));
}
do
{
wchar[2] buf;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

= void;

size_t L = encode(buf, c);
assert(L);
s ~= buf[0 .. L];
}

/** ditto */
@safe pure nothrow
/// ditto
nothrow pure @safe
void encode(ref dchar[] s, dchar c)
in
{
assert(isValidDchar(c));
}
do
{
s ~= c;
}
in
{
assert(isValidDchar(c));
}
do
{
s ~= c;
}

/**
Returns the code length of $(D c) in the encoding using $(D C) as a
Expand Down