Skip to content

Commit

Permalink
Fix unicode string indexing bug in RegExpExec
Browse files Browse the repository at this point in the history
Signed-off-by: Seonghyun Kim <[email protected]>
  • Loading branch information
ksh8281 committed Jul 22, 2024
1 parent 098222a commit 34f7b81
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 19 deletions.
4 changes: 2 additions & 2 deletions src/builtins/BuiltinRegExp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ static Value builtinRegExpExec(ExecutionState& state, Value thisValue, size_t ar
if (regexp->matchNonGlobally(state, str, result, false, lastIndex)) {
int e = result.m_matchResults[0][0].m_end;
if (option & RegExpObject::Option::Unicode) {
char16_t utfRes = str->charAt(e);
char16_t utfRes = (static_cast<size_t>(e) == str->length()) ? 0 : str->charAt(e);
const char* buf = reinterpret_cast<const char*>(&utfRes);
size_t len = strnlen(buf, 2);
size_t eUTF = str->find(buf, len, 0);
size_t eUTF = len == 0 ? str->length() : (str->find(buf, len, 0));
if (eUTF >= str->length()) {
e = str->length();
} else if ((int)eUTF > e || e == (int)str->length()) {
Expand Down
58 changes: 42 additions & 16 deletions src/runtime/String.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1030,37 +1030,63 @@ String* String::substring(size_t from, size_t to)
return builder.finalize();
}

// https://www.ecma-international.org/ecma-262/6.0/#sec-advancestringindex
// https://tc39.es/ecma262/multipage/text-processing.html#sec-advancestringindex
uint64_t String::advanceStringIndex(uint64_t index, bool unicode)
{
ASSERT(isString());
// Assert: index ≤ 2**53 - 1.
ASSERT(index <= (1ULL << 53) - 1);

// If unicode is false, return index + 1.
if (!unicode) {
return index + 1;
}
// Let length be the number of code units in S.

// Let length be the length of S.
size_t length = this->length();
// If index + 1 >= legnth, return index + 1.

// If index + 1 ≥ length, return index + 1.
if (index + 1 >= length) {
return index + 1;
}

// Let first be the code unit value at index `index` in S.
char16_t first = this->charAt(index);
// If first < 0xD800 or first > 0xDBFF, return index + 1.
if (first < 0xD800 || first > 0xDBFF) {
return index + 1;
// Let cp be CodePointAt(S, index).
auto cp = codePointAt(index);
// Return index + cp.[[CodeUnitCount]].
return index + cp.codeUnitCount;
}

// https://tc39.es/ecma262/multipage/ecmascript-language-source-code.html#sec-codepointat
String::CodePointAtResult String::codePointAt(size_t position)
{
auto bad = bufferAccessData();
// Let size be the length of string.
const auto& size = bad.length;
// Assert: position ≥ 0 and position < size.
// Let first be the code unit at index position within string.
char16_t first = bad.charAt(position);
// Let cp be the code point whose numeric value is the numeric value of first.
char32_t cp = first;
// If first is neither a leading surrogate nor a trailing surrogate, then
if (!U16_IS_LEAD(cp) && !U16_IS_TRAIL(cp)) {
// Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }.
return { cp, 1, false };
}
// Let second be the code unit value at index `index + 1` in S.
char16_t second = this->charAt(index + 1);
// If second < 0xDC00 or second > 0xDFFF, return index + 1.
if (second < 0xDC00 || second > 0xDFFF) {
return index + 1;
// If first is a trailing surrogate or position + 1 = size, then
if (U16_IS_TRAIL(first) || (position + 1) == size) {
// Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
return { cp, 1, true };
}
// Let second be the code unit at index position + 1 within string.
char16_t second = bad.charAt(position + 1);
// If second is not a trailing surrogate, then
if (!U16_IS_TRAIL(second)) {
// Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
return { cp, 1, true };
}
// Return index + 2.
return index + 2;
// Set cp to UTF16SurrogatePairToCodePoint(first, second).
cp = U16_GET_SUPPLEMENTARY(first, second);
// Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }.
return { cp, 2, false };
}

bool String::isAllSpecialCharacters(bool (*fn)(char))
Expand Down
7 changes: 7 additions & 0 deletions src/runtime/String.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,13 @@ class String : public PointerValue {

uint64_t advanceStringIndex(uint64_t index, bool unicode);

// Result record returned by String::codePointAt(); its three fields mirror the
// [[CodePoint]], [[CodeUnitCount]] and [[IsUnpairedSurrogate]] fields of the
// Record described by the ECMAScript CodePointAt abstract operation.
struct CodePointAtResult {
// Decoded code point. When the code unit at the requested position is an
// unpaired surrogate, this holds that surrogate code unit's own value.
char32_t codePoint;
// Number of UTF-16 code units the code point occupies: 2 for a valid
// lead/trail surrogate pair, 1 in every other case.
unsigned codeUnitCount;
// True when the code unit at the position is a surrogate that does not form
// a valid pair (a trail surrogate, a lead surrogate at the end of the string,
// or a lead surrogate not followed by a trail surrogate).
bool isUnpairedSurrogate;
};
CodePointAtResult codePointAt(size_t position);

bool isAllSpecialCharacters(bool (*fn)(char));

enum StringTrimWhere : unsigned {
Expand Down
1 change: 0 additions & 1 deletion tools/test/test262/excludelist.orig.xml
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,6 @@
<test id="built-ins/Object/prototype/toString/symbol-tag-set-builtin"><reason>TODO</reason></test>
<test id="built-ins/Object/prototype/toString/symbol-tag-string-builtin"><reason>TODO</reason></test>
<test id="built-ins/Object/seal/proxy-with-defineProperty-handler"><reason>TODO</reason></test>
<test id="built-ins/RegExp/prototype/Symbol.split/u-lastindex-adv-thru-match"><reason>TODO</reason></test>
<test id="built-ins/RegExp/prototype/source/value-line-terminator"><reason>TODO</reason></test>
<test id="built-ins/RegExp/prototype/unicodeSets/cross-realm"><reason>TODO</reason></test>
<test id="built-ins/RegExp/prototype/unicodeSets/length"><reason>TODO</reason></test>
Expand Down

0 comments on commit 34f7b81

Please sign in to comment.