[GR-34937] Adopt TruffleString in TruffleRuby and replace Rope

PullRequest: truffleruby/3432
oracle · Jul 26, 2022 · c5b753a · c5b753a
2 parents 925274f + d216490
commit c5b753a
Show file tree

Hide file tree

Showing 312 changed files with 8,759 additions and 13,592 deletions.
diff --git a/bench/micro/file/write.rb b/bench/micro/file/write.rb
@@ -10,8 +10,8 @@
 
 kilobyte = 'x' * 1024
 
-if defined?(Truffle::Ropes.flatten_rope)
-  kilobyte = Truffle::Ropes.flatten_rope(kilobyte)
+if defined?(Truffle::Debug.flatten_string)
+  kilobyte = Truffle::Debug.flatten_string(kilobyte)
 end
 
 benchmark 'core-write-kilobyte' do
@@ -20,8 +20,8 @@
 
 gigabyte = 'x' * 1024 * 1024 * 1024
 
-if defined?(Truffle::Ropes.flatten_rope)
-  gigabyte = Truffle::Ropes.flatten_rope(gigabyte)
+if defined?(Truffle::Debug.flatten_string)
+  gigabyte = Truffle::Debug.flatten_string(gigabyte)
 end
 
 benchmark 'core-write-gigabyte' do

diff --git a/bench/micro/string/flatten.rb b/bench/micro/string/flatten.rb
diff --git a/bench/micro/string/substring.rb b/bench/micro/string/substring.rb
@@ -9,7 +9,7 @@
 side = 512 * 1024
 big_string = ("a".b * side + "é".b + "z".b * side)[1...-1]
 result = big_string.byteslice(4, 8)
-# Truffle::Ropes.debug_print_rope(big_string, false)
+# Truffle::Debug.tstring_to_debug_string(big_string)
 
 benchmark "core-string-many-substrings-of-large-substring" do
   i = 0

diff --git a/doc/contributor/truffle-string.md b/doc/contributor/truffle-string.md
@@ -0,0 +1,37 @@
+# TruffleString in TruffleRuby
+
+TruffleRuby uses `TruffleString` to represent Ruby Strings, but wraps them in either a RubyString or a ImmutableRubyString object.
+
+## Encodings Compatibility
+
+The notion of encodings compatibility is mostly the same between Ruby and TruffleString but differs in one point:
+* An empty Ruby String is always considered compatible with any other Ruby String of any encoding.
+* TruffleString does not consider whether a string is empty or not, and only look at their encodings and code range.
+
+As a result, to use TruffleString equality nodes, one needs to:
+1. Compute the compatible encoding with `NegotiateCompatibleStringEncodingNode` or `Primitive.encoding_ensure_compatible_str`.
+2. Check if both sides are empty, and if so return true before using TruffleString equality nodes.
+
+`StringHelperNodes.StringEqualInternalNode` is a good example showing what is needed.
+
+An example which would throw without empty checks is comparing an empty ISO-2022-JP (a dummy, non-ascii-compatible, fixed-width encoding) string with an empty US-ASCII string:
+
+```bash
+$ jt ruby -e '"".force_encoding("ISO-2022-JP") == ""'
+the given string is not compatible to the expected encoding "ISO_2022_JP", did you forget to convert it? (java.lang.IllegalArgumentException)
+```
+
+## Logical vs Physical Byte Offsets
+
+We categorize a byte offset into a `TruffleString` as either *logical* or *physical*.
+A physical byte offset includes the offset from the `InternalByteArray` (`InternalByteArray#getOffset()`).
+A logical byte offset does not include that and is the semantic byte offset from the start of the string.
+Physical offsets are quite difficult to use and they are error-prone as they can be passed by mistake to a method taking a logical offset.
+So avoid physical offsets as much as possible, and therefore avoid `InternalByteArray#getArray()`.
+
+## Tests
+
+This is a good set of tests to run when touching String code:
+```
+jt test integration strict-encoding-checks
+```
diff --git a/lib/cext/ABI_check.txt b/lib/cext/ABI_check.txt
@@ -1 +1 @@
-7
+8
diff --git a/lib/truffle/truffle/cext.rb b/lib/truffle/truffle/cext.rb
@@ -692,10 +692,6 @@ def rb_thread_alone
     Thread.list.count == 1 ? 1 : 0
   end
 
-  def rb_intern(str)
-    str.intern
-  end
-
   def rb_int_positive_pow(a, b)
     a ** b
   end
@@ -809,14 +805,6 @@ def rb_enc_get_index(obj)
     enc
   end
 
-  def rb_intern_str(string)
-    string.intern
-  end
-
-  def rb_intern3(string, enc)
-    string.force_encoding(enc).intern
-  end
-
   def rb_str_append(str, to_append)
     Primitive.string_append(str, to_append)
   end
@@ -1766,7 +1754,7 @@ def rb_gv_get(name)
   end
 
   def rb_reg_match(re, str)
-    result = str ? Truffle::RegexpOperations.match(re, str, 0) : nil
+    result = Truffle::RegexpOperations.match(re, str, 0)
     Primitive.regexp_last_match_set(rb_get_special_vars(), result)
 
     result.begin(0) if result

diff --git a/spec/ruby/core/file/shared/fnmatch.rb b/spec/ruby/core/file/shared/fnmatch.rb
@@ -159,10 +159,10 @@
   end
 
   it "does not match leading periods in filenames with wildcards by default" do
-    File.send(@method, '*', '.profile').should == false
-    File.send(@method, '*', 'home/.profile').should == true
-    File.send(@method, '*/*', 'home/.profile').should == true
-    File.send(@method, '*/*', 'dave/.profile', File::FNM_PATHNAME).should == false
+    File.should_not.send(@method, '*', '.profile')
+    File.should.send(@method, '*', 'home/.profile')
+    File.should.send(@method, '*/*', 'home/.profile')
+    File.should_not.send(@method, '*/*', 'dave/.profile', File::FNM_PATHNAME)
   end
 
   it "matches patterns with leading periods to dotfiles by default" do

diff --git a/spec/ruby/core/regexp/shared/quote.rb b/spec/ruby/core/regexp/shared/quote.rb
@@ -17,6 +17,11 @@
     Regexp.send(@method, str).should == '\+\[\]\('
   end
 
+  it "works for broken strings" do
+    Regexp.send(@method, "a.\x85b.".force_encoding("US-ASCII")).should =="a\\.\x85b\\.".force_encoding("US-ASCII")
+    Regexp.send(@method, "a.\x80".force_encoding("UTF-8")).should == "a\\.\x80".force_encoding("UTF-8")
+  end
+
   it "sets the encoding of the result to US-ASCII if there are only US-ASCII characters present in the input String" do
     str = "abc".force_encoding("euc-jp")
     Regexp.send(@method, str).encoding.should == Encoding::US_ASCII

diff --git a/spec/ruby/core/string/capitalize_spec.rb b/spec/ruby/core/string/capitalize_spec.rb
@@ -10,6 +10,7 @@
     "hello".capitalize.should == "Hello"
     "HELLO".capitalize.should == "Hello"
     "123ABC".capitalize.should == "123abc"
+    "abcdef"[1...-1].capitalize.should == "Bcde"
   end
 
   describe "full Unicode case mapping" do
@@ -37,7 +38,7 @@
     end
 
     it "handles non-ASCII substrings properly" do
-      "garçon"[1..-1].capitalize(:ascii).should == "Arçon"
+      "garçon"[1...-1].capitalize(:ascii).should == "Arço"
     end
   end
 

diff --git a/spec/ruby/core/string/delete_prefix_spec.rb b/spec/ruby/core/string/delete_prefix_spec.rb
@@ -21,6 +21,10 @@
     r.should == s
   end
 
+  it "does not remove partial bytes, only full characters" do
+    "\xe3\x81\x82".delete_prefix("\xe3").should == "\xe3\x81\x82"
+  end
+
   it "doesn't set $~" do
     $~ = nil
 

diff --git a/spec/ruby/core/string/delete_suffix_spec.rb b/spec/ruby/core/string/delete_suffix_spec.rb
@@ -21,6 +21,10 @@
     r.should == s
   end
 
+  it "does not remove partial bytes, only full characters" do
+    "\xe3\x81\x82".delete_suffix("\x82").should == "\xe3\x81\x82"
+  end
+
   it "doesn't set $~" do
     $~ = nil
 

diff --git a/spec/ruby/core/string/downcase_spec.rb b/spec/ruby/core/string/downcase_spec.rb
@@ -27,6 +27,10 @@
     it "does not downcase non-ASCII characters" do
       "CÅR".downcase(:ascii).should == "cÅr"
     end
+
+    it "works with substrings" do
+      "prefix TÉ"[-2..-1].downcase(:ascii).should == "tÉ"
+    end
   end
 
   describe "full Unicode case mapping adapted for Turkic languages" do

diff --git a/spec/ruby/core/string/include_spec.rb b/spec/ruby/core/string/include_spec.rb
@@ -13,6 +13,20 @@
     StringSpecs::MyString.new("hello").include?(StringSpecs::MyString.new("lo")).should == true
   end
 
+  it "returns true if both strings are empty" do
+    "".should.include?("")
+    "".force_encoding("EUC-JP").should.include?("")
+    "".should.include?("".force_encoding("EUC-JP"))
+    "".force_encoding("EUC-JP").should.include?("".force_encoding("EUC-JP"))
+  end
+
+  it "returns true if the RHS is empty" do
+    "a".should.include?("")
+    "a".force_encoding("EUC-JP").should.include?("")
+    "a".should.include?("".force_encoding("EUC-JP"))
+    "a".force_encoding("EUC-JP").should.include?("".force_encoding("EUC-JP"))
+  end
+
   it "tries to convert other to string using to_str" do
     other = mock('lo')
     other.should_receive(:to_str).and_return("lo")

diff --git a/spec/ruby/core/string/inspect_spec.rb b/spec/ruby/core/string/inspect_spec.rb
@@ -19,6 +19,21 @@
     ].should be_computed_by(:inspect)
   end
 
+  it "returns a string with special characters replaced with \\<char> notation for UTF-16" do
+    pairs = [
+      ["\a", '"\\a"'],
+      ["\b", '"\\b"'],
+      ["\t", '"\\t"'],
+      ["\n", '"\\n"'],
+      ["\v", '"\\v"'],
+      ["\f", '"\\f"'],
+      ["\r", '"\\r"'],
+      ["\e", '"\\e"']
+    ].map { |str, result| [str.encode('UTF-16LE'), result] }
+
+    pairs.should be_computed_by(:inspect)
+  end
+
   it "returns a string with \" and \\ escaped with a backslash" do
     [ ["\"", '"\\""'],
       ["\\", '"\\\\"']
@@ -311,6 +326,11 @@
     "\xF0\x9F".inspect.should == '"\\xF0\\x9F"'
   end
 
+  it "works for broken US-ASCII strings" do
+    s = "©".force_encoding("US-ASCII")
+    s.inspect.should == '"\xC2\xA9"'
+  end
+
   describe "when default external is UTF-8" do
     before :each do
       @extenc, Encoding.default_external = Encoding.default_external, Encoding::UTF_8

diff --git a/spec/ruby/core/string/lstrip_spec.rb b/spec/ruby/core/string/lstrip_spec.rb
@@ -10,6 +10,14 @@
    "  hello world  ".lstrip.should == "hello world  "
    "\n\r\t\n\v\r hello world  ".lstrip.should == "hello world  "
    "hello".lstrip.should == "hello"
+   " こにちわ".lstrip.should == "こにちわ"
+  end
+
+  it "works with lazy substrings" do
+    "  hello  "[1...-1].lstrip.should == "hello "
+    "  hello world  "[1...-1].lstrip.should == "hello world "
+    "\n\r\t\n\v\r hello world  "[1...-1].lstrip.should == "hello world "
+    "   こにちわ "[1...-1].lstrip.should == "こにちわ"
   end
 
   ruby_version_is '3.0' do
@@ -27,20 +35,26 @@
     a.should == "hello  "
   end
 
+  it "returns nil if no modifications were made" do
+    a = "hello"
+    a.lstrip!.should == nil
+    a.should == "hello"
+  end
+
+  it "makes a string empty if it is only whitespace" do
+    "".lstrip!.should == nil
+    " ".lstrip.should == ""
+    "  ".lstrip.should == ""
+  end
+
   ruby_version_is '3.0' do
-    it "strips leading \\0" do
+    it "removes leading NULL bytes and whitespace" do
       a = "\000 \000hello\000 \000"
       a.lstrip!
       a.should == "hello\000 \000"
     end
   end
 
-  it "returns nil if no modifications were made" do
-    a = "hello"
-    a.lstrip!.should == nil
-    a.should == "hello"
-  end
-
   it "raises a FrozenError on a frozen instance that is modified" do
     -> { "  hello  ".freeze.lstrip! }.should raise_error(FrozenError)
   end
@@ -51,9 +65,13 @@
     -> { "".freeze.lstrip!      }.should raise_error(FrozenError)
   end
 
-  it "raises an ArgumentError if the first codepoint is invalid" do
+  it "raises an ArgumentError if the first non-space codepoint is invalid" do
     s = "\xDFabc".force_encoding(Encoding::UTF_8)
     s.valid_encoding?.should be_false
     -> { s.lstrip! }.should raise_error(ArgumentError)
+
+    s = "   \xDFabc".force_encoding(Encoding::UTF_8)
+    s.valid_encoding?.should be_false
+    -> { s.lstrip! }.should raise_error(ArgumentError)
   end
 end
diff --git a/spec/ruby/core/string/ord_spec.rb b/spec/ruby/core/string/ord_spec.rb
@@ -25,4 +25,9 @@
   it "raises an ArgumentError if called on an empty String" do
     -> { ''.ord }.should raise_error(ArgumentError)
   end
+
+  it "raises ArgumentError if the character is broken" do
+    s = "©".force_encoding("US-ASCII")
+    -> { s.ord }.should raise_error(ArgumentError, "invalid byte sequence in US-ASCII")
+  end
 end
diff --git a/spec/ruby/core/string/reverse_spec.rb b/spec/ruby/core/string/reverse_spec.rb
@@ -29,6 +29,14 @@
   it "reverses a string with multi byte characters" do
     "微軟正黑體".reverse.should == "體黑正軟微"
   end
+
+  it "works with a broken string" do
+    str = "微軟\xDF\xDE正黑體".force_encoding(Encoding::UTF_8)
+
+    str.valid_encoding?.should be_false
+
+    str.reverse.should == "體黑正\xDE\xDF軟微"
+  end
 end
 
 describe "String#reverse!" do
@@ -55,4 +63,13 @@
     str.reverse!
     str.should == "體黑正軟微"
   end
+
+  it "works with a broken string" do
+    str = "微軟\xDF\xDE正黑體".force_encoding(Encoding::UTF_8)
+
+    str.valid_encoding?.should be_false
+    str.reverse!
+
+    str.should == "體黑正\xDE\xDF軟微"
+  end
 end