From 1adb7040345a959037e16656c93f0dd4b6619bb4 Mon Sep 17 00:00:00 2001 From: duanyang25 Date: Sat, 4 Dec 2021 11:04:28 -0600 Subject: [PATCH 1/8] reproduce #1341 --- src/test/java/org/jsoup/1341.html | 12 ++++++++++ src/test/java/org/jsoup/ReproduceTest.java | 26 ++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/test/java/org/jsoup/1341.html create mode 100644 src/test/java/org/jsoup/ReproduceTest.java diff --git a/src/test/java/org/jsoup/1341.html b/src/test/java/org/jsoup/1341.html new file mode 100644 index 0000000000..a3e56c73be --- /dev/null +++ b/src/test/java/org/jsoup/1341.html @@ -0,0 +1,12 @@ + + + + + + + UnboundPrefix + + + + + diff --git a/src/test/java/org/jsoup/ReproduceTest.java b/src/test/java/org/jsoup/ReproduceTest.java new file mode 100644 index 0000000000..f24e426483 --- /dev/null +++ b/src/test/java/org/jsoup/ReproduceTest.java @@ -0,0 +1,26 @@ +package org.jsoup; + +import org.jsoup.nodes.DataNode; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.Elements; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ReproduceTest { + @Test + public void issue1341Test() throws IOException { + File input = new File("src/test/java/org/jsoup/1341.html"); + Document doc = Jsoup.parse(input, "UTF-8", ""); + + Element rv = doc.select("body").get(0).children().get(0); + assertEquals("test:h1", rv.tagName()); + } +} From 4664d53da98d48e3d7661bf6f3f393e5cdccc97f Mon Sep 17 00:00:00 2001 From: duanyang25 Date: Tue, 7 Dec 2021 04:40:47 -0600 Subject: [PATCH 2/8] pinpoint files --- src/main/java/org/jsoup/parser/Tag.java | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index d573033a56..c9ebc8c04e 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -46,6 +46,28 @@ public String normalName() { return normalName; } + /** + * Get the tag name whose symbols are converted to Unicodes. + * @return the tag's converted name. + */ + public String convertSymbol(String tagName) { + String convertName = ""; + for(int i = 0; i < tagName.length(); i++){ + char curr = tagName.charAt(i); + + + // TODO: check whether the char is symbol + + + // TODO: convert the symbol to Unicode `U00` + unicode + + convertName += curr; + } + + return convertName; + } + + /** * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. *

From 2b20403364beed81309201deacd94fce4f62bf33 Mon Sep 17 00:00:00 2001 From: duanyang25 Date: Tue, 7 Dec 2021 05:39:49 -0600 Subject: [PATCH 3/8] try convert --- src/main/java/org/jsoup/parser/Tag.java | 29 +++++++++++++++++++--- src/test/java/org/jsoup/ReproduceTest.java | 2 ++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index c9ebc8c04e..30e6da08c5 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -16,6 +16,7 @@ public class Tag implements Cloneable { private String tagName; private String normalName; // always the lower case version of this tag, regardless of case preservation mode + private String unicodeName; // always the converted version of this tag with symbols in Uni-16 and original letters, regardless of case preservation mode private boolean isBlock = true; // block private boolean formatAsBlock = true; // should be formatted as a block private boolean empty = false; // can hold nothing; e.g. img @@ -27,6 +28,7 @@ public class Tag implements Cloneable { private Tag(String tagName) { this.tagName = tagName; normalName = Normalizer.lowerCase(tagName); + unicodeName = convertSymbol(tagName); } /** @@ -46,22 +48,41 @@ public String normalName() { return normalName; } + /** + * Get this tag's name whose symbols are converted to Unicode. + * @return the tag's converted name. + */ + public String unicodeName() { + return unicodeName; + } + /** * Get the tag name whose symbols are converted to Unicodes. + * + * @param tagName Name of tag, e.g. "p". Case insensitive. * @return the tag's converted name. */ public String convertSymbol(String tagName) { String convertName = ""; for(int i = 0; i < tagName.length(); i++){ - char curr = tagName.charAt(i); + char c = tagName.charAt(i); + boolean isDigit = Character.isDigit(c); + boolean isLowerLetter = Character.isLowerCase(c); + boolean isUpperLetter = Character.isUpperCase(c); - // TODO: check whether the char is symbol + // check whether the char is a digit or letter + if(!(isDigit | isLowerLetter | isUpperLetter)){ + // convert the symbol to Unicode `U00` + unicode + int uni16 = (int) c; - // TODO: convert the symbol to Unicode `U00` + unicode - convertName += curr; + convertName += "U"; + convertName += Integer.toHexString(uni16 | 0x000000); + }else{ + convertName += c; + } } return convertName; diff --git a/src/test/java/org/jsoup/ReproduceTest.java b/src/test/java/org/jsoup/ReproduceTest.java index f24e426483..d00ab6cad1 100644 --- a/src/test/java/org/jsoup/ReproduceTest.java +++ b/src/test/java/org/jsoup/ReproduceTest.java @@ -21,6 +21,8 @@ public void issue1341Test() throws IOException { Document doc = Jsoup.parse(input, "UTF-8", ""); Element rv = doc.select("body").get(0).children().get(0); + assertEquals("testU00003Ah1", rv.tag().unicodeName()); assertEquals("test:h1", rv.tagName()); + } } From 496ccefb29102bc801565f41ee3875385fa04d1f Mon Sep 17 00:00:00 2001 From: duanyang25 Date: Tue, 7 Dec 2021 06:03:33 -0600 Subject: [PATCH 4/8] successfully handle unrecognized symbols to Uni-16 bytes --- src/main/java/org/jsoup/parser/Tag.java | 12 +++++++-- src/test/java/org/jsoup/1341.html | 12 --------- src/test/java/org/jsoup/ReproduceTest.java | 28 --------------------- src/test/java/org/jsoup/parser/TagTest.java | 28 +++++++++++++++++++++ 4 files changed, 38 insertions(+), 42 deletions(-) delete mode 100644 src/test/java/org/jsoup/1341.html delete mode 100644 src/test/java/org/jsoup/ReproduceTest.java diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index 30e6da08c5..b3865203ad 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -75,11 +75,19 @@ public String convertSymbol(String tagName) { if(!(isDigit | isLowerLetter | isUpperLetter)){ // convert the symbol to Unicode `U00` + unicode int uni16 = (int) c; + String hexString = Integer.toHexString(uni16); + // Reference: https://stackoverflow.com/questions/8689526/integer-to-two-digits-hex-in-java + convertName += "U"; + if(hexString.length() < 6){ + for(int idx = 0; idx < (6 - hexString.length()); idx++){ + convertName += "0"; + } + } + + convertName += hexString.toUpperCase(); - convertName += "U"; - convertName += Integer.toHexString(uni16 | 0x000000); }else{ convertName += c; } diff --git a/src/test/java/org/jsoup/1341.html b/src/test/java/org/jsoup/1341.html deleted file mode 100644 index a3e56c73be..0000000000 --- a/src/test/java/org/jsoup/1341.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - UnboundPrefix - - - - - diff --git a/src/test/java/org/jsoup/ReproduceTest.java b/src/test/java/org/jsoup/ReproduceTest.java deleted file mode 100644 index d00ab6cad1..0000000000 --- a/src/test/java/org/jsoup/ReproduceTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.jsoup; - -import org.jsoup.nodes.DataNode; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class ReproduceTest { - @Test - public void issue1341Test() throws IOException { - File input = new File("src/test/java/org/jsoup/1341.html"); - Document doc = Jsoup.parse(input, "UTF-8", ""); - - Element rv = doc.select("body").get(0).children().get(0); - assertEquals("testU00003Ah1", rv.tag().unicodeName()); - assertEquals("test:h1", rv.tagName()); - - } -} diff --git a/src/test/java/org/jsoup/parser/TagTest.java b/src/test/java/org/jsoup/parser/TagTest.java index 65a794d847..a98f244c24 100644 --- a/src/test/java/org/jsoup/parser/TagTest.java +++ b/src/test/java/org/jsoup/parser/TagTest.java @@ -1,8 +1,12 @@ package org.jsoup.parser; +import org.jsoup.Jsoup; import org.jsoup.MultiLocaleExtension.MultiLocaleTest; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.junit.jupiter.api.Test; +import java.util.List; import java.util.Locale; import static org.junit.jupiter.api.Assertions.*; @@ -81,4 +85,28 @@ public void canBeInsensitive(Locale locale) { assertTrue(Tag.isKnownTag("div")); assertFalse(Tag.isKnownTag("explain")); } + + // Test the tag containing symbols like `:` + // Issue #1341 + // https://github.com/jhy/jsoup/issues/1341 + @Test public void handleSymbolTags() { + String h = "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\tUnboundPrefix\n" + + "\t\n" + + " \t\n" + + " \t\n" + + " \n" + + "\n"; + + Document doc = Jsoup.parse(h); + + Element rv = doc.select("body").get(0).children().get(0); + assertEquals("testU00003Ah1", rv.tag().unicodeName()); + assertEquals("test:h1", rv.tagName()); + } } From 97a18b922006672c7c1712b675edf28c21ba16ce Mon Sep 17 00:00:00 2001 From: duanyang25 Date: Tue, 7 Dec 2021 08:14:55 -0600 Subject: [PATCH 5/8] reproduce the second case #1341 --- .../org/jsoup/parser/AttributeParseTest.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/test/java/org/jsoup/parser/AttributeParseTest.java b/src/test/java/org/jsoup/parser/AttributeParseTest.java index d328b9fb40..1858b3765a 100644 --- a/src/test/java/org/jsoup/parser/AttributeParseTest.java +++ b/src/test/java/org/jsoup/parser/AttributeParseTest.java @@ -96,4 +96,28 @@ public class AttributeParseTest { doc = Jsoup.parse(html, "", Parser.xmlParser()); assertEquals("", doc.html()); } + + // Test the attribute name like `xlink:href` + // Issue #1341 + // https://github.com/jhy/jsoup/issues/1341 + @Test public void handleUnboundPrefixofXlink() { + String h = "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\tUnboundPrefix\n" + + "\t\n" + + " \t\n" + + " \t\n" + + " \n" + + "\n"; + + Document doc = Jsoup.parse(h); + + Element rv = doc.select("body").get(0).children().get(1); + + assertEquals("UnboundPrefix", rv.attributes().get("xlink:href")); + } } From 9d2b0b8d900f6f13baa0256c85041dfe6d57e90c Mon Sep 17 00:00:00 2001 From: duanyang25 Date: Tue, 7 Dec 2021 08:40:38 -0600 Subject: [PATCH 6/8] handle the second case --- src/main/java/org/jsoup/nodes/Attribute.java | 51 +++++++++++++++++++ src/main/java/org/jsoup/parser/Tag.java | 5 +- .../org/jsoup/parser/AttributeParseTest.java | 3 ++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index 5cdecc339c..ebd0abdcc4 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -24,6 +24,7 @@ public class Attribute implements Map.Entry, Cloneable { }; private String key; + private String convertedKey; // the converted version of the key with symbols in Uni-16 and original letters @Nullable private String val; @Nullable Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface @@ -48,10 +49,52 @@ public Attribute(String key, @Nullable String val, @Nullable Attributes parent) key = key.trim(); Validate.notEmpty(key); // trimming could potentially make empty, so validate here this.key = key; + this.convertedKey = convertSymbol(key); this.val = val; this.parent = parent; } + /** + * Convert the unencoded (raw) key that contains unrecognized symbols to Unicode 16 following + * HTML Living Standard https://html.spec.whatwg.org/#coercing-an-html-dom-into-an-infoset. + * + * @param key attribute key; case is preserved. + * @return the converted key. + */ + public String convertSymbol(String key) { + String convertedKey = ""; + for(int i = 0; i < key.length(); i++){ + char c = key.charAt(i); + + boolean isDigit = Character.isDigit(c); + boolean isLowerLetter = Character.isLowerCase(c); + boolean isUpperLetter = Character.isUpperCase(c); + + // check whether the char is a digit or letter + if(!(isDigit | isLowerLetter | isUpperLetter)){ + // convert the symbol to Unicode `U00` + unicode + int uni16 = (int) c; + String hexString = Integer.toHexString(uni16); + + // Reference: https://stackoverflow.com/questions/8689526/integer-to-two-digits-hex-in-java + convertedKey += "U"; + + if(hexString.length() < 6){ + for(int idx = 0; idx < (6 - hexString.length()); idx++){ + convertedKey += "0"; + } + } + + convertedKey += hexString.toUpperCase(); + + }else{ + convertedKey += c; + } + } + + return convertedKey; + } + /** Get the attribute key. @return the attribute key @@ -60,6 +103,14 @@ public String getKey() { return key; } + /** + Get the attribute converted key. + @return the attribute converted key + */ + public String getConvertedKey() { + return convertedKey; + } + /** Set the attribute key; case is preserved. @param key the new key; must not be null diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index b3865203ad..0e93bd5d8d 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -57,9 +57,10 @@ public String unicodeName() { } /** - * Get the tag name whose symbols are converted to Unicodes. + * Get the tag name whose symbols are converted to Unicode 16 following + * HTML Living Standard https://html.spec.whatwg.org/#coercing-an-html-dom-into-an-infoset. * - * @param tagName Name of tag, e.g. "p". Case insensitive. + * @param tagName Name of tag, e.g. "p", case is preserved. * @return the tag's converted name. */ public String convertSymbol(String tagName) { diff --git a/src/test/java/org/jsoup/parser/AttributeParseTest.java b/src/test/java/org/jsoup/parser/AttributeParseTest.java index 1858b3765a..60fadbc7e9 100644 --- a/src/test/java/org/jsoup/parser/AttributeParseTest.java +++ b/src/test/java/org/jsoup/parser/AttributeParseTest.java @@ -118,6 +118,9 @@ public class AttributeParseTest { Element rv = doc.select("body").get(0).children().get(1); + assertEquals("xlinkU00003Ahref", rv.attributes().asList().get(2).getConvertedKey()); + + assertEquals("UnboundPrefix", rv.attributes().asList().get(2).getValue()); assertEquals("UnboundPrefix", rv.attributes().get("xlink:href")); } } From e7001804046d1485f5fd5c276e5af383ffe19cfc Mon Sep 17 00:00:00 2001 From: Yang <34642309+duanyang25@users.noreply.github.com> Date: Tue, 7 Dec 2021 08:54:12 -0600 Subject: [PATCH 7/8] Update Attribute.java --- src/main/java/org/jsoup/nodes/Attribute.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index ebd0abdcc4..89a0c8ac94 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -72,7 +72,7 @@ public String convertSymbol(String key) { // check whether the char is a digit or letter if(!(isDigit | isLowerLetter | isUpperLetter)){ - // convert the symbol to Unicode `U00` + unicode + // convert the symbol to Unicode `U` + unicode with 6 hex chars int uni16 = (int) c; String hexString = Integer.toHexString(uni16); From 239897f29724860f714f84d9888dfcb355eda311 Mon Sep 17 00:00:00 2001 From: Yang <34642309+duanyang25@users.noreply.github.com> Date: Tue, 7 Dec 2021 08:55:30 -0600 Subject: [PATCH 8/8] Update Tag.java --- src/main/java/org/jsoup/parser/Tag.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index 0e93bd5d8d..b0145d45c3 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -74,7 +74,7 @@ public String convertSymbol(String tagName) { // check whether the char is a digit or letter if(!(isDigit | isLowerLetter | isUpperLetter)){ - // convert the symbol to Unicode `U00` + unicode + // convert the symbol to Unicode `U` + unicode with 6 hex chars int uni16 = (int) c; String hexString = Integer.toHexString(uni16);