From b2be8a3f962afee78fdc4924a0ed45a5d6569fb3 Mon Sep 17 00:00:00 2001 From: MVP-D77 <11912714@mail.sustech.edu.cn> Date: Sat, 23 Apr 2022 17:36:37 +0800 Subject: [PATCH 1/3] FixBug of jsoup Inconsistent Handling Of Duplicate Attributes #1719 --- pom.xml | 8 ++ .../org/jsoup/parser/HtmlTreeBuilder.java | 3 + .../parser/DuplicateAttributes_Issue1719.java | 89 +++++++++++++++++++ 3 files changed, 100 insertions(+) create mode 100644 src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java diff --git a/pom.xml b/pom.xml index 4328b12716..b401863fc4 100644 --- a/pom.xml +++ b/pom.xml @@ -313,6 +313,14 @@ + + org.assertj + assertj-core + + 3.8.0 + test + + org.junit.jupiter diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 7ddda8e7d9..06ab40abc9 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -262,6 +262,9 @@ void insert(Element el) { } Element insertEmpty(Token.StartTag startTag) { + if (startTag.hasAttributes()) + startTag.attributes.deduplicate(settings); + Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); insertNode(el); diff --git a/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java b/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java new file mode 100644 index 0000000000..6bcbb77449 --- /dev/null +++ b/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java @@ -0,0 +1,89 @@ +package org.jsoup.parser; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DuplicateAttributes_Issue1719 { + private static final String HEAD = "\n" + + " \n" + + " \n" + + " "; + private static final String TRAIL = "\n" + + " \n" + + ""; + public static final String DESIRED_XML_TAG = ""; + 
public static final String DESIRED_HTML_TAG = ""; + + public static final String INPUT = ""; + + public static final String INPUT_NO_SLASH = ""; + + @Test + void parserXML() { + String doubleTag = INPUT; + Parser parser = Parser.xmlParser().setTrackErrors(10); + Document doc = parser.parseInput(doubleTag, ""); + System.out.println(doc.html()); + + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_TAG); + } + + @Test + void parserHTML() { + String doubleTag = INPUT; + Parser parser = Parser.htmlParser().setTrackErrors(10); + Document doc = parser.parseInput(doubleTag, ""); + + System.out.println(doc.html()); + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_HTML_TAG); + } + + @Test + void parserXML_toXML() { + String doubleTag = INPUT; + Parser parser = Parser.xmlParser().setTrackErrors(10); + Document doc = parser.parseInput(doubleTag, ""); + doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); + + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_TAG); + } + + @Test + void parserHTML_toXML() { + String doubleTag = INPUT; + Parser parser = Parser.htmlParser().setTrackErrors(10); + Document doc = parser.parseInput(doubleTag, ""); + doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); + + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_TAG); + } + + @Test + void jsoupParseToXML() { + String doubleTag = INPUT; + + final Document document = Jsoup.parse(doubleTag); + document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); + + String outputXhtml = document.html() + .replaceAll(" ", " ");// nbsp does not exist in xhtml. 
+ + assertThat(outputXhtml).isNotBlank().isEqualTo(HEAD + DESIRED_XML_TAG + TRAIL); + } + + @Test + void jsoupParseToXML_outerMethod() { + String doubleTag = INPUT; + + final Document document = Jsoup.parse(doubleTag); + document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); + + String outputXhtml = document.outerHtml() + .replaceAll(" ", " ");// nbsp does not exist in xhtml. + + assertThat(outputXhtml).isNotBlank().isEqualTo(HEAD + DESIRED_XML_TAG + TRAIL); + } +} \ No newline at end of file From 019b58ee92abeabf44b46380aab1496452a005ee Mon Sep 17 00:00:00 2001 From: MVP-D77 <11912714@mail.sustech.edu.cn> Date: Sun, 24 Apr 2022 20:54:49 +0800 Subject: [PATCH 2/3] FixBug of jsoup Inconsistent Handling Of Duplicate Attributes #1719 and do some update --- .../org/jsoup/parser/HtmlTreeBuilder.java | 3 - .../jsoup/parser/HtmlTreeBuilderState.java | 11 ++++ .../parser/DuplicateAttributes_Issue1719.java | 60 +++++++++++++------ 3 files changed, 53 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 06ab40abc9..7ddda8e7d9 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -262,9 +262,6 @@ void insert(Element el) { } Element insertEmpty(Token.StartTag startTag) { - if (startTag.hasAttributes()) - startTag.attributes.deduplicate(settings); - Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); insertNode(el); diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index af1fefe453..6e739eeb0c 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -316,11 +316,22 @@ boolean process(Token t, HtmlTreeBuilder tb) { return true; } + /** + * Current StartTag 
is handled for the InBody state; the handling depends on the tag name. + + * @param t the token to process; it is read as a start tag. + * @param tb the HtmlTreeBuilder that builds the node tree. + */ + private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { final Token.StartTag startTag = t.asStartTag(); final String name = startTag.normalName(); final ArrayList stack; Element el; +// cleanup duplicate attributes: + if (startTag.hasAttributes()) { + startTag.attributes.deduplicate(tb.settings); + } switch (name) { case "a": diff --git a/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java b/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java index 6bcbb77449..7be312f427 100644 --- a/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java +++ b/src/test/java/org/jsoup/parser/DuplicateAttributes_Issue1719.java @@ -14,56 +14,80 @@ public class DuplicateAttributes_Issue1719 { private static final String TRAIL = "\n" + " \n" + ""; - public static final String DESIRED_XML_TAG = ""; - public static final String DESIRED_HTML_TAG = ""; + public static final String DESIRED_XML_IMG_TAG = ""; + public static final String DESIRED_HTML_IMG_TAG = ""; - public static final String INPUT = ""; + public static final String IMG_INPUT = ""; - public static final String INPUT_NO_SLASH = ""; + public static final String IMG_INPUT_NO_SLASH = ""; + + + public static final String AREA_INPUT = "\n" + + " \n" + + " \n" + + "
\n" + +// " \nh2<\a>" + + "
\n" + + " \n" + + "\n"+ + " \n" + + ""; + + public static final String DESIRED_HTML_AREA_TAG = ""; @Test void parserXML() { - String doubleTag = INPUT; + String doubleTag = IMG_INPUT; Parser parser = Parser.xmlParser().setTrackErrors(10); Document doc = parser.parseInput(doubleTag, ""); System.out.println(doc.html()); - assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_TAG); + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_IMG_TAG); } @Test - void parserHTML() { - String doubleTag = INPUT; + void parserHTML_IMAGE() { + String doubleTag = IMG_INPUT; Parser parser = Parser.htmlParser().setTrackErrors(10); Document doc = parser.parseInput(doubleTag, ""); - System.out.println(doc.html()); - assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_HTML_TAG); +// System.out.println(doc.html()); + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_HTML_IMG_TAG); + } + + @Test + void parserHTML_AREA() { + String doubleTag = AREA_INPUT; + Parser parser = Parser.htmlParser().setTrackErrors(10); + Document doc = parser.parseInput(doubleTag, ""); + +// System.out.println(doc.html()); + assertThat(doc.selectFirst("area").outerHtml()).isNotBlank().isEqualTo(DESIRED_HTML_AREA_TAG); } @Test void parserXML_toXML() { - String doubleTag = INPUT; + String doubleTag = IMG_INPUT; Parser parser = Parser.xmlParser().setTrackErrors(10); Document doc = parser.parseInput(doubleTag, ""); doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); - assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_TAG); + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_IMG_TAG); } @Test void parserHTML_toXML() { - String doubleTag = INPUT; + String doubleTag = IMG_INPUT; Parser parser = Parser.htmlParser().setTrackErrors(10); Document doc = parser.parseInput(doubleTag, ""); 
doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); - assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_TAG); + assertThat(doc.selectFirst("img").outerHtml()).isNotBlank().isEqualTo(DESIRED_XML_IMG_TAG); } @Test void jsoupParseToXML() { - String doubleTag = INPUT; + String doubleTag = IMG_INPUT; final Document document = Jsoup.parse(doubleTag); document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); @@ -71,12 +95,12 @@ void jsoupParseToXML() { String outputXhtml = document.html() .replaceAll(" ", " ");// nbsp does not exist in xhtml. - assertThat(outputXhtml).isNotBlank().isEqualTo(HEAD + DESIRED_XML_TAG + TRAIL); + assertThat(outputXhtml).isNotBlank().isEqualTo(HEAD + DESIRED_XML_IMG_TAG + TRAIL); } @Test void jsoupParseToXML_outerMethod() { - String doubleTag = INPUT; + String doubleTag = IMG_INPUT; final Document document = Jsoup.parse(doubleTag); document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); @@ -84,6 +108,6 @@ void jsoupParseToXML_outerMethod() { String outputXhtml = document.outerHtml() .replaceAll(" ", " ");// nbsp does not exist in xhtml. 
- assertThat(outputXhtml).isNotBlank().isEqualTo(HEAD + DESIRED_XML_TAG + TRAIL); + assertThat(outputXhtml).isNotBlank().isEqualTo(HEAD + DESIRED_XML_IMG_TAG + TRAIL); } } \ No newline at end of file From b74a993e4a07254a72e09926dfbc782b80e6c526 Mon Sep 17 00:00:00 2001 From: MVP-D77 <11912714@mail.sustech.edu.cn> Date: Sun, 24 Apr 2022 21:15:03 +0800 Subject: [PATCH 3/3] Sorry, I ignore the warning of something and do finally modify.FixBug of jsoup Inconsistent Handling Of Duplicate Attributes #1719 and do some update --- src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index 6e739eeb0c..5ae363294f 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -329,8 +329,11 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { final ArrayList stack; Element el; // cleanup duplicate attributes: - if (startTag.hasAttributes()) { - startTag.attributes.deduplicate(tb.settings); + if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) { + int dupes = startTag.attributes.deduplicate(tb.settings); + if (dupes > 0) { + tb.error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); + } } switch (name) {