Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements font styles in the output XML #936

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
140c7ec
implement style for paragraphs
lfoppiano Jul 22, 2022
e398686
correct missing paragraphs
lfoppiano Jul 22, 2022
e6ba12b
add style to figure and table caption
lfoppiano Jul 22, 2022
17a9146
add style to title
lfoppiano Jul 22, 2022
cc3a0e5
wrongly inverted if
lfoppiano Jul 24, 2022
d5ae544
missing parenthesis
lfoppiano Jul 24, 2022
064c2f6
add decoration in equation/formula
lfoppiano Jul 25, 2022
c5f607b
Fix spaces
lfoppiano Jul 25, 2022
386e6b4
add comments
lfoppiano Jul 25, 2022
44e70f3
add some more tests
lfoppiano Jul 25, 2022
a090297
some refactoring
lfoppiano Jul 25, 2022
599559e
minor changes
lfoppiano Jul 25, 2022
eee28ab
implement change when segmenting paragraphs in sentences
lfoppiano Jul 25, 2022
af94428
Test sentence segmentation with decoration and references
lfoppiano Jul 25, 2022
e8a00fe
Fix style extraction + adding more tests
lfoppiano Jul 26, 2022
ccce049
Split decoration between sentences if needed
lfoppiano Jul 27, 2022
aaf211d
Fix bugs with the text accumulator
lfoppiano Jul 27, 2022
b82cc43
Fix incorrect split and position in sentence markers
lfoppiano Jul 28, 2022
053b235
remove suffix space when there is no more text
lfoppiano Sep 12, 2022
80b98c4
fix OOBE when applying sentence splitting
lfoppiano Sep 13, 2022
e3a44d4
Merge branch 'master' into feature/add-styles-xml
lfoppiano May 17, 2023
d57c82c
avoid adding styles in head sections
lfoppiano May 17, 2023
9adb8d8
fix inconsistency when having notes in the same page
lfoppiano May 17, 2023
a25de0d
Merge branch 'master' into feature/add-styles-xml
lfoppiano Dec 17, 2023
110ded5
Merge branch 'refs/heads/master' into feature/add-styles-xml
lfoppiano Apr 15, 2024
188cda5
Merge master into features/add-styles-xml
lfoppiano Apr 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions grobid-core/src/main/java/org/grobid/core/data/Equation.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,27 @@

import nu.xom.Attribute;
import nu.xom.Element;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Triple;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.engines.Engine;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.BoundingBoxCalculator;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.counters.CntManager;
import org.grobid.core.utilities.TextUtilities;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.SortedSet;

import static org.grobid.core.document.TEIFormatter.*;

/**
* Class for representing an equation.
*
Expand Down Expand Up @@ -56,9 +62,15 @@ public Element toTEIElement(GrobidAnalysisConfig config) {
XmlBuilderUtils.addCoords(formulaElement, LayoutTokensUtil.getCoordsStringForOneBox(getLayoutTokens()));
}

formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim());
List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(getContentTokens(), Arrays.asList(TEI_STYLE_BOLD_NAME, TEI_STYLE_ITALIC_NAME)) ;

if (CollectionUtils.isNotEmpty(stylesList)) {
applyStyleList(formulaElement, getContent(), stylesList);
} else {
formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim());
}

if ( (label != null) && (label.length()>0) ) {
if ( StringUtils.isNotEmpty(label) ) {
Element labelEl = XmlBuilderUtils.teiElement("label",
LayoutTokensUtil.normalizeText(label.toString()));
formulaElement.appendChild(labelEl);
Expand All @@ -79,6 +91,16 @@ public List<LayoutToken> getContentTokens() {
return contentTokens;
}

/**
 * Appends the supplied layout tokens to the content tokens of this equation.
 * A {@code null} argument is ignored; the backing list is created on first
 * use when it has not been initialised yet.
 *
 * @param tokens the layout tokens to append, may be {@code null}
 */
public void addContentTokens(List<LayoutToken> tokens) {
    if (tokens != null) {
        if (contentTokens == null) {
            contentTokens = new ArrayList<>();
        }
        contentTokens.addAll(tokens);
    }
}

/**
 * Returns the layout tokens stored for the label of this equation.
 *
 * @return the current value of the {@code labelTokens} field, as is
 */
public List<LayoutToken> getLabelTokens() {
    return this.labelTokens;
}
Expand Down Expand Up @@ -181,9 +203,9 @@ public void addLayoutTokens(List<LayoutToken> tokens) {
if (tokens == null)
return;
if (layoutTokens == null)
layoutTokens = new ArrayList<LayoutToken>();
for(LayoutToken token : tokens)
layoutTokens.add(token);
layoutTokens = new ArrayList<>();

layoutTokens.addAll(tokens);
}

public List<BoundingBox> getCoordinates() {
Expand Down
22 changes: 15 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import com.google.common.collect.Lists;
import com.google.common.base.Joiner;

import org.apache.commons.lang3.tuple.Triple;
import org.grobid.core.GrobidModels;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
Expand All @@ -17,12 +18,9 @@
import org.grobid.core.layout.GraphicObjectType;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.VectorGraphicBoxCalculator;
import org.grobid.core.utilities.BoundingBoxCalculator;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.*;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.KeyGen;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType;
Expand All @@ -41,6 +39,8 @@
import java.util.SortedSet;
import java.util.Collections;

import static org.grobid.core.document.TEIFormatter.applyStyleList;
import static org.grobid.core.document.TEIFormatter.extractStylesList;
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;
Expand Down Expand Up @@ -388,7 +388,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references
if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotEmpty(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

Expand All @@ -404,7 +404,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens());
String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");

if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
Expand All @@ -422,7 +424,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
LOGGER.warn("Problem when serializing TEI fragment for figure caption", e);
}
} else {
desc.appendChild(textNode(clusterContent));
List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);

if (CollectionUtils.isNotEmpty(stylesList)) {
applyStyleList(desc, text, stylesList);
} else {
desc.appendChild(StringUtils.normalizeSpace(text));
}
}
}
} else {
Expand Down
19 changes: 15 additions & 4 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.tuple.Triple;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand All @@ -15,6 +17,7 @@
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.BoundingBoxCalculator;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.counters.CntManager;
import org.grobid.core.engines.counters.TableRejectionCounters;
import org.grobid.core.tokenization.TaggingTokenCluster;
Expand All @@ -30,9 +33,9 @@
import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Text;

import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.TEIFormatter.applyStyleList;
import static org.grobid.core.document.TEIFormatter.extractStylesList;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;

Expand Down Expand Up @@ -119,7 +122,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens());
String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");

if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
Expand All @@ -137,7 +142,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
LOGGER.warn("Problem when serializing TEI fragment for table caption", e);
}
} else {
desc.appendChild(textNode(clusterContent));
List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);

if (CollectionUtils.isNotEmpty(stylesList)) {
applyStyleList(desc, text, stylesList);
} else {
desc.appendChild(StringUtils.normalizeSpace(text));
}
}

if (desc != null && config.isWithSentenceSegmentation()) {
Expand Down
Loading
Loading