Skip to content

Commit

Permalink
Add possibility of concatenating elements of an array independently
Browse files Browse the repository at this point in the history
The filter now accepts an integer incrementGap parameter, which should be set to
the same position offset gap that is set for the array you are saving. Tokens whose
position increment does not exceed this gap are merged; any other token starts a new output token.
  • Loading branch information
francesconero committed Aug 3, 2015
1 parent 266f430 commit 266ee71
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 15 deletions.
8 changes: 6 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<name>elasticsearch-concatenate</name>
<modelVersion>4.0.0</modelVersion>
<groupId>elasticsearch.concatenate</groupId>
<artifactId>elasticsearch-concatenate</artifactId>
<version>1.0.0</version>
<version>1.1.0</version>
<packaging>jar</packaging>
<description>Plugin that provides a Token Filter that recombines all of the tokens in a token stream back into one.</description>
<inceptionYear>2015</inceptionYear>
Expand All @@ -18,6 +18,7 @@
</licenses>
<properties>
<elasticsearch.version>1.5.2</elasticsearch.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<build>
<resources>
Expand All @@ -34,6 +35,9 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>3.2</version>
<configuration>
<compilerArgs>
<arg>-Xlint:deprecation</arg>
</compilerArgs>
<source>1.7</source>
<target>1.7</target>
</configuration>
Expand Down
55 changes: 43 additions & 12 deletions src/main/java/elasticsearch/concatenate/ConcatenateFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,70 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/**
 * Token filter that joins the terms of consecutive input tokens into a single
 * term, placing {@code tokenSeparator} between them. A token whose position
 * increment is greater than {@code incrementGap} is not joined to the tokens
 * before it; it is remembered and used to start the next output term.
 *
 * NOTE(review): this listing comes from a commit diff view and appears to
 * interleave removed and added lines (two loop bodies, two {@code return}
 * statements, both {@code result} and {@code empty}). Confirm against the
 * real file before relying on the exact statement order shown here.
 */
public final class ConcatenateFilter extends TokenFilter {

// separator used when the constructor receives a null tokenSeparator
private final static String DEFAULT_TOKEN_SEPARATOR = " ";

// term text of the current token; read while concatenating and overwritten with the joined result
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
// position increment of the current token; compared against incrementGap
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private String tokenSeparator = null;
// largest position increment that is still joined into the current output term
private int incrementGap = 100;
// scratch buffer for the concatenated term; cleared at the start of every incrementToken() call
private StringBuilder builder = new StringBuilder();

// attribute state captured for the token that exceeded incrementGap
private AttributeSource.State previousState = null;
// true when previousState holds a token that the next incrementToken() call must start from
private boolean recheckPrevious = false;

/**
 * Creates a filter with an explicit increment gap.
 *
 * @param matchVersion   accepted but not used by this constructor
 * @param input          the token stream to read from
 * @param tokenSeparator text placed between joined terms; {@code null} selects a single space
 * @param incrementGap   largest position increment that is still joined into the current term
 */
public ConcatenateFilter(Version matchVersion, TokenStream input, String tokenSeparator, int incrementGap) {
super(input);
this.tokenSeparator = tokenSeparator!=null ? tokenSeparator : DEFAULT_TOKEN_SEPARATOR;
this.incrementGap = incrementGap;
}

/**
 * Creates a filter with the increment gap fixed at 100.
 *
 * @param matchVersion   accepted but not used by this constructor
 * @param input          the token stream to read from
 * @param tokenSeparator text placed between joined terms; {@code null} selects a single space
 */
public ConcatenateFilter(Version matchVersion, TokenStream input, String tokenSeparator) {
super(input);
this.tokenSeparator = tokenSeparator!=null ? tokenSeparator : DEFAULT_TOKEN_SEPARATOR;
this.incrementGap = 100;
}

/**
 * Consumes tokens from the input stream and writes their concatenation into
 * the term attribute.
 *
 * @return the value of the local flag returned at the end of the method; see
 *         the review notes in the body, since two return statements are shown
 * @throws IOException declared by the overridden method; the only calls in the
 *         body that read the input are {@code input.incrementToken()}
 */
@Override
public boolean incrementToken() throws IOException {
boolean result = false;
boolean empty = false;
builder.setLength(0);

// a previous call stopped on a token beyond the gap: restore it and start the new term with it
if(recheckPrevious) {
restoreState(previousState);
// append the term of the current token
builder.append(termAtt.buffer(), 0, termAtt.length());
recheckPrevious = false;
}

// NOTE(review): the first loop body below (unconditional append) looks like the
// pre-change version and the gap-aware body after it like the post-change version;
// as listed the braces do not describe a single coherent loop.
while (input.incrementToken()) {
if (builder.length()>0) {
// append the token separator
builder.append(tokenSeparator);
}
// append the term of the current token
builder.append(termAtt.buffer(), 0, termAtt.length());
}
// token is within the gap: join it to the term being built
if(posIncrAtt.getPositionIncrement() <= incrementGap) {
if (builder.length()>0) {
// append the token separator
builder.append(tokenSeparator);
}
// append the term of the current token
builder.append(termAtt.buffer(), 0, termAtt.length());
} else {
// we have found a new element in the array, the next token should start from this one
recheckPrevious = true;
previousState = captureState();
break;
}
}

// publish the joined text as the term of the output token
if (builder.length()>0) {
termAtt.setEmpty().append(builder);
result = true;
// NOTE(review): after a gap break recheckPrevious is true, so `empty` stays false and
// the method would return false although a term was just written and a token is
// still pending in previousState - confirm this is intended.
if(!recheckPrevious) {
empty = true;
}
}
return result;

return empty;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,19 @@
/**
 * Factory that reads the filter's settings and wraps token streams in a
 * {@code ConcatenateFilter}.
 *
 * NOTE(review): this listing comes from a commit diff view; {@code create}
 * shows two {@code return} statements, which look like the removed and the
 * added line of the diff. Confirm against the real file.
 */
public class ConcatenateTokenFilterFactory extends AbstractTokenFilterFactory {

// value of the "token_separator" setting, passed to every created filter
private String tokenSeparator = null;
// value of the "increment_gap" setting; 100 when the setting is not given
private int incrementGap = 100;

/**
 * Reads the filter configuration from {@code settings}.
 *
 * @param index         forwarded to the superclass constructor
 * @param indexSettings forwarded to the superclass constructor
 * @param name          forwarded to the superclass constructor
 * @param settings      source of the "token_separator" and "increment_gap" values
 */
@Inject
public ConcatenateTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
// the token_separator is defined in the ES configuration file
tokenSeparator = settings.get("token_separator");
// the second argument is the default passed to getAsInt
incrementGap = settings.getAsInt("increment_gap", 100);
}

/**
 * Wraps the given stream in a {@code ConcatenateFilter} configured with the
 * values read in the constructor.
 *
 * @param tokenStream the stream to wrap
 * @return the new filter
 */
@Override
public TokenStream create(TokenStream tokenStream) {
return new ConcatenateFilter(Version.LUCENE_CURRENT, tokenStream, tokenSeparator);
return new ConcatenateFilter(Version.LATEST, tokenStream, tokenSeparator, incrementGap);
}

}

0 comments on commit 266ee71

Please sign in to comment.