Add the ability to concatenate the elements of an array independently

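The filter now accepts an integer incrementGap parameter (the `increment_gap` setting, default 100), which should be set to the same position offset gap that is configured for the array field you are indexing. Tokens whose position increment does not exceed this gap are merged into the current token; a token with a larger increment starts a new token.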
francesconero · Aug 3, 2015 · 266ee71 · 266ee71
1 parent 266f430
commit 266ee71
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 15 deletions.
diff --git a/pom.xml b/pom.xml
@@ -1,11 +1,11 @@
-<?xml version="1.0" encoding="UTF-8"?>
+<?xml version="1.0" encoding="UTF-8"?>
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	<name>elasticsearch-concatenate</name>
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>elasticsearch.concatenate</groupId>
 	<artifactId>elasticsearch-concatenate</artifactId>
-	<version>1.0.0</version>
+	<version>1.1.0</version>
 	<packaging>jar</packaging>
 	<description>Plugin that provides a Token Filter that recombines all of the tokens in a token stream back into one.</description>
 	<inceptionYear>2015</inceptionYear>
@@ -18,6 +18,7 @@
 	</licenses>
 	<properties>
 		<elasticsearch.version>1.5.2</elasticsearch.version>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 	</properties>
 	<build>
 		<resources>
@@ -34,6 +35,9 @@
 				<artifactId>maven-compiler-plugin</artifactId>
 				<version>3.2</version>
 				<configuration>
+				<compilerArgs>
+    <arg>-Xlint:deprecation</arg>
+  </compilerArgs>
 					<source>1.7</source>
 					<target>1.7</target>
 				</configuration>

diff --git a/src/main/java/elasticsearch/concatenate/ConcatenateFilter.java b/src/main/java/elasticsearch/concatenate/ConcatenateFilter.java
@@ -5,39 +5,70 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
 public final class ConcatenateFilter extends TokenFilter {
 
     private final static String DEFAULT_TOKEN_SEPARATOR = " ";
 
     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     private String tokenSeparator = null;
+    private int incrementGap = 100;
     private StringBuilder builder = new StringBuilder();
-
+    private AttributeSource.State previousState = null;
+    private boolean recheckPrevious = false;
+
+    public ConcatenateFilter(Version matchVersion, TokenStream input, String tokenSeparator, int incrementGap) {
+        super(input);
+        this.tokenSeparator = tokenSeparator!=null ? tokenSeparator : DEFAULT_TOKEN_SEPARATOR;
+        this.incrementGap = incrementGap;
+    }
+
     public ConcatenateFilter(Version matchVersion, TokenStream input, String tokenSeparator) {
         super(input);
         this.tokenSeparator = tokenSeparator!=null ? tokenSeparator : DEFAULT_TOKEN_SEPARATOR;
+        this.incrementGap = 100;
     }
 
     @Override
     public boolean incrementToken() throws IOException {
-        boolean result = false;
+        boolean empty = false;
         builder.setLength(0);
+
+	if(recheckPrevious) {
+	   restoreState(previousState);
+	   // append the term of the current token
+           builder.append(termAtt.buffer(), 0, termAtt.length());
+           recheckPrevious = false;
+	}
+
         while (input.incrementToken()) {
-            if (builder.length()>0) {
-                // append the token separator
-                builder.append(tokenSeparator);
-            }
-            // append the term of the current token
-            builder.append(termAtt.buffer(), 0, termAtt.length());
-        }
+	    if(posIncrAtt.getPositionIncrement() <= incrementGap) {
+                if (builder.length()>0) {
+                    // append the token separator
+                    builder.append(tokenSeparator);
+                }
+                // append the term of the current token
+                builder.append(termAtt.buffer(), 0, termAtt.length());
+            } else {
+		// we have found a new element in the array, the next token should start from this one
+                recheckPrevious = true;
+		previousState = captureState();
+	        break;
+	    }
+	}
+
         if (builder.length()>0) {
             termAtt.setEmpty().append(builder);
-            result = true;
+	    if(!recheckPrevious) {
+	        empty = true;
+	    }
         }
-        return result;
+
+        return empty;
     }
 
 }
diff --git a/src/main/java/elasticsearch/concatenate/ConcatenateTokenFilterFactory.java b/src/main/java/elasticsearch/concatenate/ConcatenateTokenFilterFactory.java
@@ -19,17 +19,19 @@
 public class ConcatenateTokenFilterFactory extends AbstractTokenFilterFactory {
 
     private String tokenSeparator = null;
+    private int incrementGap = 100;
 
     @Inject 
     public ConcatenateTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
         // the token_separator is defined in the ES configuration file
         tokenSeparator = settings.get("token_separator");
+	incrementGap = settings.getAsInt("increment_gap", 100);
     }
 
     @Override 
     public TokenStream create(TokenStream tokenStream) {
-        return new ConcatenateFilter(Version.LUCENE_CURRENT, tokenStream, tokenSeparator);
+        return new ConcatenateFilter(Version.LATEST, tokenStream, tokenSeparator, incrementGap);
     }
 
 }