seen) {
+ for (int i = offset.startOffset(); i <= offset.endOffset(); i++) {
+ if (seen.contains(i))
+ return true;
+ }
+ return false;
+ }
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/RandomAccessCharOffsetContainer.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/RandomAccessCharOffsetContainer.java
new file mode 100644
index 000000000000..82973faf6621
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/RandomAccessCharOffsetContainer.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Map;
+
/**
 * Class to record results for looking up normalized terms (String) and
 * character offsets for specified tokens. Will return NULL_TERM/NULL_OFFSET if
 * a token offset was not found.
 * <p>
 * Has utility methods for safely getting the closest found token. This is
 * useful for when a concordance window ends in a stop word (no term/offset
 * info).
 */
public class RandomAccessCharOffsetContainer {

  /** term returned when no term was recorded for a token offset */
  public final static String NULL_TERM = "";
  /** offset returned when no character offset was recorded for a token offset */
  public final static int NULL_OFFSET = -1;

  // token offsets that have been added via add(); backs isEmpty()/getSet()
  private BitSet set = new BitSet();
  // greatest token offset seen so far; -1 when nothing has been added
  private int last = -1;
  // token offset -> normalized term
  private final Map<Integer, String> terms = new HashMap<>();
  // token offset -> start character offset
  private final Map<Integer, Integer> starts = new HashMap<>();
  // token offset -> end character offset
  private final Map<Integer, Integer> ends = new HashMap<>();

  /**
   * Record the character offsets and normalized term for one token.
   *
   * @param tokenOffset token of interest
   * @param startCharOffset start character offset within the stored field value
   * @param endCharOffset end character offset within the stored field value
   * @param term string term at that position; a null term is not stored
   */
  public void add(int tokenOffset, int startCharOffset,
                  int endCharOffset, String term) {
    addStart(tokenOffset, startCharOffset);
    addEnd(tokenOffset, endCharOffset);
    addTerm(tokenOffset, term);
    set.set(tokenOffset);
  }

  private void addTerm(int tokenOffset, String term) {
    if (term != null) {
      terms.put(tokenOffset, term);
    }
    last = Math.max(tokenOffset, last);
  }

  private void addStart(int tokenOffset, int charOffset) {
    starts.put(tokenOffset, charOffset);
    last = Math.max(tokenOffset, last);
  }

  private void addEnd(int tokenOffset, int charOffset) {
    ends.put(tokenOffset, charOffset);
    last = Math.max(tokenOffset, last);
  }

  /**
   * @param tokenOffset target token
   * @return the character offset for the first character of the tokenOffset;
   *         returns {@link #NULL_OFFSET} if tokenOffset wasn't found
   */
  public int getCharacterOffsetStart(int tokenOffset) {
    Integer start = starts.get(tokenOffset);
    return (start == null) ? NULL_OFFSET : start.intValue();
  }

  /**
   * @param tokenOffset target token
   * @return the character offset for the final character of the tokenOffset;
   *         returns {@link #NULL_OFFSET} if tokenOffset wasn't found
   */
  public int getCharacterOffsetEnd(int tokenOffset) {
    Integer end = ends.get(tokenOffset);
    return (end == null) ? NULL_OFFSET : end.intValue();
  }

  /**
   * @param tokenOffset tokenOffset
   * @return term stored at this tokenOffset; can return {@link #NULL_TERM}
   */
  public String getTerm(int tokenOffset) {
    String s = terms.get(tokenOffset);
    return (s == null) ? NULL_TERM : s;
  }

  /**
   * @return last/largest token offset, or -1 if nothing has been added
   */
  public int getLast() {
    return last;
  }

  /**
   * reset state for reuse
   */
  public void clear() {
    terms.clear();
    starts.clear();
    ends.clear();
    last = -1;
    // replace (rather than clear) the BitSet, as the original code did,
    // so prior getSet() callers keep the old snapshot
    set = new BitSet();
  }

  protected boolean isEmpty() {
    return set.isEmpty();
  }

  /**
   * Find the closest token with a recorded offset, starting from startToken
   * and ending with stopToken (inclusive). The search direction follows the
   * order of the two arguments (ascending or descending).
   *
   * @param startToken start token
   * @param stopToken end token
   * @param map map to search (starts or ends)
   * @return closest token offset to startToken with a non-null offset; can
   *         return {@link #NULL_OFFSET} if none was found
   */
  private int getClosestToken(int startToken, int stopToken,
                              Map<Integer, Integer> map) {
    if (startToken < 0 || stopToken < 0) {
      return NULL_OFFSET;
    }
    // NOTE: when startToken == stopToken, the token is returned without
    // consulting the map; downstream lookups still yield NULL_OFFSET/NULL_TERM
    // if nothing was recorded there.
    if (startToken == stopToken) {
      return startToken;
    }
    if (startToken < stopToken) {
      for (int i = startToken; i <= stopToken; i++) {
        Integer charOffset = map.get(i);
        if (charOffset != null && charOffset != NULL_OFFSET) {
          return i;
        }
      }
    } else {
      // descending range: search downward
      for (int i = startToken; i >= stopToken; i--) {
        Integer charOffset = map.get(i);
        if (charOffset != null && charOffset != NULL_OFFSET) {
          return i;
        }
      }
    }
    return NULL_OFFSET;
  }

  /**
   * @return start char offset of the closest found token in the range, or
   *         {@link #NULL_OFFSET}
   */
  public int getClosestCharStart(int startToken, int stopToken) {
    // getCharacterOffsetStart already maps "token not found" to NULL_OFFSET
    return getCharacterOffsetStart(getClosestToken(startToken, stopToken, starts));
  }

  /**
   * @return end char offset of the closest found token in the range, or
   *         {@link #NULL_OFFSET}
   */
  public int getClosestCharEnd(int startToken, int stopToken) {
    return getCharacterOffsetEnd(getClosestToken(startToken, stopToken, ends));
  }

  /**
   * @return term of the closest found token in the range, or {@link #NULL_TERM}
   */
  protected String getClosestTerm(int startToken, int stopToken) {
    // the starts map decides which token counts as "closest"
    return getTerm(getClosestToken(startToken, stopToken, starts));
  }

  protected String debugToString() {
    StringBuilder sb = new StringBuilder();
    for (Map.Entry<Integer, String> e : terms.entrySet()) {
      sb.append(e.getKey()).append(" : ").append(e.getValue()).append(" : ")
          .append(starts.get(e.getKey())).append(" : ")
          .append(ends.get(e.getKey())).append("\n");
    }
    return sb.toString();
  }

  protected BitSet getSet() {
    return set;
  }

  /**
   * Remove all information recorded for a token offset, recomputing
   * {@link #getLast()} if the removed token was the last one.
   *
   * @param token token offset to remove
   */
  public void remove(int token) {
    if (token == last) {
      // closest remaining token at or below last - 1 (NULL_OFFSET if none)
      last = getClosestToken(last - 1, 0, starts);
    }
    set.clear(token);
    terms.remove(token);
    starts.remove(token);
    ends.remove(token);
  }
}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java
new file mode 100644
index 000000000000..b1f53e0dd3e4
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+
+/**
+ * TokenCharOffsetsReader that captures character offsets by reanalyzing a
+ * field.
+ */
+public class ReanalyzingTokenCharOffsetsReader implements
+ TokenCharOffsetsReader {
+
+ private final static int GOT_ALL_REQUESTS = -2;
+ private Analyzer baseAnalyzer;
+
+ /**
+ * Constructor
+ *
+ * @param analyzer to use to get character offsets
+ */
+ public ReanalyzingTokenCharOffsetsReader(Analyzer analyzer) {
+ this.baseAnalyzer = analyzer;
+ }
+
+ @Override
+ public void getTokenCharOffsetResults(final Document d,
+ final String fieldName, final TokenCharOffsetRequests requests,
+ final RandomAccessCharOffsetContainer results) throws IOException {
+
+ int fieldIndex = 0;
+ int currPosInc = -1;
+ int posIncrementGap = baseAnalyzer.getPositionIncrementGap(fieldName);
+ int charOffsetGap = baseAnalyzer.getOffsetGap(fieldName);
+ int charBase = 0;
+ for (String fieldValue : d.getValues(fieldName)) {
+
+ currPosInc = addFieldValue(fieldName, currPosInc, charBase, fieldValue, requests,
+ results);
+
+ if (currPosInc == GOT_ALL_REQUESTS) {
+ break;
+ }
+ charBase += fieldValue.length() + charOffsetGap;
+ currPosInc += posIncrementGap;
+ fieldIndex++;
+ }
+
+ }
+
+ private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
+ TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results)
+ throws IOException {
+ //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
+ TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
+ stream.reset();
+
+ int defaultInc = 1;
+
+ CharTermAttribute termAtt = stream
+ .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
+ OffsetAttribute offsetAtt = stream
+ .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
+ PositionIncrementAttribute incAtt = null;
+ if (stream
+ .hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
+ incAtt = stream
+ .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
+ }
+
+ while (stream.incrementToken()) {
+
+ currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
+ if (requests.contains(currInd)) {
+ results.add(currInd, offsetAtt.startOffset() + charBase,
+ offsetAtt.endOffset() + charBase, termAtt.toString());
+ }
+ if (currInd > requests.getLast()) {
+ // TODO: Is there a way to avoid this? Or, is this
+ // an imaginary performance hit?
+ while (stream.incrementToken()) {
+ //NO-OP
+ }
+ stream.end();
+ stream.close();
+ return GOT_ALL_REQUESTS;
+ }
+ }
+ stream.end();
+ stream.close();
+ return currInd;
+ }
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SimpleAnalyzerUtil.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SimpleAnalyzerUtil.java
new file mode 100644
index 000000000000..dbc0a01d54ea
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SimpleAnalyzerUtil.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+/**
+ * Simple util class for Analyzers
+ */
+public class SimpleAnalyzerUtil {
+ private final static String DEFAULT_FIELD = "FIELD";
+
+ /**
+ *
+ * @param s string to analyze
+ * @param field field to analyze
+ * @param analyzer analyzer to use
+ * @return list of analyzed terms
+ * @throws IOException if there's an IOException during analysis
+ */
+ public static List getTermStrings(String s, String field, Analyzer analyzer)
+ throws IOException {
+ List terms = new ArrayList<>();
+ return getTermStrings(s, field, analyzer, terms);
+ }
+
+ /**
+ * allows reuse of terms, this method calls terms.clear() before adding new
+ * terms
+ *
+ * @param s string to analyze
+ * @param field to use in analysis
+ * @param analyzer analyzer
+ * @param terms list for reuse
+ * @return list of strings
+ * @throws IOException if there's an IOException during analysis
+ */
+ public static List getTermStrings(String s, String field, Analyzer analyzer,
+ List terms) throws IOException {
+ if (terms == null) {
+ terms = new ArrayList<>();
+ }
+ terms.clear();
+ TokenStream stream = analyzer.tokenStream(field, s);
+ stream.reset();
+ CharTermAttribute termAtt = stream
+ .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
+
+ while (stream.incrementToken()) {
+ terms.add(termAtt.toString());
+ }
+ stream.end();
+ stream.close();
+
+ return terms;
+ }
+
+ /**
+ * This calculates a substring from an array of StorableFields.
+ *
+ * This attempts to do the best job possible, and at worst will
+ * return an empty string. If the start or end is within a gap,
+ * or before 0 or after the total number of characters, this will
+ * gracefully (blithely?) handle those cases.
+ *
+ * @param start character offset to start
+ * @param end character offset to end
+ * @param fieldValues array of Strings to process
+ * @param offsetGap offsetGap as typically returned by Analyzer's .getOffsetGap()
+ * @param interFieldJoiner string to use to mark that a substring goes beyond a single
+ * field entry
+ * @return substring, potentially empty, never null.
+ */
+ public static String substringFromMultiValuedFields(int start,
+ int end, String[] fieldValues, int offsetGap, String interFieldJoiner) {
+ start = (start < 0) ? 0 : start;
+ end = (end < 0) ? 0 : end;
+
+ if (start > end) {
+ start = end;
+ }
+
+ int charBase = 0;
+ StringBuilder sb = new StringBuilder();
+ int lastFieldIndex = 0;
+ int localStart = 0;
+ boolean foundStart = false;
+ //get start
+ for (int fieldIndex = 0; fieldIndex < fieldValues.length; fieldIndex++) {
+ String fString = fieldValues[fieldIndex];
+ if (start < charBase + fString.length()) {
+ localStart = start - charBase;
+ lastFieldIndex = fieldIndex;
+ foundStart = true;
+ break;
+ }
+ charBase += fString.length() + offsetGap;
+ }
+ if (foundStart == false) {
+ return "";
+ }
+ //if start occurred in a gap, reset localStart to 0
+ if (localStart < 0) {
+ sb.append(interFieldJoiner);
+ localStart = 0;
+ }
+ //now append and look for end
+ for (int fieldIndex = lastFieldIndex; fieldIndex < fieldValues.length; fieldIndex++) {
+ String fString = fieldValues[fieldIndex];
+
+ if (end <= charBase + fString.length()) {
+ int localEnd = end - charBase;
+ //must be in gap
+ if (charBase > end) {
+ return sb.toString();
+ }
+ if (fieldIndex != lastFieldIndex) {
+ sb.append(interFieldJoiner);
+ }
+ sb.append(fString.substring(localStart, localEnd));
+ break;
+ } else {
+ if (fieldIndex != lastFieldIndex) {
+ sb.append(interFieldJoiner);
+ }
+ sb.append(fString.substring(localStart));
+ localStart = 0;
+ }
+ charBase += fString.length() + offsetGap;
+ }
+ return sb.toString();
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SpansCrawler.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SpansCrawler.java
new file mode 100644
index 000000000000..ecbb9a477ab1
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SpansCrawler.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanWeight;
+import org.apache.lucene.search.spans.Spans;
+
+
/**
 * Utility class to crawl spans: runs a SpanQuery leaf-by-leaf and hands each
 * matching document's span positions to a DocTokenOffsetsVisitor.
 */
public class SpansCrawler {

  /**
   * Crawls all spans of query over the searcher's index, optionally
   * intersecting with filter, visiting each matching document once. Crawling
   * stops early when the visitor returns false.
   *
   * @param query span query to use
   * @param filter filter query; may be null, in which case every span match is visited
   * @param searcher searcher
   * @param visitor visitor to call for each span
   * @throws IOException on IOException
   * @throws TargetTokenNotFoundException if the visitor can't find the target token
   */
  public static void crawl(SpanQuery query, Query filter, IndexSearcher searcher,
      DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException {

    query = (SpanQuery) query.rewrite(searcher.getIndexReader());

    // NOTE(review): createWeight(searcher, needsScores, boost) is a
    // Lucene-version-specific signature -- confirm against the Lucene version in use
    SpanWeight w = query.createWeight(searcher, false, 1.0f);
    if (filter == null) {
      for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {

        Spans spans = w.getSpans(ctx, SpanWeight.Postings.POSITIONS);
        if (spans == null) {
          // no matches in this segment
          continue;
        }
        boolean cont = visitLeafReader(ctx, spans, visitor);
        if (!cont) {
          // visitor requested an early stop (e.g. enough windows collected)
          break;
        }
      }
    } else {
      filter = searcher.rewrite(filter);
      Weight searcherWeight = searcher.createWeight(filter, false, 1.0f);
      for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
        Scorer leafReaderContextScorer = searcherWeight.scorer(ctx);
        if (leafReaderContextScorer == null) {
          // filter matches nothing in this segment
          continue;
        }
        //Can we tell from the scorer that there were no hits?
        //in <= 5.x we could stop here if the filter query had no hits.

        Spans spans = w.getSpans(ctx, SpanWeight.Postings.POSITIONS);
        if (spans == null) {
          continue;
        }
        DocIdSetIterator filterItr = leafReaderContextScorer.iterator();

        if (filterItr == null || filterItr.equals(DocIdSetIterator.empty())) {
          continue;
        }
        boolean cont = visitLeafReader(ctx, spans, filterItr, visitor);
        if (!cont) {
          break;
        }
      }
    }
  }

  /**
   * Walks spans and the filter iterator in lockstep, visiting only documents
   * present in both.
   *
   * @return false if the visitor requested a stop, true when the leaf is exhausted
   */
  static boolean visitLeafReader(LeafReaderContext leafCtx,
      Spans spans, DocIdSetIterator filterItr, DocTokenOffsetsVisitor visitor)
      throws IOException, TargetTokenNotFoundException {
    int filterDoc = -1;
    int spansDoc = spans.nextDoc();
    while (true) {
      if (spansDoc == DocIdSetIterator.NO_MORE_DOCS) {
        break;
      }
      filterDoc = filterItr.advance(spansDoc);
      if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) {
        break;
      } else if (filterDoc > spansDoc) {
        // filter skipped ahead of spans: pull spans forward until it reaches
        // (or passes) the filter's current doc
        while (spansDoc <= filterDoc) {
          spansDoc = spans.nextDoc();
          if (spansDoc == filterDoc) {
            boolean cont = visit(leafCtx, spans, visitor);
            if (! cont) {
              return false;
            }

          } else {
            // not aligned yet; if spansDoc jumped past filterDoc (or hit
            // NO_MORE_DOCS) the loop condition exits and the outer loop
            // re-advances the filter
            continue;
          }
        }
      } else if (filterDoc == spansDoc) {
        boolean cont = visit(leafCtx, spans, visitor);
        if (! cont) {
          return false;
        }
        //then iterate spans
        spansDoc = spans.nextDoc();
      } else if (filterDoc < spansDoc) {
        // advance(target) must return a doc >= target, so this branch should
        // be unreachable; kept as a sanity check
        throw new IllegalArgumentException("FILTER doc is < spansdoc!!!");
      } else {
        throw new IllegalArgumentException("Something horrible happened");
      }
    }
    return true;
  }

  /**
   * Unfiltered variant: visits every document that has at least one span.
   *
   * @return false if the visitor requested a stop
   */
  static boolean visitLeafReader(LeafReaderContext leafCtx,
      Spans spans,
      DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException {
    while (spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      boolean cont = visit(leafCtx, spans, visitor);
      if (! cont) {
        return false;
      }
    }
    return true;
  }


  /**
   * Loads the current document (only the visitor's fields), collects all span
   * start/end positions for it, and hands the result to the visitor.
   *
   * @return the visitor's verdict: false to stop crawling
   */
  static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException {
    Document document = leafCtx.reader().document(spans.docID(), visitor.getFields());
    DocTokenOffsets offsets = visitor.getDocTokenOffsets();
    // docBase converts the leaf-local docID to an index-wide docID
    offsets.reset(leafCtx.docBase, spans.docID(), document);
    while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
      offsets.addOffset(spans.startPosition(), spans.endPosition());
    }
    return visitor.visit(offsets);
  }

}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TargetTokenNotFoundException.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TargetTokenNotFoundException.java
new file mode 100644
index 000000000000..a63ff775cb82
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TargetTokenNotFoundException.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
/**
 * Thrown when a token offset identified by .getSpans() is not found in the
 * TokenCharOffsetResults. The typical cause is a mismatch between the
 * analyzers used at index time and at search time. When this happens,
 * something very bad has happened and it deserves its own exception.
 */
public class TargetTokenNotFoundException extends Exception {

  private static final long serialVersionUID = 1L;

  /**
   * @param message detail message describing which token could not be found
   */
  public TargetTokenNotFoundException(String message) {
    super(message);
  }
}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetRequests.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetRequests.java
new file mode 100644
index 000000000000..1e87c275b99c
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetRequests.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
+import java.util.BitSet;
+
/**
 * Util class used to specify the tokens for which character offsets are requested.
 */
public class TokenCharOffsetRequests {

  // token offsets whose character offsets have been requested
  private final BitSet requested = new BitSet();
  // highest token offset requested so far; -1 when no requests exist
  private int maxRequested = -1;

  /**
   * Is a specific token requested?
   *
   * @param i token number to test
   * @return whether or not this token is requested
   */
  public boolean contains(int i) {
    return requested.get(i);
  }

  /**
   * add a request for every token offset from start to end (both inclusive);
   * a descending range is a no-op
   *
   * @param start first token offset to request (inclusive)
   * @param end last token offset to request (inclusive)
   */
  public void add(int start, int end) {
    for (int tokenOffset = start; tokenOffset <= end; tokenOffset++) {
      add(tokenOffset);
    }
  }

  /**
   * add a request for a specific token
   *
   * @param i token offset to request the character offsets for
   */
  public void add(int i) {
    requested.set(i);
    maxRequested = Math.max(maxRequested, i);
  }

  /**
   * clear the state of this request object for reuse
   */
  public void clear() {
    requested.clear();
    maxRequested = -1;
  }

  /**
   * @return greatest/last token offset in the request, -1 if empty
   */
  public int getLast() {
    return maxRequested;
  }

  /**
   * @return the set of tokens whose character offsets are requested
   */
  protected BitSet getSet() {
    return requested;
  }
}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetsReader.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetsReader.java
new file mode 100644
index 000000000000..9b81e47f9da4
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetsReader.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.charoffsets;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+
+
/**
 * Interface to allow flexibility/optimizations in returning character offsets
 * for tokens
 */
public interface TokenCharOffsetsReader {

  /**
   * Looks up character offsets (and terms) for the requested token offsets in
   * fieldName of document, recording what was found into results.
   *
   * @param document document whose field values are examined
   * @param fieldName name of the field to read
   * @param requests the token offsets whose character offsets are wanted
   * @param results container into which term/offset information is recorded
   * @throws IOException if reading or analyzing the field fails
   */
  public void getTokenCharOffsetResults(final Document document,
      final String fieldName, final TokenCharOffsetRequests requests,
      final RandomAccessCharOffsetContainer results) throws IOException;

}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/AbstractConcordanceWindowCollector.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/AbstractConcordanceWindowCollector.java
new file mode 100644
index 000000000000..c055adeb7344
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/AbstractConcordanceWindowCollector.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Abstract class to handle basic information for a ConcordanceWindowSearcher
+ */
+public abstract class AbstractConcordanceWindowCollector {
+ //value to use if all windows should be collected
+ public static final int COLLECT_ALL = -1;
+
+ private final ConcordanceSorter sorter = new ConcordanceSorter();
+ private final int maxWindows;
+ private Set docIds = new HashSet();
+ private boolean hitMax = false;
+ private long totalDocs = 0;
+
+ /**
+ * @param maxWindows maximum windows to collect
+ */
+ public AbstractConcordanceWindowCollector(int maxWindows) {
+ this.maxWindows = maxWindows;
+ }
+
+ /**
+ * Collect/process this window
+ *
+ * @param w window to be processed
+ */
+ public abstract void collect(ConcordanceWindow w);
+
+ /**
+ * @return number of windows collected
+ */
+ public abstract int size();
+
+ /**
+ * @return collected windows (unsorted)
+ */
+ public abstract List getWindows();
+
+ /**
+ * @param docId unique key for a document
+ */
+ public void addDocId(String docId) {
+ docIds.add(docId);
+ }
+
+ /**
+ * Sort according to {@link #sorter} and return windows
+ *
+ * @return sorted list of windows
+ */
+ public List getSortedWindows() {
+ List windows = getWindows();
+ Collections.sort(windows, sorter);
+ return windows;
+ }
+
+ /**
+ * @return whether or not the searcher collected the maximum number of
+ * windows and stopped early.
+ */
+ public boolean getHitMax() {
+ return hitMax;
+ }
+
+ /**
+ * @param hitMax did the searcher collect the maximum number of windows
+ * and stop early
+ */
+ public void setHitMax(boolean hitMax) {
+ this.hitMax = hitMax;
+ }
+
+ /**
+ * @return the maximum number of windows to collect.
+ * Can be equal to {@link #COLLECT_ALL}
+ */
+ public int getMaxWindows() {
+ return maxWindows;
+ }
+
+ /**
+ * @param totalDocs add this value to {@link #totalDocs}
+ */
+ public void incrementTotalDocs(long totalDocs) {
+ this.totalDocs += totalDocs;
+ }
+
+ /**
+ * @return total number of documents in all indices
+ */
+ public long getTotalDocs() {
+ return totalDocs;
+ }
+
+ /**
+ * @param totalDocs see {@link #getTotalDocs()}
+ */
+ public void setTotalDocs(long totalDocs) {
+ this.totalDocs = totalDocs;
+ }
+
+ /**
+ * @return number of windows in results
+ */
+ public int getNumWindows() {
+ List windows = getWindows();
+ if (windows != null) {
+ return windows.size();
+ }
+ return 0;
+ }
+
+ /**
+ * @return number of documents in results
+ */
+ public int getNumDocs() {
+ return docIds.size();
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSearcher.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSearcher.java
new file mode 100644
index 000000000000..803df2dc6681
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSearcher.java
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.concordance.charoffsets.DocTokenOffsets;
+import org.apache.lucene.concordance.charoffsets.DocTokenOffsetsVisitor;
+import org.apache.lucene.concordance.charoffsets.OffsetLengthStartComparator;
+import org.apache.lucene.concordance.charoffsets.OffsetUtil;
+import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer;
+import org.apache.lucene.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader;
+import org.apache.lucene.concordance.charoffsets.SpansCrawler;
+import org.apache.lucene.concordance.charoffsets.TargetTokenNotFoundException;
+import org.apache.lucene.concordance.charoffsets.TokenCharOffsetRequests;
+import org.apache.lucene.concordance.charoffsets.TokenCharOffsetsReader;
+import org.apache.lucene.concordance.util.ConcordanceSearcherUtil;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SimpleSpanQueryConverter;
+import org.apache.lucene.search.spans.SpanQuery;
+
+
+/**
+ * Searches an IndexReader and returns a list of ConcordanceWindows.
+ */
+public class ConcordanceSearcher {
+
+  /**
+   * Allow overlapping targets in hits, default = false
+   */
+  private boolean allowTargetOverlaps = false;
+
+  private WindowBuilder windowBuilder;
+
+  private SimpleSpanQueryConverter spanQueryConverter;
+
+  /**
+   * Constructor with default WindowBuilder and SimpleSpanQueryConverter
+   */
+  public ConcordanceSearcher() {
+    this(new WindowBuilder(), new SimpleSpanQueryConverter());
+  }
+
+  /**
+   * Constructor with a custom WindowBuilder and the default
+   * SimpleSpanQueryConverter.
+   *
+   * @param windowBuilder window builder
+   */
+  public ConcordanceSearcher(WindowBuilder windowBuilder) {
+    this(windowBuilder, new SimpleSpanQueryConverter());
+  }
+
+  /**
+   * Constructor for windowBuilder and converter
+   *
+   * @param windowBuilder windowBuilder to use to build windows
+   * @param converter converter to use to convert Query to SpanQuery
+   */
+  public ConcordanceSearcher(WindowBuilder windowBuilder,
+                             SimpleSpanQueryConverter converter) {
+    this.windowBuilder = windowBuilder;
+    this.spanQueryConverter = converter;
+  }
+
+  /**
+   * @param searcher searcher to search
+   * @param fieldName field to build the windows on
+   * @param mainQuery if SpanQuery, this gets passed through as is. If a regular Query, the
+   *                  Query is first converted to a SpanQuery and the filterQuery is modified
+   *                  to include the original Query.
+   * @param filterQuery optional filter to apply alongside mainQuery; may be null
+   * @param analyzer analyzer to use for (re)calculating character offsets and for normalizing
+   *                 the sort keys
+   * @param collector collector to use for search
+   * @throws TargetTokenNotFoundException if target token is not found
+   * @throws IllegalArgumentException if the field can't be found in the main query
+   * @throws IOException if there is an underlying IOException in the reader
+   */
+  public void search(IndexSearcher searcher, String fieldName, Query mainQuery,
+                     Query filterQuery, Analyzer analyzer,
+                     AbstractConcordanceWindowCollector collector)
+      throws TargetTokenNotFoundException, IllegalArgumentException,
+      IOException {
+    if (mainQuery == null) {
+      return;
+    }
+    if (mainQuery instanceof SpanQuery) {
+      // already a SpanQuery: pass through unchanged
+      searchSpan(searcher, (SpanQuery) mainQuery, filterQuery, analyzer, collector);
+    } else {
+      // convert the regular mainQuery to a SpanQuery for span crawling
+      SpanQuery spanQuery = spanQueryConverter.convert(fieldName, mainQuery);
+
+      // per the contract above, fold the original query into the filter
+      Query updatedFilter = mainQuery;
+
+      if (filterQuery != null) {
+        updatedFilter = new BooleanQuery.Builder()
+            .add(mainQuery, BooleanClause.Occur.MUST)
+            .add(filterQuery, BooleanClause.Occur.FILTER).build();
+      }
+      searchSpan(searcher, spanQuery, updatedFilter, analyzer, collector);
+    }
+  }
+
+  /**
+   * Like
+   * {@link #search(IndexSearcher, String, Query, Query, Analyzer, AbstractConcordanceWindowCollector)}
+   * but this takes a SpanQuery
+   *
+   * @param searcher searcher
+   * @param spanQuery query to use to identify the targets
+   * @param filter filter for document retrieval
+   * @param analyzer to re-analyze terms for window calculations and sort key building
+   * @param collector to process (and store) the results
+   * @throws TargetTokenNotFoundException if target token is not found
+   * @throws IllegalArgumentException if the field can't be found in the main query
+   * @throws IOException if there is an underlying IOException in the reader
+   */
+  public void searchSpan(IndexSearcher searcher,
+                         SpanQuery spanQuery,
+                         Query filter, Analyzer analyzer,
+                         AbstractConcordanceWindowCollector collector)
+      throws TargetTokenNotFoundException, IllegalArgumentException,
+      IOException {
+
+    // load the fields the window builder needs plus the span field itself
+    Set<String> fields = new HashSet<>(
+        windowBuilder.getFieldSelector());
+    fields.add(spanQuery.getField());
+    DocTokenOffsetsVisitor visitor = new ConcDTOffsetVisitor(spanQuery.getField(), analyzer,
+        fields, collector);
+    SpansCrawler.crawl(spanQuery, filter, searcher, visitor);
+
+    collector.setTotalDocs(searcher.getIndexReader().numDocs());
+  }
+
+
+  /**
+   * Spans can overlap: a search for ["ab cd" "ab"] would have
+   * two spans on the string "ab cd" if this is set to true.
+   * If this is set to false, this will return the longest span
+   * that appears earliest in the string if there is overlap.
+   *
+   * @param allowTargetOverlaps are targets allowed to overlap.
+   */
+  public void setAllowTargetOverlaps(boolean allowTargetOverlaps) {
+    this.allowTargetOverlaps = allowTargetOverlaps;
+  }
+
+  // Builds and throws a diagnostic listing the fields that WERE loaded,
+  // to help users discover a misconfigured content field name.
+  private void throwMissingField(Document document) throws IllegalArgumentException {
+    StringBuilder sb = new StringBuilder();
+    sb.append("Did you forget to load or specify the correct content field?!");
+    sb.append("\n");
+    sb.append("I only see these fields:\n");
+    for (IndexableField f : document.getFields()) {
+      sb.append(f.name()).append("\n");
+    }
+    throw new IllegalArgumentException(sb.toString());
+  }
+
+  /**
+   * Set the converter to use to convert a Query to a SpanQuery.
+   * The need for this will go away when LUCENE-2878 is completed.
+   *
+   * @param converter converter to use to convert queries into SpanQueries
+   */
+  public void setSpanQueryConverter(SimpleSpanQueryConverter converter) {
+    this.spanQueryConverter = converter;
+  }
+
+  /**
+   * Visitor that turns each document's span hits into ConcordanceWindows
+   * and feeds them to the collector.
+   */
+  class ConcDTOffsetVisitor implements DocTokenOffsetsVisitor {
+    final Set<String> fields;
+    final DocTokenOffsets docTokenOffsets = new DocTokenOffsets();
+    final Analyzer analyzer;
+    final String fieldName;
+    final AbstractConcordanceWindowCollector collector;
+    TokenCharOffsetRequests requests = new TokenCharOffsetRequests();
+
+    TokenCharOffsetsReader tokenOffsetsRecordReader;
+
+    RandomAccessCharOffsetContainer offsetResults = new RandomAccessCharOffsetContainer();
+    OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator();
+
+    ConcDTOffsetVisitor(String fieldName, Analyzer analyzer, Set<String> fields,
+                        AbstractConcordanceWindowCollector collector) {
+      this.fieldName = fieldName;
+      this.analyzer = analyzer;
+      this.fields = fields;
+      this.collector = collector;
+      tokenOffsetsRecordReader = new ReanalyzingTokenCharOffsetsReader(analyzer);
+    }
+
+    @Override
+    public DocTokenOffsets getDocTokenOffsets() {
+      return docTokenOffsets;
+    }
+
+    @Override
+    public Set<String> getFields() {
+      return fields;
+    }
+
+    @Override
+    public boolean visit(DocTokenOffsets docTokenOffsets) throws IOException {
+      Document document = docTokenOffsets.getDocument();
+
+      String[] fieldValues = document.getValues(fieldName);
+
+      if (fieldValues == null || fieldValues.length == 0) {
+        throwMissingField(document);
+      }
+      Map<String, String> metadata = windowBuilder.extractMetadata(document);
+      String docId = windowBuilder.getUniqueDocumentId(document, docTokenOffsets.getUniqueDocId());
+
+      List<OffsetAttribute> tokenOffsets = docTokenOffsets.getOffsets();
+      if (!allowTargetOverlaps) {
+        // remove overlapping hits!!!
+        tokenOffsets = OffsetUtil.removeOverlapsAndSort(tokenOffsets,
+            offsetLengthStartComparator, null);
+      }
+
+      //clear then get new requests
+      requests.clear();
+      ConcordanceSearcherUtil.getCharOffsetRequests(tokenOffsets,
+          windowBuilder.getTokensBefore(), windowBuilder.getTokensAfter(), requests);
+
+      offsetResults.clear();
+
+      tokenOffsetsRecordReader.getTokenCharOffsetResults(
+          document, fieldName, requests, offsetResults);
+
+      for (OffsetAttribute offset : tokenOffsets) {
+        try {
+          // endOffset() is exclusive; buildConcordanceWindow takes an
+          // inclusive end token, hence the -1
+          ConcordanceWindow w = windowBuilder.buildConcordanceWindow(
+              docId, offset.startOffset(),
+              offset.endOffset() - 1, fieldValues,
+              offsetResults, metadata);
+          collector.collect(w);
+        } catch (TargetTokenNotFoundException e) {
+          throw new IllegalArgumentException(e);
+        }
+        if (collector.getHitMax()) {
+          // collector reached its cap; stop crawling
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortKey.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortKey.java
new file mode 100644
index 000000000000..2ffa02a8bf5f
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortKey.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+/**
+ * Simple comparable class to allow for subclassing.
+ */
+public class ConcordanceSortKey implements Comparable<ConcordanceSortKey> {
+
+  // string this key sorts on; compareTo/hashCode/toString assume non-null
+  private final String concSortString;
+
+  /**
+   * @param s string to sort on
+   */
+  public ConcordanceSortKey(String s) {
+    this.concSortString = s;
+  }
+
+  @Override
+  public int compareTo(ConcordanceSortKey other) {
+    return concSortString.compareTo(other.concSortString);
+  }
+
+  @Override
+  public int hashCode() {
+    return concSortString.hashCode();
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    // also rejects null
+    if (!(obj instanceof ConcordanceSortKey)) {
+      return false;
+    }
+    ConcordanceSortKey other = (ConcordanceSortKey) obj;
+    return (concSortString == null)
+        ? other.concSortString == null
+        : concSortString.equals(other.concSortString);
+  }
+
+  @Override
+  public String toString() {
+    return concSortString;
+  }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortOrder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortOrder.java
new file mode 100644
index 000000000000..2e7fa757e89e
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortOrder.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+/**
+ * Options for sorting ConcordanceWindows
+ */
+public enum ConcordanceSortOrder {
+  /** sort on the first token before the target, then the second word, etc. */
+  PRE,
+  /** sort on words after the target */
+  POST,
+  /** sort on the target and then words before the target */
+  TARGET_PRE,
+  /** sort on the target and then words after the target */
+  TARGET_POST,
+  /** sort on a string representing a doc id and then by target char offset within the document */
+  DOC,
+  /** no sort */
+  NONE
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSorter.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSorter.java
new file mode 100644
index 000000000000..6dee6fb075cd
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSorter.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.Comparator;
+
+
+/**
+ * Orders {@link ConcordanceWindow}s by their {@link ConcordanceSortKey}.
+ */
+public class ConcordanceSorter implements Comparator<ConcordanceWindow> {
+
+  @Override
+  public int compare(ConcordanceWindow w1, ConcordanceWindow w2) {
+    return w1.getSortKey().compareTo(w2.getSortKey());
+  }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceWindow.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceWindow.java
new file mode 100644
index 000000000000..863adfa6cfed
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceWindow.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.Map;
+
+/**
+ * Key element in a concordance view of data. A window consists of the words
+ * before a target term (pre), the target term and then the words after the
+ * target term (post). A window also has a sort key to allow for various methods
+ * of sorting.
+ *
+ * For various applications, it has also been useful to store a unique document key,
+ * character offset (start and end) of the full
+ * window as well as metadata from the document for the given window.
+ *
+ * This class is experimental and may change in incompatible ways in the future.
+ */
+public class ConcordanceWindow {
+
+ private final ConcordanceSortKey sortKey;
+ private final String pre;
+ private final String target;
+ private final String post;
+ private final int charStart;
+ private final int charEnd;
+ private final String uniqueDocID;
+ //used by hide duplicates to count more than one occurrence of a window
+ private int count = 1;
+ private Map metadata;
+
+ /**
+ * @param uniqueDocID string representing what should be a unique document identifier
+ * @param charStart character offset start for the window
+ * @param charEnd character offset end for the window
+ * @param pre words before the target in reading order and unanalyzed
+ * @param target target string
+ * @param post string after the target in reading order and unanalyzed
+ * @param sortKey key to use for sorting this window
+ * @param metadata metadata to store with this window
+ */
+ public ConcordanceWindow(String uniqueDocID, int charStart, int charEnd, String pre,
+ String target, String post, ConcordanceSortKey sortKey, Map metadata) {
+ this.pre = pre;
+ this.target = target;
+ this.post = post;
+ this.uniqueDocID = uniqueDocID;
+ this.charStart = charStart;
+ this.charEnd = charEnd;
+ this.metadata = metadata;
+ this.sortKey = sortKey;
+ }
+
+ public String getUniqueDocID() {
+ return uniqueDocID;
+ }
+
+ public int getStart() {
+ return charStart;
+ }
+
+ public int getEnd() {
+ return charEnd;
+ }
+
+ public Map getMetadata() {
+ return metadata;
+ }
+
+ public String getPre() {
+ return pre;
+ }
+
+ public String getPost() {
+ return post;
+ }
+
+ public String getTarget() {
+ return target;
+ }
+
+ public int getCount() {
+ return count;
+ }
+
+ public void setCount(int count) {
+ this.count = count;
+ }
+
+ public void incrementCount() {
+ count++;
+ }
+
+ public int getSize() {
+ int size = 0;
+ if (pre != null) {
+ size += pre.length();
+ }
+ if (target != null) {
+ size += target.length();
+ }
+ if (post != null) {
+ size += post.length();
+ }
+ return size;
+ }
+
+ public ConcordanceSortKey getSortKey() {
+ return sortKey;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((post == null) ? 0 : post.hashCode());
+ result = prime * result + ((pre == null) ? 0 : pre.hashCode());
+ result = prime * result + ((target == null) ? 0 : target.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (!(obj instanceof ConcordanceWindow)) {
+ return false;
+ }
+ ConcordanceWindow other = (ConcordanceWindow) obj;
+ if (post == null) {
+ if (other.post != null) {
+ return false;
+ }
+ } else if (!post.equals(other.post)) {
+ return false;
+ }
+ if (pre == null) {
+ if (other.pre != null) {
+ return false;
+ }
+ } else if (!pre.equals(other.pre)) {
+ return false;
+ }
+ if (target == null) {
+ if (other.target != null) {
+ return false;
+ }
+ } else if (!target.equals(other.target)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(pre).append(">>>").append(target).append("<<<").append(post);
+ return sb.toString();
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocIdBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocIdBuilder.java
new file mode 100644
index 000000000000..9dbfce7f2738
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocIdBuilder.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+
+
+/**
+ * Returns a unique string for each document.
+ * Some implementations may be able to rely only
+ * on the ephemeral Lucene docId. Others, may
+ * want to use a field within the document.
+ */
+public interface DocIdBuilder {
+
+  /**
+   * @return fields that must be loaded on the document for
+   * {@link #build(Document, long)} to work
+   */
+  Set<String> getFields();
+
+  /**
+   * @param document document to build an id for, with the fields from
+   * {@link #getFields()} loaded
+   * @param docId ephemeral Lucene document id
+   * @return unique id for the document
+   */
+  String build(Document document, long docId);
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocMetadataExtractor.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocMetadataExtractor.java
new file mode 100644
index 000000000000..e16c50ec3469
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocMetadataExtractor.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+
+/**
+ * Simple interface for a component that extracts metadata from
+ * a document to be stored with a ConcordanceWindow
+ */
+public interface DocMetadataExtractor {
+
+  /**
+   * @return the fields that need to be retrieved for the document
+   * for proper processing
+   */
+  Set<String> getFieldSelector();
+
+  /**
+   * @param document to be processed for metadata. Only those fields
+   * that were returned by {@link #getFieldSelector()} will be loaded
+   * in the document
+   * @return document metadata to be stored with each window
+   */
+  Map<String, String> extract(Document document);
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/SortKeyBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/SortKeyBuilder.java
new file mode 100644
index 000000000000..c05c14879549
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/SortKeyBuilder.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.Map;
+
+import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer;
+
+/**
+ * Builds a {@link ConcordanceSortKey} for a window.
+ */
+public interface SortKeyBuilder {
+
+  /**
+   * Builds a sort key from the classic TokenCharOffsetResults object
+   *
+   * @param docKey to be used if sorting by document key
+   * @param startTargetTokenOffset start target token offset
+   * @param endTargetTokenOffset end target token offset
+   * @param charOffsets charoffsets
+   * @param numTokensPre number of tokens before
+   * @param numTokensPost number of tokens after
+   * @param metadata metadata
+   * @return ConcordanceSortKey
+   */
+  ConcordanceSortKey buildKey(String docKey,
+      int startTargetTokenOffset, int endTargetTokenOffset,
+      RandomAccessCharOffsetContainer charOffsets,
+      int numTokensPre, int numTokensPost, Map<String, String> metadata);
+
+  /** @return whether this builder needs the pre-target text analyzed */
+  boolean requiresAnalysisOfPre();
+
+  /** @return whether this builder needs the post-target text analyzed */
+  boolean requiresAnalysisOfPost();
+
+  /** @return whether this builder needs the target text analyzed */
+  boolean requiresAnalysisOfTarget();
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/WindowBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/WindowBuilder.java
new file mode 100644
index 000000000000..d8a4fb5d6b7d
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/WindowBuilder.java
@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer;
+import org.apache.lucene.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.concordance.charoffsets.TargetTokenNotFoundException;
+import org.apache.lucene.concordance.classic.impl.DefaultSortKeyBuilder;
+import org.apache.lucene.concordance.classic.impl.FieldBasedDocIdBuilder;
+import org.apache.lucene.concordance.classic.impl.IndexIdDocIdBuilder;
+import org.apache.lucene.concordance.classic.impl.SimpleDocMetadataExtractor;
+
+
+/**
+ * Builds a ConcordanceWindow.
+ *
+ * This class includes basic functionality for building a window from token offsets.
+ *
+ * It also calls three other components:
+ *
+ * - DocIdBuilder - extracts or builds a unique key for each document
+ * - DocMetadataExtractor - extracts metadata from a document to be stored with each window
+ * - SortKeyBuilder - builds a window's sort key
+ *
+ */
+public class WindowBuilder {
+
+ private final static String EMPTY_STRING = "";
+ private static String INTER_MULTIVALUE_FIELD_PADDING = " | ";
+ private final int tokensBefore;
+ private final int tokensAfter;
+ private final SortKeyBuilder sortKeyBuilder;
+ private final DocMetadataExtractor metadataExtractor;
+ private final DocIdBuilder docIdBuilder;
+ private final int offsetGap;
+
+ public WindowBuilder() {
+ this(
+ 10, //tokens before
+ 10, //tokens after
+ 0,
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE),
+ new SimpleDocMetadataExtractor(),
+ new IndexIdDocIdBuilder()
+ );
+ }
+
+ public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap) {
+ this(
+ tokensBefore,
+ tokensAfter,
+ offsetGap,
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE),
+ new SimpleDocMetadataExtractor(),
+ new IndexIdDocIdBuilder()
+ );
+ }
+
+ public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap, SortKeyBuilder sortKeyBuilder,
+ DocMetadataExtractor metadataExtractor, DocIdBuilder docIdBuilder) {
+ this.tokensBefore = tokensBefore;
+ this.tokensAfter = tokensAfter;
+ this.offsetGap = offsetGap;
+ this.sortKeyBuilder = sortKeyBuilder;
+ this.metadataExtractor = metadataExtractor;
+ this.docIdBuilder = docIdBuilder;
+ }
+
+
+ /**
+ *
+ * Makes the assumption that the target token start and target token end can
+ * be found. If not, this returns a null.
+ *
+ * @param uniqueDocID ephemeral internal lucene unique document id
+ * @param targetTokenStart Target's start token
+ * @param targetTokenEnd Target's end token
+ * @param fieldValues field values
+ * @param metadata Metadata to be stored with the window
+ * @param offsets TokenOffsetResults from
+ * @return ConcordanceWindow or null if character offset information cannot be
+ * found for both the targetTokenStart and the targetTokenEnd
+
+ * @throws TargetTokenNotFoundException if target token cannot be found
+ * @throws IllegalArgumentException if the start token comes after the end token, e.g.
+ */
+ public ConcordanceWindow buildConcordanceWindow(String uniqueDocID,
+ int targetTokenStart, int targetTokenEnd,
+ String[] fieldValues,
+ RandomAccessCharOffsetContainer offsets,
+ Map metadata)
+ throws TargetTokenNotFoundException,
+ IllegalArgumentException {
+
+ if (targetTokenStart < 0 || targetTokenEnd < 0) {
+ throw new IllegalArgumentException(
+ "targetTokenStart and targetTokenEnd must be >= 0");
+ }
+ if (targetTokenEnd < targetTokenStart) {
+ throw new IllegalArgumentException(
+ "targetTokenEnd must be >= targetTokenStart");
+ }
+
+ int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart);
+ int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd);
+
+ if (targetCharStart < 0 ||
+ targetCharEnd < 0) {
+ throw new TargetTokenNotFoundException(
+ "couldn't find character offsets for a target token.\n"
+ + "Check that your analyzers are configured properly.\n");
+ }
+
+ OffsetAttribute preCharOffset = getPreCharOffset(targetTokenStart,
+ targetCharStart, offsets);
+ String preString = (preCharOffset == null) ? EMPTY_STRING :
+ SimpleAnalyzerUtil.substringFromMultiValuedFields(
+ preCharOffset.startOffset(), preCharOffset.endOffset(), fieldValues,
+ offsetGap, INTER_MULTIVALUE_FIELD_PADDING);
+
+ OffsetAttribute postCharOffset = getPostCharOffset(targetTokenEnd,
+ targetCharEnd, offsets);
+
+ String postString = (postCharOffset == null) ? EMPTY_STRING :
+ SimpleAnalyzerUtil.substringFromMultiValuedFields(
+ postCharOffset.startOffset(), postCharOffset.endOffset(), fieldValues,
+ offsetGap, INTER_MULTIVALUE_FIELD_PADDING);
+
+ String targString = SimpleAnalyzerUtil.substringFromMultiValuedFields(
+ targetCharStart, targetCharEnd, fieldValues,
+ offsetGap, INTER_MULTIVALUE_FIELD_PADDING);
+ ConcordanceSortKey sortKey = sortKeyBuilder.buildKey(uniqueDocID,
+ targetTokenStart, targetTokenEnd, offsets, tokensBefore, tokensAfter, metadata);
+ int charStart = (preCharOffset == null) ? targetCharStart :
+ preCharOffset.startOffset();
+
+ int charEnd = (postCharOffset == null) ? targetCharEnd : postCharOffset.endOffset();
+ return new ConcordanceWindow(uniqueDocID, charStart, charEnd, preString, targString,
+ postString, sortKey, metadata);
+
+ }
+
+
+ private OffsetAttribute getPreCharOffset(int targetTokenStart,
+ int targetCharStart,
+ RandomAccessCharOffsetContainer charOffsets) {
+ if (tokensBefore == 0)
+ return null;
+
+ if (targetTokenStart == 0) {
+ return null;
+ }
+ int contextTokenStart = Math.max(0,
+ targetTokenStart - tokensBefore);
+
+ int contextCharStart = charOffsets.getClosestCharStart(contextTokenStart, targetTokenStart);
+ //closest start wasn't actually found
+ //this can happen if there is a large posInc and the target
+ //lands at the start of a field index
+ if (contextCharStart < 0) {
+ return null;
+ }
+ int contextCharEnd = Math.max(contextCharStart, targetCharStart - 1);
+
+ return buildOffsetAttribute(contextCharStart, contextCharEnd);
+ }
+
+ private OffsetAttribute getPostCharOffset(int targetTokenEnd,
+ int targetCharEnd,
+ RandomAccessCharOffsetContainer charOffsets) {
+
+ if (tokensAfter == 0)
+ return null;
+
+ int contextTokenEnd = targetTokenEnd + tokensAfter;
+ int contextCharStart = targetCharEnd;
+ int contextCharEnd = charOffsets.getClosestCharEnd(
+ contextTokenEnd, targetTokenEnd + 1);
+
+ if (contextCharStart >= contextCharEnd) {
+ return null;
+ }
+ return buildOffsetAttribute(contextCharStart, contextCharEnd);
+ }
+
+ private OffsetAttribute buildOffsetAttribute(int start, int end) {
+ OffsetAttribute off = new OffsetAttributeImpl();
+ off.setOffset(start, end);
+ return off;
+ }
+
+
+ public Set getFieldSelector() {
+ Set set = new HashSet<>();
+ set.addAll(metadataExtractor.getFieldSelector());
+ if (docIdBuilder instanceof FieldBasedDocIdBuilder) {
+ set.addAll(((FieldBasedDocIdBuilder) docIdBuilder).getFields());
+ }
+ return set;
+ }
+
+ /**
+ * Simple wrapper around metadataExtractor
+ *
+ * @param document document from which to extract metadata
+ * @return map
+ */
+ public Map extractMetadata(Document document) {
+ return metadataExtractor.extract(document);
+ }
+
+ public String getUniqueDocumentId(Document document, long docId) {
+ return docIdBuilder.build(document, docId);
+ }
+
+ public int getTokensBefore() {
+ return tokensBefore;
+ }
+
+ public int getTokensAfter() {
+ return tokensAfter;
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/ConcordanceWindowCollector.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/ConcordanceWindowCollector.java
new file mode 100644
index 000000000000..6315339ef13a
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/ConcordanceWindowCollector.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.concordance.classic.AbstractConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.ConcordanceWindow;
+
+public class ConcordanceWindowCollector extends AbstractConcordanceWindowCollector {
+
+ private List windows = new ArrayList();
+
+ public ConcordanceWindowCollector(int maxWindows) {
+ super(maxWindows);
+ }
+
+ @Override
+ public void collect(ConcordanceWindow w) {
+ if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL
+ && windows.size() >= getMaxWindows()) {
+ setHitMax(true);
+ return;
+ }
+ windows.add(w);
+ addDocId(w.getUniqueDocID());
+ }
+
+ @Override
+ public int size() {
+ return windows.size();
+ }
+
+ @Override
+ public List getWindows() {
+ return windows;
+ }
+
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DedupingConcordanceWindowCollector.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DedupingConcordanceWindowCollector.java
new file mode 100644
index 000000000000..201ef11a73f8
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DedupingConcordanceWindowCollector.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.concordance.classic.AbstractConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.ConcordanceWindow;
+
+/**
+ * Like ConcordanceWindowCollector, but this collector
+ * doesn't store duplicate windows. Windows are defined as duplicates by
+ * {@link #buildEqualityKey(ConcordanceWindow, StringBuilder)}.
+ */
+public class DedupingConcordanceWindowCollector extends AbstractConcordanceWindowCollector {
+
+ Map map = new HashMap();
+ private StringBuilder sb = new StringBuilder();
+
+ /**
+ * @param maxHits maximum number of windows to store. This could potentially
+ * visit lots more windows than maxHits.
+ */
+ public DedupingConcordanceWindowCollector(int maxHits) {
+ super(maxHits);
+ }
+
+ @Override
+ public void collect(ConcordanceWindow w) {
+ if (getHitMax() == true) {
+ return;
+ }
+ buildEqualityKey(w, sb);
+ String key = sb.toString();
+ ConcordanceWindow oldWindow = map.get(key);
+ if (oldWindow == null) {
+ //we would have added a new window here
+ if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL &&
+ map.size() >= getMaxWindows()) {
+ setHitMax(true);
+ return;
+ }
+ oldWindow = w;
+ } else {
+ //if the old window existed (i.e. new window is a duplicate)
+ //keep incrementing the count
+ oldWindow.incrementCount();
+ }
+
+ map.put(key, oldWindow);
+ addDocId(w.getUniqueDocID());
+ }
+
+
+ /**
+ * number of windows collected
+ */
+ @Override
+ public int size() {
+ return map.size();
+ }
+
+ @Override
+ public List getWindows() {
+ List windows = new ArrayList<>();
+ windows.addAll(map.values());
+ return windows;
+ }
+
+ /**
+ * Public for easy overriding. Generate a key to be used to determine
+ * whether two windows are the same. Some implementations
+ * might want to lowercase, some might want genuine case folding,
+ * some might want to strip non-alphanumerics, etc.
+ *
+ * If you are overriding this, make sure to call sb.setLength(0)!
+ *
+ * @param w ConcordanceWindow
+ * @param sb reuseable StringBuilder; sb.setLength(0) is called before use!
+ */
+ public void buildEqualityKey(ConcordanceWindow w, StringBuilder sb) {
+ sb.setLength(0);
+ sb.append(w.getPre().toLowerCase());
+ sb.append(">>>");
+ sb.append(w.getTarget().toLowerCase());
+ sb.append("<<<");
+ sb.append(w.getPost().toLowerCase());
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DefaultSortKeyBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DefaultSortKeyBuilder.java
new file mode 100644
index 000000000000..4e9026ccca60
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DefaultSortKeyBuilder.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import java.util.Map;
+
+import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer;
+import org.apache.lucene.concordance.classic.ConcordanceSortKey;
+import org.apache.lucene.concordance.classic.ConcordanceSortOrder;
+import org.apache.lucene.concordance.classic.SortKeyBuilder;
+
+/**
+ * Builds basic sort key for the values available in ConcordanceSortOrder
+ */
+public class DefaultSortKeyBuilder implements SortKeyBuilder {
+
+ private final static String SPACE = " ";
+ private final static String EMPTY_STRING = "";
+ //what filler to use when a "term" comes back as null from the
+ //TokenCharOffsetResults
+ private static String NULL_FILLER = "";
+ private final ConcordanceSortOrder sortOrder;
+
+ /**
+ * Calls {@link #DefaultSortKeyBuilder(ConcordanceSortOrder)}
+ * with value of: ConcordanceSortOrder.PRE
+ */
+ public DefaultSortKeyBuilder() {
+ this.sortOrder = ConcordanceSortOrder.PRE;
+ }
+
+ /**
+ * @param sortOrder sort order to use
+ */
+ public DefaultSortKeyBuilder(ConcordanceSortOrder sortOrder) {
+ this.sortOrder = sortOrder;
+ }
+
+ @Override
+ public ConcordanceSortKey buildKey(String docKey,
+ int startTargetTokenOffset,
+ int endTargetTokenOffset,
+ RandomAccessCharOffsetContainer charOffsets,
+ int tokensBefore, int tokensAfter,
+ Map metadata) {
+
+ if (sortOrder == ConcordanceSortOrder.NONE) {
+ return new ConcordanceSortKey(EMPTY_STRING);
+ }
+
+ if (sortOrder == ConcordanceSortOrder.DOC) {
+ int targCharStart = charOffsets.getCharacterOffsetStart(startTargetTokenOffset);
+ return new DocumentOrderSortKey(docKey, targCharStart);
+ }
+
+ StringBuilder sb = new StringBuilder();
+ //order is important for appending to sb, target must come before pre/post
+ if (sortOrder == ConcordanceSortOrder.TARGET_POST
+ || sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+
+ for (int i = startTargetTokenOffset; i <= endTargetTokenOffset; i++) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ } else {
+ sb.append(NULL_FILLER);
+ }
+ }
+ }
+ if (sortOrder == ConcordanceSortOrder.PRE
+ || sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+ int tmpStart = startTargetTokenOffset - 1;
+ int tmpEnd = Math.max(0, startTargetTokenOffset - tokensBefore);
+ if (tmpStart < 0) {
+ sb.append(SPACE);
+ }
+
+ for (int i = tmpStart; i >= tmpEnd; i--) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ } else {
+ sb.append(NULL_FILLER);
+ }
+ }
+
+ } else if (sortOrder == ConcordanceSortOrder.POST
+ || sortOrder == ConcordanceSortOrder.TARGET_POST) {
+
+ int tmpStart = endTargetTokenOffset + 1;
+ int tmpEnd = Math.min(charOffsets.getLast(), endTargetTokenOffset + tokensAfter);
+
+ if (tmpStart > charOffsets.getLast()) {
+ sb.append(SPACE);
+ }
+ for (int i = tmpStart; i <= tmpEnd; i++) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ } else {
+ sb.append(NULL_FILLER);
+ }
+ }
+ }
+ return new ConcordanceSortKey(sb.toString().trim());
+ }
+
+ @Override
+ public boolean requiresAnalysisOfPre() {
+ if (sortOrder == ConcordanceSortOrder.PRE
+ || sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public boolean requiresAnalysisOfPost() {
+ if (sortOrder == ConcordanceSortOrder.POST
+ || sortOrder == ConcordanceSortOrder.TARGET_POST) {
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public boolean requiresAnalysisOfTarget() {
+ if (sortOrder == ConcordanceSortOrder.TARGET_PRE
+ || sortOrder == ConcordanceSortOrder.TARGET_POST) {
+ return true;
+ }
+ return false;
+ }
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DocumentOrderSortKey.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DocumentOrderSortKey.java
new file mode 100644
index 000000000000..99a70abde9d2
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DocumentOrderSortKey.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import org.apache.lucene.concordance.classic.ConcordanceSortKey;
+
+/**
+ * This sorts based alphabetically on the document key
+ * and then numerically on the
+ */
+public class DocumentOrderSortKey extends ConcordanceSortKey {
+
+ protected final int targetCharStart;
+
+ public DocumentOrderSortKey(String docKey, int targetCharStart) {
+ super(docKey);
+ this.targetCharStart = targetCharStart;
+ }
+
+ @Override
+ public int compareTo(ConcordanceSortKey o) {
+ if (o instanceof DocumentOrderSortKey) {
+ DocumentOrderSortKey other = (DocumentOrderSortKey) o;
+ int cmp = super.compareTo(o);
+ if (cmp == 0) {
+ return Integer.compare(targetCharStart, other.targetCharStart);
+ }
+ return cmp;
+ } else {
+ return super.compareTo(o);
+ }
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/FieldBasedDocIdBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/FieldBasedDocIdBuilder.java
new file mode 100644
index 000000000000..aeb43eefd441
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/FieldBasedDocIdBuilder.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.concordance.classic.DocIdBuilder;
+
+/**
+ * Simple class that grabs the stringValue() of a specified
+ * field to use as a document's unique key for the ConcordanceWindow
+ * building process.
+ *
+ * Note that this takes only the first value of the field.
+ * If a multi-valued field is selected, surprises might happen.
+ *
+ * Also, note that if the field is not found, this returns
+ * a string representation of the ephemeral Lucene docId.
+ *
+ * Some users might want to throw an exception instead of this behavior.
+ */
+public class FieldBasedDocIdBuilder implements DocIdBuilder {
+
+ private final String fieldName;
+
+ /**
+ * @param fieldName, name of field to be used as a document's unique key
+ */
+ public FieldBasedDocIdBuilder(String fieldName) {
+ this.fieldName = fieldName;
+ }
+
+ @Override
+ public String build(Document d, long docId) {
+ IndexableField field = d.getField(fieldName);
+ //should probably throw exception, no?!
+ if (field == null) {
+ return Long.toString(docId);
+ }
+ return field.stringValue();
+ }
+
+ /**
+ * Instead of getField(String fieldName), this allows for extension
+ *
+ * @return fields to use
+ */
+ public Set getFields() {
+ Set fields = new HashSet();
+ fields.add(fieldName);
+ return fields;
+ }
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/IndexIdDocIdBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/IndexIdDocIdBuilder.java
new file mode 100644
index 000000000000..c4a4c42ee824
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/IndexIdDocIdBuilder.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.concordance.classic.DocIdBuilder;
+
+/**
+ * Simple id builder based on ephemeral Lucene doc ids.
+ * Use this only if your documents do not have a unique key.
+ * Then, use only with great care.
+ */
+public class IndexIdDocIdBuilder implements DocIdBuilder {
+
+
+ @Override
+ public Set getFields() {
+ return Collections.EMPTY_SET;
+ }
+
+ @Override
+ public String build(Document d, long docId) {
+ return Long.toString(docId);
+ }
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/SimpleDocMetadataExtractor.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/SimpleDocMetadataExtractor.java
new file mode 100644
index 000000000000..6e2bafcf1a9a
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/SimpleDocMetadataExtractor.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.classic.impl;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.concordance.classic.DocMetadataExtractor;
+
+/**
+ * Simple class that returns a map of key value pairs
+ * for the fields specified during initialization.
+ *
+ * Beware! For multi-valued fields, this will take only the first value.
+ */
+public class SimpleDocMetadataExtractor implements DocMetadataExtractor {
+
+ private Set fields = new HashSet<>();
+
+ public SimpleDocMetadataExtractor(String... fields) {
+ for (String f : fields) {
+ this.fields.add(f);
+ }
+ }
+
+ public SimpleDocMetadataExtractor(Set fields) {
+ this.fields.addAll(fields);
+ }
+
+ public void addField(String f) {
+ fields.add(f);
+ }
+
+ @Override
+ public Set getFieldSelector() {
+ return Collections.unmodifiableSet(fields);
+ }
+
+ @Override
+ public Map extract(Document d) {
+ Map map = new HashMap<>();
+ // only takes the first value in a multi-valued field!!!
+ for (String fieldName : getFieldSelector()) {
+ String[] fieldValues = d.getValues(fieldName);
+
+ if (fieldValues != null && fieldValues.length > 0) {
+ map.put(fieldName, fieldValues[0]);
+ }
+ }
+ return map;
+ }
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/package.html b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/package.html
new file mode 100644
index 000000000000..3635ce638caa
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/package.html
@@ -0,0 +1,31 @@
+
+
+
+
+
+ ConcordanceSearcher performs a search on an index and returns concordance windows.
+
+
+
+ This currently relies heavily on SpanQueries. When they are nuked (LUCENE-2878),
+ this will be modified to perform the same behavior with a Scorer.
+
+
+
+
+
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/package.html b/lucene/concordance/src/java/org/apache/lucene/concordance/package.html
new file mode 100644
index 000000000000..ef21389527f1
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/package.html
@@ -0,0 +1,30 @@
+
+
+
+
+The concordance package includes two primary areas of functionality:
+
+ - a traditional concordancer to produce concordance results for human use (see: org.apache.lucene.concordance.classic)
+
+ - a concordance window visitor to enable calculations of statistics on target terms or
+ context terms (see: org.apache.lucene.concordance.windowvisitor)
+
+
+
+
+
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/util/ConcordanceSearcherUtil.java b/lucene/concordance/src/java/org/apache/lucene/concordance/util/ConcordanceSearcherUtil.java
new file mode 100644
index 000000000000..d5292179f365
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/util/ConcordanceSearcherUtil.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.util;
+
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.concordance.charoffsets.TokenCharOffsetRequests;
+
+/**
+ * In other applications with variations on the ConcordanceSearcher, it has been
+ * useful to factor out the getCharOffsetRequests.
+ *
+ * This class should be used for functionality that is generally useful for
+ * concordance searching.
+ */
+public class ConcordanceSearcherUtil {
+
+
+ /**
+ * Simple utility method to build a TokenCharOffsetRequests object
+ * from a list of desired tokenOffsets, the number of tokensBefore
+ * and the number of tokensAfter.
+ *
+ * @param tokenOffsets the tokenOffsets that are desired
+ * @param tokensBefore the number of tokens before a desired tokenOffset
+ * @param tokensAfter the number of tokens after a desired tokenOffset
+ * @param requests an empty requests to be filled in
+ */
+ public static void getCharOffsetRequests(
+ List tokenOffsets,
+ int tokensBefore, int tokensAfter,
+ TokenCharOffsetRequests requests) {
+
+ for (OffsetAttribute tokenOffset : tokenOffsets) {
+ int start = tokenOffset.startOffset() - tokensBefore;
+ start = (start < 0) ? 0 : start;
+ int end = tokenOffset.endOffset() + tokensAfter + 1;
+ for (int i = start; i < end; i++) {
+ requests.add(i);
+ }
+ }
+ }
+
+}
diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/util/SimpleTargetTermResults.java b/lucene/concordance/src/java/org/apache/lucene/concordance/util/SimpleTargetTermResults.java
new file mode 100644
index 000000000000..200cff13bd32
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/concordance/util/SimpleTargetTermResults.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance.util;
+
+import java.util.Map;
+
/**
 * Simple class to hold document frequencies and term frequencies
 * for terms.
 *
 * <p>The maps handed to the constructor are stored as-is (not copied).</p>
 */
public class SimpleTargetTermResults {
  // raw maps restored to Map<String, Integer>: term -> frequency
  private final Map<String, Integer> tfs;
  private final Map<String, Integer> dfs;

  /**
   * @param dfs document frequencies
   * @param tfs term frequencies
   */
  protected SimpleTargetTermResults(Map<String, Integer> dfs,
                                    Map<String, Integer> tfs) {
    this.dfs = dfs;
    this.tfs = tfs;
  }

  /**
   * @return term frequency map
   */
  public Map<String, Integer> getTermFreqs() {
    return tfs;
  }

  /**
   * @return document frequency map
   */
  public Map<String, Integer> getDocFreqs() {
    return dfs;
  }
}
diff --git a/lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java b/lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java
new file mode 100644
index 000000000000..fbfa59a37c68
--- /dev/null
+++ b/lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SynonymQuery;
+import org.apache.lucene.search.TermQuery;
+
+public class SimpleSpanQueryConverter {
+ /**
+ * Converts a regular query to a {@link org.apache.lucene.search.spans.SpanQuery} for use in a highlighter.
+ * Because of subtle differences in {@link org.apache.lucene.search.spans.SpanQuery} and {@link org.apache.lucene.search.Query}, this
+ * {@link org.apache.lucene.search.spans.SpanQuery} will not necessarily return the same documents as the
+ * initial Query. For example, the generated SpanQuery will not include
+ * clauses of type BooleanClause.Occur.MUST_NOT. Also, the
+ * {@link org.apache.lucene.search.spans.SpanQuery} will only cover a single field, whereas the {@link org.apache.lucene.search.Query}
+ * might contain multiple fields.
+ *
+ * Returns an empty SpanQuery if the {@link org.apache.lucene.search.Query} is a class that
+ * is handled, but for some reason can't be converted from a {@link org.apache.lucene.search.Query} to a
+ * {@link org.apache.lucene.search.spans.SpanQuery}. This can happen for many reasons: e.g. if the Query
+ * contains no terms in the requested "field" or the Query is a MatchAllDocsQuery.
+ *
+ * Throws IllegalArgumentException if the Query is a class that is
+ * is not yet handled.
+ *
+ * This class does not rewrite the SpanQuery before returning it.
+ * Clients are required to rewrite if necessary.
+ *
+ * Much of this code is copied directly from
+ * oal.search.highlight.WeightedSpanTermExtractor. There are some subtle
+ * differences.
+ *
+ * Throws IllegalArgumentException if an unknown query type is passed in.
+ *
+ * @param field single field to extract SpanQueries for
+ * @param queryToConvert query to convert
+ * @return SpanQuery for use in highlighting; can return empty SpanQuery
+ * @throws IOException if encountered during parse
+ */
+ public SpanQuery convert(String field, Query queryToConvert) throws IOException {
+
+ Float boost = null;
+ Query query = queryToConvert;
+ if (queryToConvert instanceof BoostQuery) {
+ query = ((BoostQuery)query).getQuery();
+ boost = ((BoostQuery)query).getBoost();
+ }
+ /*
+ * copied nearly verbatim from
+ * org.apache.lucene.search.highlight.WeightedSpanTermExtractor
+ * TODO:refactor to avoid duplication of code if possible.
+ * Beware: there are some subtle differences.
+ */
+ if (query instanceof SpanQuery) {
+ SpanQuery sq = (SpanQuery) query;
+ if (sq.getField().equals(field)) {
+ return (SpanQuery) query;
+ } else {
+ return getEmptySpanQuery();
+ }
+ } else if (query instanceof BooleanQuery) {
+ List queryClauses = ((BooleanQuery) query).clauses();
+ List spanQs = new ArrayList();
+ for (int i = 0; i < queryClauses.size(); i++) {
+ if (!queryClauses.get(i).isProhibited()) {
+ tryToAdd(field, convert(field, queryClauses.get(i).getQuery()), spanQs);
+ }
+ }
+ return addBoost(buildSpanOr(spanQs), boost);
+ } else if (query instanceof PhraseQuery) {
+ PhraseQuery phraseQuery = ((PhraseQuery) query);
+
+ Term[] phraseQueryTerms = phraseQuery.getTerms();
+ if (phraseQueryTerms.length == 0) {
+ return getEmptySpanQuery();
+ } else if (!phraseQueryTerms[0].field().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
+ for (int i = 0; i < phraseQueryTerms.length; i++) {
+ clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
+ }
+ int slop = phraseQuery.getSlop();
+ int[] positions = phraseQuery.getPositions();
+ // sum position increments (>1) and add to slop
+ if (positions.length > 0) {
+ int lastPos = positions[0];
+ int sz = positions.length;
+ for (int i = 1; i < sz; i++) {
+ int pos = positions[i];
+ int inc = pos - lastPos - 1;
+ slop += inc;
+ lastPos = pos;
+ }
+ }
+
+ boolean inorder = false;
+
+ if (phraseQuery.getSlop() == 0) {
+ inorder = true;
+ }
+
+ SpanQuery sp = new SpanNearQuery(clauses, slop, inorder);
+ return addBoost(sp, boost);
+ } else if (query instanceof TermQuery) {
+ TermQuery tq = (TermQuery) query;
+ if (tq.getTerm().field().equals(field)) {
+ return addBoost(new SpanTermQuery(tq.getTerm()), boost);
+ } else {
+ return getEmptySpanQuery();
+ }
+ } else if (query instanceof ConstantScoreQuery) {
+ return convert(field, ((ConstantScoreQuery) query).getQuery());
+ } else if (query instanceof DisjunctionMaxQuery) {
+ List spanQs = new ArrayList<>();
+ for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator
+ .hasNext(); ) {
+ tryToAdd(field, convert(field, iterator.next()), spanQs);
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return addBoost(spanQs.get(0), boost);
+ } else {
+ return addBoost(new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])), boost);
+ }
+ } else if (query instanceof MatchAllDocsQuery) {
+ return getEmptySpanQuery();
+ } else if (query instanceof MultiPhraseQuery) {
+
+ final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
+
+ final Term[][] termArrays = mpq.getTermArrays();
+ //test for empty or wrong field
+ if (termArrays.length == 0) {
+ return getEmptySpanQuery();
+ } else if (termArrays.length > 1) {
+ Term[] ts = termArrays[0];
+ if (ts.length > 0) {
+ Term t = ts[0];
+ if (!t.field().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ }
+ }
+ final int[] positions = mpq.getPositions();
+ if (positions.length > 0) {
+
+ int maxPosition = positions[positions.length - 1];
+ for (int i = 0; i < positions.length - 1; ++i) {
+ if (positions[i] > maxPosition) {
+ maxPosition = positions[i];
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ final List[] disjunctLists = new List[maxPosition + 1];
+ int distinctPositions = 0;
+
+ for (int i = 0; i < termArrays.length; ++i) {
+ final Term[] termArray = termArrays[i];
+ List disjuncts = disjunctLists[positions[i]];
+ if (disjuncts == null) {
+ disjuncts = (disjunctLists[positions[i]] = new ArrayList(
+ termArray.length));
+ ++distinctPositions;
+ }
+ for (int j = 0; j < termArray.length; ++j) {
+ disjuncts.add(new SpanTermQuery(termArray[j]));
+ }
+ }
+
+ int positionGaps = 0;
+ int position = 0;
+ final SpanQuery[] clauses = new SpanQuery[distinctPositions];
+ for (int i = 0; i < disjunctLists.length; ++i) {
+ List disjuncts = disjunctLists[i];
+ if (disjuncts != null) {
+ if (disjuncts.size() == 1) {
+ clauses[position++] = disjuncts.get(0);
+ } else {
+ clauses[position++] = new SpanOrQuery(
+ disjuncts.toArray(new SpanQuery[disjuncts.size()]));
+ }
+ } else {
+ ++positionGaps;
+ }
+ }
+
+ final int slop = mpq.getSlop();
+ final boolean inorder = (slop == 0);
+
+ SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps,
+ inorder);
+ return addBoost(sp, boost);
+ }
+ } else if (query instanceof MultiTermQuery) {
+ MultiTermQuery tq = (MultiTermQuery) query;
+ if (! tq.getField().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ return addBoost(
+ new SpanMultiTermQueryWrapper<>((MultiTermQuery) query), boost);
+ } else if (query instanceof SynonymQuery) {
+ SynonymQuery sq = (SynonymQuery)query;
+ List spanQs = new ArrayList<>();
+ for (Term t : sq.getTerms()) {
+ spanQs.add(new SpanTermQuery(t));
+ }
+ return addBoost(buildSpanOr(spanQs), boost);
+ }
+ return convertUnknownQuery(field, queryToConvert);
+ }
+
+ private SpanQuery buildSpanOr(List spanQs) {
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+
+ }
+
+ private SpanQuery addBoost(SpanQuery sq, Float boost) {
+ if (boost == null) {
+ return sq;
+ }
+ return new SpanBoostQuery(sq, boost);
+ }
+
+ private void tryToAdd(String field, SpanQuery q, List qs) {
+ if (q == null || isEmptyQuery(q) || !q.getField().equals(field)) {
+ return;
+ }
+ qs.add(q);
+ }
+
+ /**
+ * Extend this to handle queries that are not currently handled.
+ * Might consider extending SpanQueryConverter in the queries compilation unit;
+ * that includes CommonTermsQuery.
+ *
+ * In this class, this always throws an IllegalArgumentException
+ *
+ * @param field field to convert
+ * @param query query to convert
+ * @return nothing. Throws IllegalArgumentException
+ */
+ protected SpanQuery convertUnknownQuery(String field, Query query) {
+ throw new IllegalArgumentException("SpanQueryConverter is unable to convert this class " +
+ query.getClass().toString());
+ }
+
+ /**
+ * @return an empty SpanQuery (SpanOrQuery with no clauses)
+ */
+ protected SpanQuery getEmptySpanQuery() {
+ return new SpanOrQuery(new SpanTermQuery[0]);
+ }
+
+ /**
+ * Is this a null or empty SpanQuery
+ *
+ * @param q query to test
+ * @return whether a null or empty SpanQuery
+ */
+ private boolean isEmptyQuery(SpanQuery q) {
+ if (q == null) {
+ return true;
+ }
+ if (q instanceof SpanOrQuery) {
+ SpanOrQuery soq = (SpanOrQuery) q;
+ for (SpanQuery sq : soq.getClauses()) {
+ if (!isEmptyQuery(sq)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/ConcordanceTestBase.java b/lucene/concordance/src/test/org/apache/lucene/concordance/ConcordanceTestBase.java
new file mode 100644
index 000000000000..7d78d31fbadf
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/ConcordanceTestBase.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.concordance;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+public class ConcordanceTestBase extends LuceneTestCase {
+
+ protected final static String FIELD = "f1";
+
+ public static Analyzer getAnalyzer(final CharacterRunAutomaton stops) {
+ return getAnalyzer(stops, random().nextInt(10000), random().nextInt(10000));
+ }
+
+ public static Analyzer getAnalyzer(final CharacterRunAutomaton stops,
+ final int posIncGap, final int charOffsetGap) {
+
+ return new Analyzer() {
+
+ @Override
+ public TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+ TokenFilter filter = new MockTokenFilter(tokenizer, stops);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+
+ @Override
+ public int getPositionIncrementGap(String fieldName) {
+ return posIncGap;
+ }
+
+ @Override
+ public int getOffsetGap(String fieldName) {
+ return charOffsetGap;
+ }
+ };
+ }
+
+ public Directory getDirectory(Analyzer analyzer, String[] vals)
+ throws IOException {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(analyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+
+ for (String s : vals) {
+ Document d = new Document();
+ d.add(newTextField(FIELD, s, Field.Store.YES));
+ writer.addDocument(d);
+ }
+ writer.close();
+ return directory;
+ }
+
+ public Directory getDirectory(Analyzer analyzer, List input)
+ throws IOException {
+
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(analyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+
+ for (String[] vals : input) {
+ Document d = new Document();
+ for (String s : vals) {
+ d.add(newTextField(FIELD, s, Field.Store.YES));
+ }
+ writer.addDocument(d);
+ }
+ writer.close();
+ return directory;
+ }
+
+ Directory buildNeedleIndex(String needle,
+ Analyzer analyzer, int numFieldValues) throws Exception {
+
+ IndexWriterConfig config = newIndexWriterConfig(random(), analyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy());
+
+ Directory directory = newDirectory();
+
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);
+ //create document with multivalued field
+ String[] fs = new String[numFieldValues];
+ for (int i = 0; i < numFieldValues; i++) {
+ float r = random().nextFloat();
+ String doc = "";
+ if (r <= 0.33) {
+ doc = needle + " " + getRandomWords(29, needle, analyzer);
+ } else if (r <= 0.66) {
+ doc = getRandomWords(13, needle, analyzer) + " " + needle + " " + getRandomWords(17, needle, analyzer);
+ } else {
+ doc = getRandomWords(31, needle, analyzer) + " " + needle;
+ }
+ fs[i] = doc;
+ }
+
+ Document d = new Document();
+ FieldType type = new FieldType();
+ type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ type.setStored(true);
+ type.setTokenized(true);
+
+ //IndexableField field = new IndexableField(type);
+ for (String s : fs) {
+ d.add(newField(random(), FIELD, s, type));
+ }
+ writer.addDocument(d);
+ writer.close();
+ return directory;
+ }
+
+
+ /**
+ * this assumes no stop filter in the analyzer.
+ * Best to use whitespace tokenizer.
+ */
+ private String getRandomWords(int numWords, String needle, Analyzer analyzer) throws Exception {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < numWords; i++) {
+ sb.append(TestUtil.randomUnicodeString(random(), 31));
+ sb.append(" ");
+ }
+ List terms = SimpleAnalyzerUtil.getTermStrings(sb.toString(),FIELD, analyzer);
+ StringBuilder rsb = new StringBuilder();
+ int words = -1;
+ while (words++ < numWords && words < terms.size()) {
+ String cand = terms.get(words);
+ if (!needle.equals(cand)) {
+ if (words > 0) {
+ rsb.append(" ");
+ }
+ rsb.append(cand);
+ }
+ }
+ return rsb.toString();
+ }
+
+
+ String getNeedle(Analyzer analyzer) {
+ //try to get a term that would come out of the analyzer
+ for (int i = 0; i < 10; i++) {
+ //start with a random base string
+ String baseString = TestUtil.randomUnicodeString(random(), random().nextInt(10) + 2);
+
+ try {
+ //run it through the analyzer, and take the first thing
+ //that comes out of it if the length > 0
+ List terms = SimpleAnalyzerUtil.getTermStrings(baseString, FIELD, analyzer);
+ for (String t : terms) {
+ if (t.length() > 0) {
+ return t;
+ }
+ }
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ //if nothing is found in 10 tries,
+ //return literal string "needle"
+
+ return "needle";
+ }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/TestConcordanceSearcher.java b/lucene/concordance/src/test/org/apache/lucene/concordance/TestConcordanceSearcher.java
new file mode 100644
index 000000000000..e8b0cc8a0dff
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/TestConcordanceSearcher.java
@@ -0,0 +1,560 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.concordance;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.concordance.classic.AbstractConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.ConcordanceSearcher;
+import org.apache.lucene.concordance.classic.ConcordanceSortOrder;
+import org.apache.lucene.concordance.classic.ConcordanceWindow;
+import org.apache.lucene.concordance.classic.DocIdBuilder;
+import org.apache.lucene.concordance.classic.DocMetadataExtractor;
+import org.apache.lucene.concordance.classic.WindowBuilder;
+import org.apache.lucene.concordance.classic.impl.ConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.impl.DedupingConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.impl.DefaultSortKeyBuilder;
+import org.apache.lucene.concordance.classic.impl.IndexIdDocIdBuilder;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestConcordanceSearcher extends ConcordanceTestBase {
+
+ private final static DocMetadataExtractor metadataExtractor =
+ new DocMetadataExtractor() {
+ private final Set fields = new HashSet<>();
+ private final Map data = new HashMap<>();
+
+ @Override
+ public Set getFieldSelector() {
+ return fields;
+ }
+
+ @Override
+ public Map extract(Document d) {
+ return data;
+ }
+ };
+
+ private final static DocIdBuilder docIdBuilder = new IndexIdDocIdBuilder();
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ // NOOP for now
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ // NOOP for now
+ }
+
+ @Test
+ public void testSimple() throws Exception {
+ String[] docs = new String[]{"a b c a b c", "c b a c b a"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ WindowBuilder wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+
+ assertEquals(3, collector.size());
+
+ collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
+ searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+ // test result size
+ assertEquals(4, collector.size());
+
+ // test result with sort order = pre
+ List windows = collector.getSortedWindows();
+ String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
+ String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
+
+ for (int i = 0; i < windows.size(); i++) {
+ ConcordanceWindow w = windows.get(i);
+
+ assertEquals(pres[i], w.getPre());
+ assertEquals(posts[i], w.getPost());
+ }
+
+ // test sort order post
+ // sort key is built at search time, so must re-search
+ wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
+ searcher = new ConcordanceSearcher(wb);
+
+ collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
+ searcher.search(indexSearcher, FIELD, q,
+ null, analyzer, collector);
+
+ windows = collector.getSortedWindows();
+
+ posts = new String[]{"", " b c", " b c a b c", " c b a",};
+ for (int i = 0; i < windows.size(); i++) {
+ ConcordanceWindow w = windows.get(i);
+ assertEquals(posts[i], w.getPost());
+ }
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testSimpleMultiValuedField() throws Exception {
+ String[] doc = new String[]{"a b c a b c", "c b a c b a"};
+ List docs = new ArrayList<>();
+ docs.add(doc);
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+
+ // test result size
+ assertEquals(4, collector.size());
+
+ // test result with sort order = pre
+ List windows = collector.getSortedWindows();
+ String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
+ String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
+
+ for (int i = 0; i < pres.length; i++) {
+ ConcordanceWindow w = windows.get(i);
+
+ assertEquals("pres: " + i, pres[i], w.getPre());
+
+ assertEquals("posts: " + i, posts[i], w.getPost());
+ }
+
+ // test sort order post
+ // sort key is built at search time, so must re-search
+ WindowBuilder wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
+ searcher = new ConcordanceSearcher(wb);
+
+ collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+ windows = collector.getSortedWindows();
+
+ posts = new String[]{"", " b c", " b c a b c", " c b a",};
+ for (int i = 0; i < posts.length; i++) {
+ ConcordanceWindow w = windows.get(i);
+ assertEquals(posts[i], w.getPost());
+ }
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testWindowLengths() throws Exception {
+ String[] doc = new String[]{"a b c d e f g"};
+ List docs = new ArrayList<>();
+ docs.add(doc);
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+ String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"};
+ String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"};
+
+ for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) {
+ for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) {
+ WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter,
+ analyzer.getOffsetGap(FIELD));
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
+ searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+ ConcordanceWindow w = collector.getSortedWindows().get(0);
+ assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre());
+ assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost());
+ }
+ }
+
+ reader.close();
+ directory.close();
+
+ }
+
+ @Test
+ public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
+ /*
+ * test handling of target match (or not) over different indices into multivalued
+ * field array
+ */
+ String[] doc = new String[]{"a b c a b the", "clockwork",
+ "orange b a c b a"};
+ List docs = new ArrayList<>();
+ docs.add(doc);
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
+
+
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery q1 = new SpanTermQuery(
+ new Term(FIELD, "the"));
+ SpanQuery q2 = new SpanTermQuery(new Term(FIELD,
+ "clockwork"));
+ SpanQuery q3 = new SpanTermQuery(new Term(FIELD,
+ "orange"));
+ SpanQuery q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 3, true);
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+ assertEquals(1, collector.size());
+
+ ConcordanceWindow w = collector.getSortedWindows().iterator().next();
+ assertEquals("target", "the | clockwork | orange", w.getTarget());
+ assertEquals("pre", "c a b", w.getPre());
+ assertEquals("post", " b a c", w.getPost());
+
+ reader.close();
+ directory.close();
+
+ // test hit even over long inter-field gap
+ analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
+ directory = getDirectory(analyzer, docs);
+ reader = DirectoryReader.open(directory);
+ indexSearcher = new IndexSearcher(reader);
+
+ wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
+
+ searcher = new ConcordanceSearcher(wb);
+ q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 120, true);
+ collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+ assertEquals(1, collector.size());
+ w = collector.getSortedWindows().iterator().next();
+ assertEquals("target", "the | clockwork | orange", w.getTarget());
+ assertEquals("pre", "c a b", w.getPre());
+ assertEquals("post", " b a c", w.getPost());
+
+ reader.close();
+ directory.close();
+ // test miss
+ analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
+ directory = getDirectory(analyzer, docs);
+ reader = DirectoryReader.open(directory);
+ indexSearcher = new IndexSearcher(reader);
+
+ wb = new WindowBuilder();
+ searcher = new ConcordanceSearcher(wb);
+ q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 5, true);
+ collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+ assertEquals(0, collector.size());
+
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testWithStops() throws Exception {
+ String[] docs = new String[]{"a b the d e the f", "g h the d the j"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));
+
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+ List windows = collector.getSortedWindows();
+ assertEquals(2, windows.size());
+
+ // the second word after the target is a stop word
+ // so the post-component of this window should only extend to the first word after
+ // the target
+ assertEquals("b the", windows.get(0).getPre());
+ assertEquals("d", windows.get(0).getTarget());
+ assertEquals(" e", windows.get(0).getPost());
+
+ assertEquals("h the", windows.get(1).getPre());
+ assertEquals("d", windows.get(1).getTarget());
+ assertEquals(" the j", windows.get(1).getPost());
+
+
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testBasicStandardQueryConversion() throws Exception {
+ String[] docs = new String[]{"a b c a b c", "c b a c b a d e a",
+ "c b a c b a e a b c a"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ BooleanQuery q = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term(FIELD, "a")), Occur.MUST)
+ .add(new TermQuery(new Term(FIELD, "d")),
+ Occur.MUST_NOT).build();
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+ searcher.search(indexSearcher,
+ FIELD, q, null,
+ analyzer, collector);
+ // shouldn't include document with "d"
+ assertEquals(6, collector.size());
+
+ // should only include document with "e" and not "d"
+ Query filter = new TermQuery(new Term(
+ FIELD, "e"));
+ collector = new ConcordanceWindowCollector(10);
+
+ searcher.search(indexSearcher, FIELD, (Query) q, filter, analyzer, collector);
+ assertEquals(4, collector.size());
+
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
+ // tests what happens if a Query doesn't contain a term in the "span" field
+ // in the searcher...should be no exception and zero documents returned.
+
+ String[] docs = new String[]{"a b c a b c",};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+
+ Query q = new TermQuery(new Term("_" + FIELD, "a"));
+
+ int windowCount = -1;
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+ windowCount = collector.size();
+ assertEquals(0, windowCount);
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testUniqueCollector() throws Exception {
+ String[] docs = new String[]{"a b c d c b a",
+ "a B C d c b a",
+ "a b C d C B a",
+ "a b c d C B A",
+ "e f g d g f e",
+ "h i j d j i h"
+ };
+
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+ DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2);
+ searcher.search(indexSearcher,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ assertEquals(2, collector.size());
+
+
+ collector =
+ new DedupingConcordanceWindowCollector(AbstractConcordanceWindowCollector.COLLECT_ALL);
+ searcher.search(indexSearcher,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ assertEquals(3, collector.size());
+
+
+ reader.close();
+ directory.close();
+
+ }
+
+
+ @Test
+ public void testUniqueCollectorWithSameWindowOverflow() throws Exception {
+ String[] docs = new String[]{"a b c d c b a",
+ "a b c d c b a",
+ "a b c d c b a",
+ "a b c d c b a",
+ "e f g d g f e",
+ "h i j d j i h"
+ };
+
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+ DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3);
+ searcher.search(indexSearcher,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ assertEquals(3, collector.size());
+ assertEquals(4, collector.getSortedWindows().get(0).getCount());
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testAllowTargetOverlaps() throws Exception {
+ String[] docs = new String[]{"a b c"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ WindowBuilder wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery term = new SpanTermQuery(new Term(FIELD, "a"));
+ SpanQuery phrase = new SpanNearQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "a")),
+ new SpanTermQuery(new Term(FIELD, "b"))
+ }, 0, true);
+ SpanOrQuery q = new SpanOrQuery(
+ new SpanQuery[]{
+ term,
+ phrase
+ }
+ );
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+
+ //default should be: don't allow target overlaps
+ assertEquals(1, collector.size());
+
+ searcher.setAllowTargetOverlaps(true);
+ collector = new ConcordanceWindowCollector(10);
+ searcher.search(indexSearcher, FIELD,
+ q, null, analyzer, collector);
+
+ //now there should be two windows with allowTargetOverlaps = true
+ assertEquals(2, collector.size());
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testRewrites() throws Exception {
+ //test to make sure that queries are rewritten
+ //first test straight prefix queries
+ String[] docs = new String[]{"aa ba ca aa ba ca", "ca ba aa ca ba aa da ea za",
+ "ca ba aa ca ba aa ea aa ba ca za"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher indexSearcher = new IndexSearcher(reader);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ BooleanQuery q = new BooleanQuery.Builder()
+ .add(new PrefixQuery(new Term(FIELD, "a")), Occur.MUST)
+ .add(new PrefixQuery(new Term(FIELD, "d")),
+ Occur.MUST_NOT).build();
+
+ //now test straight and span wrapper
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+ searcher.search(indexSearcher,
+ FIELD, q, new PrefixQuery(new Term(FIELD, "z")),
+ analyzer, collector);
+ // shouldn't include document with "da", but must include one with za
+ assertEquals(3, collector.size());
+
+ collector = new ConcordanceWindowCollector(10);
+ searcher.search(indexSearcher,
+ FIELD, q, new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term(FIELD, "z"))),
+ analyzer, collector);
+ // shouldn't include document with "da", but must include one with za
+ assertEquals(3, collector.size());
+
+ reader.close();
+ directory.close();
+ }
+
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/TestSimpleAnalyzerUtil.java b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSimpleAnalyzerUtil.java
new file mode 100644
index 000000000000..7eaf31127ddb
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSimpleAnalyzerUtil.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.store.Directory;
+import org.junit.BeforeClass;
+
+public class TestSimpleAnalyzerUtil extends ConcordanceTestBase {
+
+ private static Analyzer defaultCharOffsetGapAnalyzer;
+
+ private static Analyzer customCharOffsetGapAnalyzer;
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ defaultCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 1);
+ // non-default gaps (50, 213) exercise the multi-valued-field gap-handling paths
+ customCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50, 213);
+ }
+ /*
+ public void testDebug() throws Exception {
+ String[] values = new String[]{
+ "the quick brown fox jumped over the lazy dog",
+ "the fast green toad slid under the slothful rabbit",
+ "the happy blue wolverine devoured the lazy moose",
+ "the depressed purple aardvark the the the the the the the devoured the energetic komodo",
+ "the exasperated lavender lion",
+ "the excited orange tiger the the the the the",
+ "the colorless green idea slept furiously the"
+ };
+ System.out.println(values[0].length());
+ List docs = new ArrayList<>();
+ docs.add(values);
+
+ Directory directory = getDirectory(defaultCharOffsetGapAnalyzer, docs);
+
+ String joiner = " | ";
+ int gap = defaultCharOffsetGapAnalyzer.getOffsetGap(FIELD);
+ IndexReader reader = DirectoryReader.open(directory);
+ Document d = reader.document(0);
+ String[] fieldValues = d.getValues(FIELD);
+ //69, 103
+ assertEquals("basic", "", testSimple(42, 45, fieldValues, gap, joiner));
+ reader.close();
+ directory.close();
+ }*/
+
+ public void testHitInGaps() throws Exception {
+ String[] values = new String[]{
+ "abc",
+ "def",
+ "ghi",
+ "jkl"
+ };
+ List<String[]> docs = new ArrayList<>();
+ docs.add(values);
+
+ Directory directory = getDirectory(customCharOffsetGapAnalyzer, docs);
+
+ String joiner = " | ";
+ int gap = customCharOffsetGapAnalyzer.getOffsetGap(FIELD);
+ IndexReader reader = DirectoryReader.open(directory);
+ Document d = reader.document(0);
+ String[] fieldValues = d.getValues(FIELD);
+
+ assertEquals("two negs", "", testSimple(-10, -1, fieldValues, gap, joiner));
+
+ assertEquals("two way beyonds", "", testSimple(1000, 1020, fieldValues, gap, joiner));
+
+ assertEquals("two in betweens", " | ", testSimple(100, 110, fieldValues, gap, joiner));
+
+
+ assertEquals("one neg", "abc", testSimple(-20, 3, fieldValues, gap, joiner));
+ assertEquals("end < start 1", "", testSimple(3, -20, fieldValues, gap, joiner));
+ assertEquals("end < start 2", "", testSimple(3, 2, fieldValues, gap, joiner));
+ assertEquals("end in between", "abc", testSimple(0, 50, fieldValues, gap, joiner));
+ //TODO: these previously expected "def" (no leading joiner); fix substring extraction so a start offset falling inside an inter-value gap does not prepend the joiner
+ assertEquals("start in between", " | def", testSimple(5, 219, fieldValues, gap, joiner));
+ assertEquals("start in between and end in between1", " | def", testSimple(5, 300, fieldValues, gap, joiner));
+ assertEquals("start in between and end in between2", " | def | ghi", testSimple(5, 600, fieldValues, gap, joiner));
+ assertEquals("start exactly at second value, end beyond last", "def | ghi | jkl", testSimple(216, 10000, fieldValues, gap, joiner));
+
+ reader.close();
+ directory.close();
+
+ }
+
+ public void testRandomWithNeedleOnGaps() throws Exception {
+ try {
+ executeNeedleTests(defaultCharOffsetGapAnalyzer);
+ executeNeedleTests(customCharOffsetGapAnalyzer);
+ } catch (Throwable e) {
+ e.printStackTrace();
+ throw e;
+ }
+ }
+
+ private void executeNeedleTests(Analyzer analyzer) throws Exception {
+
+ String needle = getNeedle(analyzer);
+ int numFieldValues = 23;
+
+ Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues);
+
+ IndexReader reader = DirectoryReader.open(directory);
+
+ LeafReaderContext ctx = reader.leaves().get(0);
+ LeafReader r = ctx.reader();
+
+ PostingsEnum dpe = r.postings(new Term(FIELD, needle), PostingsEnum.ALL); // NOTE(review): returns null if the term is absent — nextDoc() below would NPE; consider asserting non-null first
+ int numTests = 0;
+ try {
+ while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ int frq = dpe.freq();
+ int advanced = 0;
+
+ String[] fieldValues = r.document(dpe.docID()).getValues(FIELD);
+ while (++advanced < frq) { // NOTE(review): pre-increment visits only freq-1 positions per doc; the "numFieldValues - 1" expectation below appears to compensate — confirm intended
+ dpe.nextPosition();
+ String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(),
+ dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | ");
+ assertEquals(needle, rebuilt);
+ numTests++;
+ }
+ }
+ } finally {
+ reader.close();
+ directory.close();
+ }
+ assertEquals("number of tests", numFieldValues - 1, numTests);
+ }
+
+ private String testSimple(int start, int end, String[] fieldValues, int gap, String joiner) {
+ return SimpleAnalyzerUtil.substringFromMultiValuedFields(start, end, fieldValues, gap, joiner);
+ }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/TestSpanQueryConverter.java b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSpanQueryConverter.java
new file mode 100644
index 000000000000..2eb65b64e229
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSpanQueryConverter.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SimpleSpanQueryConverter;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TestSpanQueryConverter {
+
+ @Test
+ public void testMultiTerm() throws IOException {
+ //test to make sure multiterm returns empty query for different field
+ String f1 = "f1";
+ String f2 = "f2";
+ Query q = new PrefixQuery(new Term(f1, "f*"));
+ SimpleSpanQueryConverter c = new SimpleSpanQueryConverter();
+ SpanQuery sq = c.convert(f2, q);
+ assertTrue(sq instanceof SpanOrQuery);
+ assertEquals(0, ((SpanOrQuery)sq).getClauses().length);
+ }
+ //TODO: add more tests
+}
diff --git a/lucene/module-build.xml b/lucene/module-build.xml
index d48ae37f89c3..7f952e5feb4b 100644
--- a/lucene/module-build.xml
+++ b/lucene/module-build.xml
@@ -464,6 +464,28 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
index cd5d4af576ff..3b516fe747ba 100644
--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@@ -261,7 +261,7 @@ public void load(SolrQueryRequest req, SolrQueryResponse rsp,
}
public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
- public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
+ public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
/**
* Keep all elements and their content.