diff --git a/dev-tools/idea/.idea/ant.xml b/dev-tools/idea/.idea/ant.xml index 8d454ad8bcbf..4d3564d6f5c7 100644 --- a/dev-tools/idea/.idea/ant.xml +++ b/dev-tools/idea/.idea/ant.xml @@ -18,6 +18,7 @@ + diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml index 6fbe496772f8..8d8917ba2d35 100644 --- a/dev-tools/idea/.idea/modules.xml +++ b/dev-tools/idea/.idea/modules.xml @@ -23,6 +23,7 @@ + diff --git a/dev-tools/idea/.idea/workspace.xml b/dev-tools/idea/.idea/workspace.xml index 7750e90e15b8..01f330a1c213 100644 --- a/dev-tools/idea/.idea/workspace.xml +++ b/dev-tools/idea/.idea/workspace.xml @@ -108,6 +108,14 @@ + + + + + - + @@ -363,35 +371,36 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dev-tools/idea/lucene/concordance/concordance.iml b/dev-tools/idea/lucene/concordance/concordance.iml new file mode 100644 index 000000000000..141f1ad4c398 --- /dev/null +++ b/dev-tools/idea/lucene/concordance/concordance.iml @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/dev-tools/maven/lucene/concordance/pom.xml.template b/dev-tools/maven/lucene/concordance/pom.xml.template new file mode 100644 index 000000000000..dd4382ecc051 --- /dev/null +++ b/dev-tools/maven/lucene/concordance/pom.xml.template @@ -0,0 +1,69 @@ + + + 4.0.0 + + org.apache.lucene + lucene-parent + @version@ + ../pom.xml + + org.apache.lucene + lucene-concordance + jar + Lucene Concordance + Lucene Concordance Module + + lucene/concordance + ../../.. + ${relative-top-level}/${module-directory} + + + scm:svn:${vc-anonymous-base-url}/${module-directory} + scm:svn:${vc-dev-base-url}/${module-directory} + ${vc-browse-base-url}/${module-directory} + + + + + org.apache.lucene + lucene-test-framework + test + + @lucene-concordance.internal.dependencies@ + @lucene-concordance.external.dependencies@ + @lucene-concordance.internal.test.dependencies@ + @lucene-concordance.external.test.dependencies@ + + + ${module-path}/src/java + ${module-path}/src/test + + + ${project.build.testSourceDirectory} + + **/*.java + + + + + + diff --git a/dev-tools/maven/lucene/pom.xml.template b/dev-tools/maven/lucene/pom.xml.template index 8db3fd1c98c8..a4fab6d2d445 100644 --- a/dev-tools/maven/lucene/pom.xml.template +++ b/dev-tools/maven/lucene/pom.xml.template @@ -47,6 +47,7 @@ analysis benchmark classification + concordance demo expressions facet diff --git a/lucene/build.xml b/lucene/build.xml index 383fbeb46f8f..8f470d2a6c2c 100644 --- a/lucene/build.xml +++ b/lucene/build.xml @@ -182,6 +182,7 @@ + diff --git a/lucene/concordance/build.xml b/lucene/concordance/build.xml new file mode 100644 index 000000000000..20d955db5b26 --- /dev/null +++ b/lucene/concordance/build.xml @@ -0,0 +1,40 @@ + + + + + + Executes concordance search + + + + + + + + + + + + + + + + + + diff --git a/lucene/concordance/ivy.xml b/lucene/concordance/ivy.xml new file mode 100644 index 000000000000..3ad64e34e1d3 --- /dev/null +++ b/lucene/concordance/ivy.xml @@ -0,0 +1,21 @@ + + + + diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/DocTokenOffsets.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/DocTokenOffsets.java new file mode 100644 index 000000000000..65b0ee1b8735 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/DocTokenOffsets.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl; +import org.apache.lucene.document.Document; + + + +/** + * Simple class to store a document id (atomic and unique), a StoredDocument, + * and the offsets for a SpanQuery hit + */ +public class DocTokenOffsets { + private int atomicDocId = -1; + private long uniqueId = -1; + private Document document = null; + private List offsets = new ArrayList<>(); + + public void addOffset(int start, int end) { + OffsetAttributeImpl offset = new OffsetAttributeImpl(); + offset.setOffset(start, end); + offsets.add(offset); + } + + public void reset(int base, int atomicDocId, Document d) { + this.atomicDocId = atomicDocId; + this.uniqueId = base + atomicDocId; + setDocument(d); + offsets.clear(); + } + + public List getOffsets() { + return offsets; + } + + public Document getDocument() { + return document; + } + + public void setDocument(Document d) { + this.document = d; + } + + /* + * required by DocTokenOffsetsIterator + */ + protected int getAtomicDocId() { + return atomicDocId; + } + + public long getUniqueDocId() { + return uniqueId; + } + + public DocTokenOffsets deepishCopy() { + DocTokenOffsets copy = new DocTokenOffsets(); + copy.atomicDocId = atomicDocId; + copy.uniqueId = uniqueId; + copy.document = document; + List copyOffsets = new ArrayList(); + copyOffsets.addAll(offsets); + copy.offsets = copyOffsets; + return copy; + } + + public boolean isEmpty() { + if (atomicDocId < 0) + return true; + return false; + } + + public void pseudoEmpty() { + atomicDocId = -1; + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/DocTokenOffsetsVisitor.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/DocTokenOffsetsVisitor.java new file mode 100644 index 000000000000..6f8b8e7d25dd --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/DocTokenOffsetsVisitor.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.IOException; +import java.util.Set; + +public interface DocTokenOffsetsVisitor { + + /** + * + * @return doctokenoffsets for reuse + */ + public DocTokenOffsets getDocTokenOffsets(); + public Set getFields(); + public boolean visit(DocTokenOffsets docTokenOffsets) throws IOException, TargetTokenNotFoundException; +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetLengthStartComparator.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetLengthStartComparator.java new file mode 100644 index 000000000000..1f54ce256941 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetLengthStartComparator.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.Serializable; +import java.util.Comparator; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Sorts length desc, start offset asc + */ + +public class OffsetLengthStartComparator implements + Comparator, Serializable { + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) { + + int lenA = offsetA.endOffset() - offsetA.startOffset(); + int lenB = offsetB.endOffset() - offsetB.startOffset(); + if (lenA < lenB) { + return 1; + } else if (lenA > lenB) { + return -1; + // by here, the length is the same + } else if (offsetA.startOffset() < offsetB.startOffset()) { + return -1; + } else if (offsetA.startOffset() > offsetB.startOffset()) { + return 1; + } + return 0; + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetStartComparator.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetStartComparator.java new file mode 100644 index 000000000000..c4973b06dd6c --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetStartComparator.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.Serializable; +import java.util.Comparator; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * sort on offset start + */ +public class OffsetStartComparator implements Comparator, + Serializable { + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) { + + if (offsetA.startOffset() < offsetB.startOffset()) { + return -1; + } else if (offsetA.startOffset() > offsetB.startOffset()) { + return 1; + } + return 0; + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetUtil.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetUtil.java new file mode 100644 index 000000000000..3ed278b46dc4 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/OffsetUtil.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * In some versions of Lucene, getSpans returned overlapping spans. This class + * can remove the overlapping spans and will sort them if startComparator is not + * null. 
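+ * <p/>
+ * A minimal usage sketch (the {@code offsets} list shown is illustrative,
+ * not part of this patch):
+ * <pre>{@code
+ * List<OffsetAttribute> deduped = OffsetUtil.removeOverlapsAndSort(
+ *     offsets, new OffsetLengthStartComparator(), new OffsetStartComparator());
+ * }</pre>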
+ */ + +public class OffsetUtil { + + /** + * @param offsets offsets to process + * @param comparator initial OffsetLengthStartComparator to use to rule out overlaps + * @param startComparator comparator for final sort + * @return sorted list of offsets + */ + public static List removeOverlapsAndSort( + List offsets, OffsetLengthStartComparator comparator, + OffsetStartComparator startComparator) { + if (offsets == null || offsets.size() < 2) + return offsets; + + Collections.sort(offsets, comparator); + Set seen = new HashSet<>(); + List filtered = new ArrayList<>(); + for (OffsetAttribute offset : offsets) { + if (!alreadySeen(offset, seen)) { + filtered.add(offset); + for (int i = offset.startOffset(); i < offset.endOffset(); i++) { + seen.add(i); + } + } + } + if (startComparator != null) { + Collections.sort(filtered, startComparator); + } + return filtered; + } + + private static boolean alreadySeen(OffsetAttribute offset, Set seen) { + for (int i = offset.startOffset(); i <= offset.endOffset(); i++) { + if (seen.contains(i)) + return true; + } + return false; + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/RandomAccessCharOffsetContainer.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/RandomAccessCharOffsetContainer.java new file mode 100644 index 000000000000..82973faf6621 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/RandomAccessCharOffsetContainer.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; + +/** + * Class to record results for looking up normalized terms (String) and + * character offsets for specified tokens. Will return NULL_TERM/NULL_OFFSET if + * a token offset was not found. + *
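+ * A minimal sketch of the intended flow (token and character offsets are
+ * illustrative values):
+ * <pre>{@code
+ * RandomAccessCharOffsetContainer container = new RandomAccessCharOffsetContainer();
+ * container.add(3, 10, 14, "term");                   // token 3 -> chars 10..14
+ * int start = container.getCharacterOffsetStart(3);   // 10
+ * int missing = container.getCharacterOffsetStart(7); // NULL_OFFSET (-1)
+ * }</pre>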
+ * <p/>
+ * Has utility methods for safely getting the closest found token. This is + * useful for when a concordance window ends in a stop word (no term/offset + * info). + */ + +public class RandomAccessCharOffsetContainer { + + public final static String NULL_TERM = ""; + public final static int NULL_OFFSET = -1; + + private BitSet set = new BitSet(); + private int last = -1; + private Map terms = new HashMap(); + private Map starts = new HashMap<>(); + private Map ends = new HashMap<>(); + + /** + * @param tokenOffset token of interest + * @param startCharOffset start character offset within the string stored in StoredField[fieldIndex] + * @param endCharOffset end character offset within the string stored in StoredField[fieldIndex] + * @param term string term at that position + */ + public void add(int tokenOffset, int startCharOffset, + int endCharOffset, String term) { + addStart(tokenOffset, startCharOffset); + addEnd(tokenOffset, endCharOffset); + addTerm(tokenOffset, term); + set.set(tokenOffset); + } + + private void addTerm(int tokenOffset, String term) { + if (term != null) { + terms.put(tokenOffset, term); + } + last = (tokenOffset > last) ? tokenOffset : last; + } + + private void addStart(int tokenOffset, int charOffset) { + starts.put(tokenOffset, charOffset); + last = (tokenOffset > last) ? tokenOffset : last; + } + + private void addEnd(int tokenOffset, int charOffset) { + ends.put(tokenOffset, charOffset); + last = (tokenOffset > last) ? tokenOffset : last; + } + + /** + * @param tokenOffset target token + * @return the character offset for the first character of the tokenOffset. + * returns {@link #NULL_OFFSET} if tokenOffset wasn't found + */ + public int getCharacterOffsetStart(int tokenOffset) { + Integer start = starts.get(tokenOffset); + if (start == null) { + return NULL_OFFSET; + } + return start.intValue(); + } + + /** + * @param tokenOffset target token + * @return the character offset for the final character of the tokenOffset. + */ + public int getCharacterOffsetEnd(int tokenOffset) { + Integer end = ends.get(tokenOffset); + if (end == null) { + return NULL_OFFSET; + } + return end.intValue(); + } + + /** + * @param tokenOffset tokenOffset + * @return term stored at this tokenOffset; can return {@link #NULL_TERM} + */ + public String getTerm(int tokenOffset) { + String s = terms.get(tokenOffset); + if (s == null) { + return NULL_TERM; + } + return s; + } + + /** + * @return last/largest token offset + */ + public int getLast() { + return last; + } + + /** + * reset state + */ + public void clear() { + terms.clear(); + starts.clear(); + ends.clear(); + last = -1; + set = new BitSet(); + } + + protected boolean isEmpty() { + return set.isEmpty(); + } + + /** + * Find the closest non-null token starting from startToken + * and ending with stopToken (inclusive). 
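+ * If startToken is greater than stopToken, the scan walks backwards: e.g. a
+ * (hypothetical) call with startToken=5 and stopToken=0 probes tokens
+ * 5, 4, 3, ... and returns the first token that has a recorded offset.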
+ * + * @param startToken start token + * @param stopToken end token + * @param map map to use + * @return closest non-null token offset to the startToken; can return + * {@link #NULL_OFFSET} if no non-null offset was found + */ + private int getClosestToken(int startToken, int stopToken, + Map map) { + + if (startToken < 0 || stopToken < 0) { + return NULL_OFFSET; + } + if (startToken == stopToken) { + return startToken; + } + if (startToken < stopToken) { + for (int i = startToken; i <= stopToken; i++) { + Integer charOffset = map.get(i); + if (charOffset != null && charOffset != NULL_OFFSET) { + return i; + } + } + } else if (startToken > stopToken) { + for (int i = startToken; i >= stopToken; i--) { + Integer charOffset = map.get(i); + if (charOffset != null && charOffset != NULL_OFFSET) { + return i; + } + } + } + return NULL_OFFSET; + } + + public int getClosestCharStart(int startToken, int stopToken) { + + int i = getClosestToken(startToken, stopToken, starts); + Integer charStart = getCharacterOffsetStart(i); + if (charStart == null) { + return NULL_OFFSET; + } + return charStart.intValue(); + } + + public int getClosestCharEnd(int startToken, int stopToken) { + int i = getClosestToken(startToken, stopToken, ends); + Integer charEnd = getCharacterOffsetEnd(i); + if (charEnd == null) { + return NULL_OFFSET; + } + return charEnd.intValue(); + } + + protected String getClosestTerm(int startToken, int stopToken) { + int i = getClosestToken(startToken, stopToken, starts); + return getTerm(i); + } + + /* + * return: -1 if + + public int getFieldIndex(int tokenOffset) { + CharCoordinate p = starts.get(tokenOffset); + if (p == null) { + return NULL_OFFSET; + } + return p.getFieldIndex(); + } +*/ + protected String debugToString() { + StringBuilder sb = new StringBuilder(); + for (Integer i : terms.keySet()) { + sb.append(i + " : " + terms.get(i) + " : " + starts.get(i) + " : " + + ends.get(i) + "\n"); + } + return sb.toString(); + } + + protected BitSet getSet() { + return set; + } + + public void remove(int token) { + if (token == last) { + last = getClosestToken(last - 1, 0, starts); + } + set.clear(token); + terms.remove(token); + starts.remove(token); + ends.remove(token); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java new file mode 100644 index 000000000000..b1f53e0dd3e4 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.document.Document; + +/** + * TokenCharOffsetsReader that captures character offsets by reanalyzing a + * field. + */ +public class ReanalyzingTokenCharOffsetsReader implements + TokenCharOffsetsReader { + + private final static int GOT_ALL_REQUESTS = -2; + private Analyzer baseAnalyzer; + + /** + * Constructor + * + * @param analyzer to use to get character offsets + */ + public ReanalyzingTokenCharOffsetsReader(Analyzer analyzer) { + this.baseAnalyzer = analyzer; + } + + @Override + public void getTokenCharOffsetResults(final Document d, + final String fieldName, final TokenCharOffsetRequests requests, + final RandomAccessCharOffsetContainer results) throws IOException { + + int fieldIndex = 0; + int currPosInc = -1; + int posIncrementGap = baseAnalyzer.getPositionIncrementGap(fieldName); + int charOffsetGap = baseAnalyzer.getOffsetGap(fieldName); + int charBase = 0; + for (String fieldValue : d.getValues(fieldName)) { + + currPosInc = addFieldValue(fieldName, currPosInc, charBase, fieldValue, requests, + results); + + if (currPosInc == GOT_ALL_REQUESTS) { + break; + } + charBase += fieldValue.length() + charOffsetGap; + currPosInc += posIncrementGap; + fieldIndex++; + } + + } + + private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue, + TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) + throws IOException { + //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true); + TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue); + stream.reset(); + + int defaultInc = 1; + + CharTermAttribute termAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + OffsetAttribute offsetAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class); + PositionIncrementAttribute incAtt = null; + if (stream + .hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) { + incAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class); + } + + while (stream.incrementToken()) { + + currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc; + if (requests.contains(currInd)) { + results.add(currInd, offsetAtt.startOffset() + charBase, + offsetAtt.endOffset() + charBase, termAtt.toString()); + } + if (currInd > requests.getLast()) { + // TODO: Is there a way to avoid this? Or, is this + // an imaginary performance hit? 
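+ // The TokenStream contract calls for end() only after incrementToken()
+ // has returned false, so the remaining tokens are drained below; the
+ // cost of this no-op loop is assumed, not measured, to be minor.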
+ while (stream.incrementToken()) { + //NO-OP + } + stream.end(); + stream.close(); + return GOT_ALL_REQUESTS; + } + } + stream.end(); + stream.close(); + return currInd; + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SimpleAnalyzerUtil.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SimpleAnalyzerUtil.java new file mode 100644 index 000000000000..dbc0a01d54ea --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SimpleAnalyzerUtil.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + + +/** + * Simple util class for Analyzers + */ +public class SimpleAnalyzerUtil { + private final static String DEFAULT_FIELD = "FIELD"; + + /** + * + * @param s string to analyze + * @param field field to analyze + * @param analyzer analyzer to use + * @return list of analyzed terms + * @throws IOException if there's an IOException during analysis + */ + public static List getTermStrings(String s, String field, Analyzer analyzer) + throws IOException { + List terms = new ArrayList<>(); + return getTermStrings(s, field, analyzer, terms); + } + + /** + * allows reuse of terms, this method calls terms.clear() before adding new + * terms + * + * @param s string to analyze + * @param field to use in analysis + * @param analyzer analyzer + * @param terms list for reuse + * @return list of strings + * @throws IOException if there's an IOException during analysis + */ + public static List getTermStrings(String s, String field, Analyzer analyzer, + List terms) throws IOException { + if (terms == null) { + terms = new ArrayList<>(); + } + terms.clear(); + TokenStream stream = analyzer.tokenStream(field, s); + stream.reset(); + CharTermAttribute termAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + + while (stream.incrementToken()) { + terms.add(termAtt.toString()); + } + stream.end(); + stream.close(); + + return terms; + } + + /** + * This calculates a substring from an array of StorableFields. + *
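+ * For example (hypothetical values, with offsetGap = 1):
+ * <pre>{@code
+ * String[] values = {"quick brown", "fox"};
+ * // "quick brown" covers chars 0-10, the gap occupies 11, "fox" starts at 12
+ * SimpleAnalyzerUtil.substringFromMultiValuedFields(6, 15, values, 1, " | ");
+ * // returns "brown | fox"
+ * }</pre>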
+ * <p/>
+ * This attempts to do the best job possible, and at worst will + * return an empty string. If the start or end is within a gap, + * or before 0 or after the total number of characters, this will + * gracefully (blithely?) handle those cases. + * + * @param start character offset to start + * @param end character offset to end + * @param fieldValues array of Strings to process + * @param offsetGap offsetGap as typically returned by Analyzer's .getOffsetGap() + * @param interFieldJoiner string to use to mark that a substring goes beyond a single + * field entry + * @return substring, potentially empty, never null. + */ + public static String substringFromMultiValuedFields(int start, + int end, String[] fieldValues, int offsetGap, String interFieldJoiner) { + start = (start < 0) ? 0 : start; + end = (end < 0) ? 0 : end; + + if (start > end) { + start = end; + } + + int charBase = 0; + StringBuilder sb = new StringBuilder(); + int lastFieldIndex = 0; + int localStart = 0; + boolean foundStart = false; + //get start + for (int fieldIndex = 0; fieldIndex < fieldValues.length; fieldIndex++) { + String fString = fieldValues[fieldIndex]; + if (start < charBase + fString.length()) { + localStart = start - charBase; + lastFieldIndex = fieldIndex; + foundStart = true; + break; + } + charBase += fString.length() + offsetGap; + } + if (foundStart == false) { + return ""; + } + //if start occurred in a gap, reset localStart to 0 + if (localStart < 0) { + sb.append(interFieldJoiner); + localStart = 0; + } + //now append and look for end + for (int fieldIndex = lastFieldIndex; fieldIndex < fieldValues.length; fieldIndex++) { + String fString = fieldValues[fieldIndex]; + + if (end <= charBase + fString.length()) { + int localEnd = end - charBase; + //must be in gap + if (charBase > end) { + return sb.toString(); + } + if (fieldIndex != lastFieldIndex) { + sb.append(interFieldJoiner); + } + sb.append(fString.substring(localStart, localEnd)); + break; + } else { + if (fieldIndex != lastFieldIndex) { + sb.append(interFieldJoiner); + } + sb.append(fString.substring(localStart)); + localStart = 0; + } + charBase += fString.length() + offsetGap; + } + return sb.toString(); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SpansCrawler.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SpansCrawler.java new file mode 100644 index 000000000000..ecbb9a477ab1 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/SpansCrawler.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.IOException; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanWeight; +import org.apache.lucene.search.spans.Spans; + + +/** + * Utility class to crawl spans. + */ +public class SpansCrawler { + + /** + * + * @param query span query to use + * @param filter filter + * @param searcher searcher + * @param visitor visitor to call for each span + * @throws IOException on IOException + * @throws TargetTokenNotFoundException if the visitor can't find the target token + */ + public static void crawl(SpanQuery query, Query filter, IndexSearcher searcher, + DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException { + + query = (SpanQuery) query.rewrite(searcher.getIndexReader()); + + SpanWeight w = query.createWeight(searcher, false, 1.0f); + if (filter == null) { + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + + Spans spans = w.getSpans(ctx, SpanWeight.Postings.POSITIONS); + if (spans == null) { + continue; + } + boolean cont = visitLeafReader(ctx, spans, visitor); + if (!cont) { + break; + } + } + } else { + filter = searcher.rewrite(filter); + Weight searcherWeight = searcher.createWeight(filter, false, 1.0f); + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + Scorer leafReaderContextScorer = searcherWeight.scorer(ctx); + if (leafReaderContextScorer == null) { + continue; + } + //Can we tell from the scorer that there were no hits? + //in <= 5.x we could stop here if the filter query had no hits. + + Spans spans = w.getSpans(ctx, SpanWeight.Postings.POSITIONS); + if (spans == null) { + continue; + } + DocIdSetIterator filterItr = leafReaderContextScorer.iterator(); + + if (filterItr == null || filterItr.equals(DocIdSetIterator.empty())) { + continue; + } + boolean cont = visitLeafReader(ctx, spans, filterItr, visitor); + if (!cont) { + break; + } + } + } + } + + static boolean visitLeafReader(LeafReaderContext leafCtx, + Spans spans, DocIdSetIterator filterItr, DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException { + int filterDoc = -1; + int spansDoc = spans.nextDoc(); + while (true) { + if (spansDoc == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + filterDoc = filterItr.advance(spansDoc); + if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) { + break; + } else if (filterDoc > spansDoc) { + while (spansDoc <= filterDoc) { + spansDoc = spans.nextDoc(); + if (spansDoc == filterDoc) { + boolean cont = visit(leafCtx, spans, visitor); + if (! cont) { + return false; + } + + } else { + continue; + } + } + } else if (filterDoc == spansDoc) { + boolean cont = visit(leafCtx, spans, visitor); + if (! 
cont) { + return false; + } + //then iterate spans + spansDoc = spans.nextDoc(); + } else if (filterDoc < spansDoc) { + throw new IllegalArgumentException("FILTER doc is < spansdoc!!!"); + } else { + throw new IllegalArgumentException("Something horrible happened"); + } + } + return true; + } + + static boolean visitLeafReader(LeafReaderContext leafCtx, + Spans spans, + DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException { + while (spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + boolean cont = visit(leafCtx, spans, visitor); + if (! cont) { + return false; + } + } + return true; + } + + + static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException { + Document document = leafCtx.reader().document(spans.docID(), visitor.getFields()); + DocTokenOffsets offsets = visitor.getDocTokenOffsets(); + offsets.reset(leafCtx.docBase, spans.docID(), document); + while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { + offsets.addOffset(spans.startPosition(), spans.endPosition()); + } + return visitor.visit(offsets); + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TargetTokenNotFoundException.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TargetTokenNotFoundException.java new file mode 100644 index 000000000000..a63ff775cb82 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TargetTokenNotFoundException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +public class TargetTokenNotFoundException extends Exception { + + /** + * Token offset identified by .getSpans() is not found in the + * TokenCharOffsetResults. Typical cause is a mismatch btwn analyzers at index + * and search times. When this happens, something very bad has happened and + * this should be its own exception. + */ + private static final long serialVersionUID = 1L; + + public TargetTokenNotFoundException(String message) { + super(message); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetRequests.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetRequests.java new file mode 100644 index 000000000000..1e87c275b99c --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetRequests.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.charoffsets; + +import java.util.BitSet; + +/** + * Util class used to specify the tokens for which character offsets are requested. + */ + + +public class TokenCharOffsetRequests { + private BitSet set = new BitSet(); + private int last = -1; + + /** + * Is a specific token requested? + * + * @param i token number to test + * @return whether or not this token is requested + */ + public boolean contains(int i) { + return set.get(i); + } + + /** + * add a request from start to end inclusive + * + * @param start range of token offsets to request (inclusive) + * @param end end range of token offsets to request (inclusive) + */ + public void add(int start, int end) { + for (int i = start; i <= end; i++) { + add(i); + } + } + + /** + * add a request for a specific token + * + * @param i token offset to request the character offsets for + */ + public void add(int i) { + set.set(i); + last = (i > last) ? i : last; + } + + /** + * clear the state of this request object for reuse + */ + public void clear() { + set.clear(); + last = -1; + } + + /** + * @return greatest/last token offset in the request + */ + public int getLast() { + return last; + } + + /** + * @return the set of tokens whose character offsets are requested + */ + protected BitSet getSet() { + return set; + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetsReader.java b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetsReader.java new file mode 100644 index 000000000000..9b81e47f9da4 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/charoffsets/TokenCharOffsetsReader.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.concordance.charoffsets; + +import java.io.IOException; + +import org.apache.lucene.document.Document; + + +/** + * Interface to allow flexibility/optimizations in returning character offsets + * for tokens + */ +public interface TokenCharOffsetsReader { + + public void getTokenCharOffsetResults(final Document document, + final String fieldName, final TokenCharOffsetRequests requests, + final RandomAccessCharOffsetContainer results) throws IOException; + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/AbstractConcordanceWindowCollector.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/AbstractConcordanceWindowCollector.java new file mode 100644 index 000000000000..c055adeb7344 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/AbstractConcordanceWindowCollector.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Abstract class to handle basic information for a ConcordanceWindowSearcher + */ +public abstract class AbstractConcordanceWindowCollector { + //value to use if all windows should be collected + public static final int COLLECT_ALL = -1; + + private final ConcordanceSorter sorter = new ConcordanceSorter(); + private final int maxWindows; + private Set docIds = new HashSet(); + private boolean hitMax = false; + private long totalDocs = 0; + + /** + * @param maxWindows maximum windows to collect + */ + public AbstractConcordanceWindowCollector(int maxWindows) { + this.maxWindows = maxWindows; + } + + /** + * Collect/process this window + * + * @param w window to be processed + */ + public abstract void collect(ConcordanceWindow w); + + /** + * @return number of windows collected + */ + public abstract int size(); + + /** + * @return collected windows (unsorted) + */ + public abstract List getWindows(); + + /** + * @param docId unique key for a document + */ + public void addDocId(String docId) { + docIds.add(docId); + } + + /** + * Sort according to {@link #sorter} and return windows + * + * @return sorted list of windows + */ + public List getSortedWindows() { + List windows = getWindows(); + Collections.sort(windows, sorter); + return windows; + } + + /** + * @return whether or not the searcher collected the maximum number of + * windows and stopped early. 
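+ * When true, the result set was truncated at {@link #getMaxWindows()} windows.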
+ */ + public boolean getHitMax() { + return hitMax; + } + + /** + * @param hitMax did the searcher collect the maximum number of windows + * and stop early + */ + public void setHitMax(boolean hitMax) { + this.hitMax = hitMax; + } + + /** + * @return the maximum number of windows to collect. + * Can be equal to {@link #COLLECT_ALL} + */ + public int getMaxWindows() { + return maxWindows; + } + + /** + * @param totalDocs add this value to {@link #totalDocs} + */ + public void incrementTotalDocs(long totalDocs) { + this.totalDocs += totalDocs; + } + + /** + * @return total number of documents in all indices + */ + public long getTotalDocs() { + return totalDocs; + } + + /** + * @param totalDocs see {@link #getTotalDocs()} + */ + public void setTotalDocs(long totalDocs) { + this.totalDocs = totalDocs; + } + + /** + * @return number of windows in results + */ + public int getNumWindows() { + List windows = getWindows(); + if (windows != null) { + return windows.size(); + } + return 0; + } + + /** + * @return number of documents in results + */ + public int getNumDocs() { + return docIds.size(); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSearcher.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSearcher.java new file mode 100644 index 000000000000..803df2dc6681 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSearcher.java @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.concordance.classic; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.concordance.charoffsets.DocTokenOffsets; +import org.apache.lucene.concordance.charoffsets.DocTokenOffsetsVisitor; +import org.apache.lucene.concordance.charoffsets.OffsetLengthStartComparator; +import org.apache.lucene.concordance.charoffsets.OffsetUtil; +import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer; +import org.apache.lucene.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader; +import org.apache.lucene.concordance.charoffsets.SpansCrawler; +import org.apache.lucene.concordance.charoffsets.TargetTokenNotFoundException; +import org.apache.lucene.concordance.charoffsets.TokenCharOffsetRequests; +import org.apache.lucene.concordance.charoffsets.TokenCharOffsetsReader; +import org.apache.lucene.concordance.util.ConcordanceSearcherUtil; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.spans.SimpleSpanQueryConverter; +import org.apache.lucene.search.spans.SpanQuery; + + +/** + * Searches an IndexReader and returns a list of ConcordanceWindows + */ +public class ConcordanceSearcher { + + /** + * Allow overlapping targets in hits, default = false + */ + private boolean allowTargetOverlaps = false; + + private WindowBuilder windowBuilder; + + private SimpleSpanQueryConverter spanQueryConverter; + + /** + * Constructor with default WindowBuilder and SimpleSpanQueryConverter + */ + public ConcordanceSearcher() { + this(new WindowBuilder(), new SimpleSpanQueryConverter()); + } + + /** + * Constructor for windowbuilder and SimpleSpanQueryConverter + * + * @param windowBuilder window builder + */ + public ConcordanceSearcher(WindowBuilder windowBuilder) { + this(windowBuilder, new SimpleSpanQueryConverter()); + } + + /** + * Constructor for windowBuilder and converter + * + * @param windowBuilder windowBuilder to use to build windows + * @param converter converter to use to convert Query to SpanQuery + */ + public ConcordanceSearcher(WindowBuilder windowBuilder, + SimpleSpanQueryConverter converter) { + this.windowBuilder = windowBuilder; + this.spanQueryConverter = converter; + } + + + /** + * @param searcher searcher to search + * @param fieldName field to build the windows on + * @param mainQuery if SpanQuery, this gets passed through as is. If a regular Query, the + * Query is first converted to a SpanQuery and the filterQuery is modified + * to include the original Query. + * @param filterQuery include a filterQuery mainQuery. 
Value can be null + * @param analyzer analyzer to use for (re)calculating character offsets and for normalizing + * the sort keys + * @param collector collector to use for search + * @throws TargetTokenNotFoundException if target token is not found + * @throws IllegalArgumentException if the field can't be found in the main query + * @throws IOException if there is an underlying IOException in the reader + */ + public void search(IndexSearcher searcher, String fieldName, Query mainQuery, + Query filterQuery, Analyzer analyzer, AbstractConcordanceWindowCollector collector) + throws TargetTokenNotFoundException, IllegalArgumentException, + IOException { + if (mainQuery == null) { + return; + } + if (mainQuery instanceof SpanQuery) { + // pass through + searchSpan(searcher, (SpanQuery) mainQuery, filterQuery, analyzer, collector); + } else { + // convert regular mainQuery to a SpanQuery. + SpanQuery spanQuery = spanQueryConverter.convert(fieldName, mainQuery); + + Query updatedFilter = mainQuery; + + if (filterQuery != null) { + updatedFilter = new BooleanQuery.Builder() + .add(mainQuery, BooleanClause.Occur.MUST) + .add(filterQuery, BooleanClause.Occur.FILTER).build(); + } + searchSpan(searcher, spanQuery, updatedFilter, analyzer, collector); + } + } + + /** + * Like + * {@link #search(IndexSearcher, String, Query, Query, Analyzer, AbstractConcordanceWindowCollector)} + * but this takes a SpanQuery + * + * @param searcher searcher + * @param spanQuery query to use to identify the targets + * @param filter filter for document retrieval + * @param analyzer to re-analyze terms for window calculations and sort key building + * @param collector to process (and store) the results + * @throws TargetTokenNotFoundException if target token is not found + * @throws IllegalArgumentException if the field can't be found in the main query + * @throws IOException if there is an underlying IOException in the reader + */ + public void searchSpan(IndexSearcher searcher, + SpanQuery spanQuery, + Query filter, Analyzer analyzer, AbstractConcordanceWindowCollector collector) + throws TargetTokenNotFoundException, IllegalArgumentException, + IOException { + + Set fields = new HashSet<>( + windowBuilder.getFieldSelector()); + fields.add(spanQuery.getField()); + DocTokenOffsetsVisitor visitor = new ConcDTOffsetVisitor(spanQuery.getField(), analyzer, + fields, collector); + SpansCrawler.crawl(spanQuery, filter, searcher, visitor); + + collector.setTotalDocs(searcher.getIndexReader().numDocs()); + } + + + /** + * Spans can overlap: a search for ["ab cd" "ab"] would have + * two spans on the string "ab cd" if this is set to true. + * If this is set to false, this will return the longest span + * that appears earliest in the string if there is overlap. + * + * @param allowTargetOverlaps are targets allowed to overlap. + */ + public void setAllowTargetOverlaps(boolean allowTargetOverlaps) { + this.allowTargetOverlaps = allowTargetOverlaps; + } + + private void throwMissingField(Document document) throws IllegalArgumentException { + StringBuilder sb = new StringBuilder(); + sb.append("Did you forget to load or specify the correct content field?!"); + sb.append("\n"); + sb.append("I only see these fields:\n"); + for (IndexableField f : document.getFields()) { + sb.append(f.name()).append("\n"); + } + throw new IllegalArgumentException(sb.toString()); + } + + /** + * Set the converter to use to convert a Query to a SpanQuery. + * The need for this will go away when LUCENE-2878 is completed. 
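+ * For example, a (hypothetical) subclass with custom handling for an
+ * otherwise unsupported Query type could be installed here before calling
+ * search; the default is a plain SimpleSpanQueryConverter.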
+ * + * @param converter converter to use to convert queries into SpanQueries + */ + public void setSpanQueryConverter(SimpleSpanQueryConverter converter) { + this.spanQueryConverter = converter; + } + + class ConcDTOffsetVisitor implements DocTokenOffsetsVisitor { + final Set fields; + final DocTokenOffsets docTokenOffsets = new DocTokenOffsets(); + final Analyzer analyzer; + final String fieldName; + final AbstractConcordanceWindowCollector collector; + TokenCharOffsetRequests requests = new TokenCharOffsetRequests(); + + TokenCharOffsetsReader tokenOffsetsRecordReader; + + + RandomAccessCharOffsetContainer offsetResults = new RandomAccessCharOffsetContainer(); + OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator(); + + + ConcDTOffsetVisitor(String fieldName, Analyzer analyzer, Set fields, + AbstractConcordanceWindowCollector collector) { + this.fieldName = fieldName; + this.analyzer = analyzer; + this.fields = fields; + this.collector = collector; + tokenOffsetsRecordReader = new ReanalyzingTokenCharOffsetsReader(analyzer); + + } + @Override + public DocTokenOffsets getDocTokenOffsets() { + return docTokenOffsets; + } + + @Override + public Set getFields() { + return fields; + } + + @Override + public boolean visit(DocTokenOffsets docTokenOffsets) throws IOException { + Document document = docTokenOffsets.getDocument(); + + String[] fieldValues = document.getValues(fieldName); + + if (fieldValues == null || fieldValues.length == 0) { + throwMissingField(document); + } + Map metadata = windowBuilder.extractMetadata(document); + String docId = windowBuilder.getUniqueDocumentId(document, docTokenOffsets.getUniqueDocId()); + + List tokenOffsets = docTokenOffsets.getOffsets(); + if (!allowTargetOverlaps) { + // remove overlapping hits!!! + tokenOffsets = OffsetUtil.removeOverlapsAndSort(tokenOffsets, + offsetLengthStartComparator, null); + } + + //clear then get new requests + requests.clear(); + ConcordanceSearcherUtil.getCharOffsetRequests(tokenOffsets, + windowBuilder.getTokensBefore(), windowBuilder.getTokensAfter(), requests); + + offsetResults.clear(); + + tokenOffsetsRecordReader.getTokenCharOffsetResults( + document, fieldName, requests, offsetResults); + + for (OffsetAttribute offset : tokenOffsets) { + try { + ConcordanceWindow w = windowBuilder.buildConcordanceWindow( + docId, offset.startOffset(), + offset.endOffset() - 1, fieldValues, + offsetResults, metadata); + collector.collect(w); + } catch (TargetTokenNotFoundException e) { + throw new IllegalArgumentException(e); + } + if (collector.getHitMax()) { + return false; + } + } + return true; + } + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortKey.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortKey.java new file mode 100644 index 000000000000..2ffa02a8bf5f --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortKey.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +/** + * Simple comparable class to allow for subclassing. + */ +public class ConcordanceSortKey implements Comparable { + + private final String concSortString; + + public ConcordanceSortKey(String s) { + this.concSortString = s; + } + + @Override + public int compareTo(ConcordanceSortKey other) { + return concSortString.compareTo(other.concSortString); + } + + @Override + public int hashCode() { + return concSortString.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (!(obj instanceof ConcordanceSortKey)) + return false; + ConcordanceSortKey other = (ConcordanceSortKey) obj; + if (concSortString == null) { + if (other.concSortString != null) + return false; + } else if (!concSortString.equals(other.concSortString)) + return false; + return true; + } + + @Override + public String toString() { + return concSortString; + } + + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortOrder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortOrder.java new file mode 100644 index 000000000000..2e7fa757e89e --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSortOrder.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +/** + * Options for sorting ConcordanceWindows + */ +public enum ConcordanceSortOrder { + PRE, // sort on the first token before the target, then the second word, etc. 
+ POST, // sort on words after the target + TARGET_PRE, // sort on the target and then words before the target + TARGET_POST, // sort on the target and then words after the target + DOC, // sort on a string representing a doc id and then by target char offset within the document + NONE // no sort +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSorter.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSorter.java new file mode 100644 index 000000000000..6dee6fb075cd --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceSorter.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +import java.util.Comparator; + + +public class ConcordanceSorter implements Comparator { + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(ConcordanceWindow w1, ConcordanceWindow w2) { + return w1.getSortKey().compareTo(w2.getSortKey()); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceWindow.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceWindow.java new file mode 100644 index 000000000000..863adfa6cfed --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/ConcordanceWindow.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +import java.util.Map; + +/** + * Key element in a concordance view of data. A window consists of the words + * before a target term (pre), the target term and then the words after the + * target term (post). A window also has a sort key to allow for various methods + * of sorting. + *

+ * For various applications, it has also been useful to store a unique document key,
+ * the character offsets (start and end) of the full
+ * window, as well as metadata from the document for the given window.
+ *
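+ * A minimal illustration (the values and the metadata map are only for
+ * this example; the constructor is shown below):
+ * <pre>
+ *   Map<String, String> metadata = new HashMap<>();
+ *   ConcordanceWindow w = new ConcordanceWindow("doc1", 4, 31,
+ *       "jumped over the", "lazy", "dog",
+ *       new ConcordanceSortKey("lazy"), metadata);
+ *   String display = w.toString(); // jumped over the>>>lazy<<<dog
+ * </pre>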

+ * This class is experimental and may change in incompatible ways in the future. + */ +public class ConcordanceWindow { + + private final ConcordanceSortKey sortKey; + private final String pre; + private final String target; + private final String post; + private final int charStart; + private final int charEnd; + private final String uniqueDocID; + //used by hide duplicates to count more than one occurrence of a window + private int count = 1; + private Map metadata; + + /** + * @param uniqueDocID string representing what should be a unique document identifier + * @param charStart character offset start for the window + * @param charEnd character offset end for the window + * @param pre words before the target in reading order and unanalyzed + * @param target target string + * @param post string after the target in reading order and unanalyzed + * @param sortKey key to use for sorting this window + * @param metadata metadata to store with this window + */ + public ConcordanceWindow(String uniqueDocID, int charStart, int charEnd, String pre, + String target, String post, ConcordanceSortKey sortKey, Map metadata) { + this.pre = pre; + this.target = target; + this.post = post; + this.uniqueDocID = uniqueDocID; + this.charStart = charStart; + this.charEnd = charEnd; + this.metadata = metadata; + this.sortKey = sortKey; + } + + public String getUniqueDocID() { + return uniqueDocID; + } + + public int getStart() { + return charStart; + } + + public int getEnd() { + return charEnd; + } + + public Map getMetadata() { + return metadata; + } + + public String getPre() { + return pre; + } + + public String getPost() { + return post; + } + + public String getTarget() { + return target; + } + + public int getCount() { + return count; + } + + public void setCount(int count) { + this.count = count; + } + + public void incrementCount() { + count++; + } + + public int getSize() { + int size = 0; + if (pre != null) { + size += pre.length(); + } + if (target != null) { + size += target.length(); + } + if (post != null) { + size += post.length(); + } + return size; + } + + public ConcordanceSortKey getSortKey() { + return sortKey; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((post == null) ? 0 : post.hashCode()); + result = prime * result + ((pre == null) ? 0 : pre.hashCode()); + result = prime * result + ((target == null) ? 
0 : target.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof ConcordanceWindow)) { + return false; + } + ConcordanceWindow other = (ConcordanceWindow) obj; + if (post == null) { + if (other.post != null) { + return false; + } + } else if (!post.equals(other.post)) { + return false; + } + if (pre == null) { + if (other.pre != null) { + return false; + } + } else if (!pre.equals(other.pre)) { + return false; + } + if (target == null) { + if (other.target != null) { + return false; + } + } else if (!target.equals(other.target)) { + return false; + } + return true; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(pre).append(">>>").append(target).append("<<<").append(post); + return sb.toString(); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocIdBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocIdBuilder.java new file mode 100644 index 000000000000..9dbfce7f2738 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocIdBuilder.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +import java.util.Set; + +import org.apache.lucene.document.Document; + + +/** + * Returns a unique string for each document. + * Some implementations may be able to rely only + * on the ephemeral Lucene docId. Others, may + * want to use a field within the document. + */ +public interface DocIdBuilder { + + public Set getFields(); + public String build(Document document, long docId); +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocMetadataExtractor.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocMetadataExtractor.java new file mode 100644 index 000000000000..e16c50ec3469 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/DocMetadataExtractor.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.document.Document; + +/** + * Simple interface for a component that extracts metadata from + * a document to be stored with a ConcordanceWindow + */ +public interface DocMetadataExtractor { + + /** + * @return the fields that need to be retrieved for the document + * for proper processing + */ + public Set getFieldSelector(); + + /** + * @param document to be processed for metadata. Only those fields + * that were returned by {@link #getFieldSelector()} will be loaded + * in the document + * @return document metadata to be stored with each window + */ + public Map extract(Document document); + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/SortKeyBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/SortKeyBuilder.java new file mode 100644 index 000000000000..c05c14879549 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/SortKeyBuilder.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.concordance.classic; + +import java.util.Map; + +import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer; + +public interface SortKeyBuilder { + + /** + * Builds a sort key from the classic TokenCharOffsetResults object + * + * @param docKey to be used if sorting by document key + * @param startTargetTokenOffset start target token offest + * @param endTargetTokenOffset end target token offset + * @param charOffsets charoffsets + * @param numTokensPre number of tokens before + * @param numTokensPost number of tokens after + * @param metadata metadata + * @return ConcordanceSortKey + */ + ConcordanceSortKey buildKey(String docKey, + int startTargetTokenOffset, int endTargetTokenOffset, + RandomAccessCharOffsetContainer charOffsets, + int numTokensPre, int numTokensPost, Map metadata); + + public boolean requiresAnalysisOfPre(); + + public boolean requiresAnalysisOfPost(); + + public boolean requiresAnalysisOfTarget(); +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/WindowBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/WindowBuilder.java new file mode 100644 index 000000000000..d8a4fb5d6b7d --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/WindowBuilder.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl; +import org.apache.lucene.document.Document; +import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer; +import org.apache.lucene.concordance.charoffsets.SimpleAnalyzerUtil; +import org.apache.lucene.concordance.charoffsets.TargetTokenNotFoundException; +import org.apache.lucene.concordance.classic.impl.DefaultSortKeyBuilder; +import org.apache.lucene.concordance.classic.impl.FieldBasedDocIdBuilder; +import org.apache.lucene.concordance.classic.impl.IndexIdDocIdBuilder; +import org.apache.lucene.concordance.classic.impl.SimpleDocMetadataExtractor; + + +/** + * Builds a ConcordanceWindow. + *

+ * This class includes basic functionality for building a window from token offsets. + *

+ * It also calls three other components, as sketched after this list:
+ *

    + *
+ *   1. DocIdBuilder - extracts or builds a unique key for each document
+ *   2. DocMetadataExtractor - extracts metadata from a document to be stored with each window
+ *   3. SortKeyBuilder - builds a window's sort key
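+ *
+ * As a sketch of how these fit together (the argument values are
+ * illustrative; the three components are the impl classes included in this patch):
+ * <pre>
+ *   WindowBuilder windowBuilder = new WindowBuilder(
+ *       5, 5, 0,   // tokens before, tokens after, offset gap
+ *       new DefaultSortKeyBuilder(ConcordanceSortOrder.TARGET_PRE),
+ *       new SimpleDocMetadataExtractor("title"),
+ *       new IndexIdDocIdBuilder());
+ * </pre>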
+ */ +public class WindowBuilder { + + private final static String EMPTY_STRING = ""; + private static String INTER_MULTIVALUE_FIELD_PADDING = " | "; + private final int tokensBefore; + private final int tokensAfter; + private final SortKeyBuilder sortKeyBuilder; + private final DocMetadataExtractor metadataExtractor; + private final DocIdBuilder docIdBuilder; + private final int offsetGap; + + public WindowBuilder() { + this( + 10, //tokens before + 10, //tokens after + 0, + new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), + new SimpleDocMetadataExtractor(), + new IndexIdDocIdBuilder() + ); + } + + public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap) { + this( + tokensBefore, + tokensAfter, + offsetGap, + new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), + new SimpleDocMetadataExtractor(), + new IndexIdDocIdBuilder() + ); + } + + public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap, SortKeyBuilder sortKeyBuilder, + DocMetadataExtractor metadataExtractor, DocIdBuilder docIdBuilder) { + this.tokensBefore = tokensBefore; + this.tokensAfter = tokensAfter; + this.offsetGap = offsetGap; + this.sortKeyBuilder = sortKeyBuilder; + this.metadataExtractor = metadataExtractor; + this.docIdBuilder = docIdBuilder; + } + + + /** + * + * Makes the assumption that the target token start and target token end can + * be found. If not, this returns a null. + * + * @param uniqueDocID ephemeral internal lucene unique document id + * @param targetTokenStart Target's start token + * @param targetTokenEnd Target's end token + * @param fieldValues field values + * @param metadata Metadata to be stored with the window + * @param offsets TokenOffsetResults from + * @return ConcordanceWindow or null if character offset information cannot be + * found for both the targetTokenStart and the targetTokenEnd + + * @throws TargetTokenNotFoundException if target token cannot be found + * @throws IllegalArgumentException if the start token comes after the end token, e.g. + */ + public ConcordanceWindow buildConcordanceWindow(String uniqueDocID, + int targetTokenStart, int targetTokenEnd, + String[] fieldValues, + RandomAccessCharOffsetContainer offsets, + Map metadata) + throws TargetTokenNotFoundException, + IllegalArgumentException { + + if (targetTokenStart < 0 || targetTokenEnd < 0) { + throw new IllegalArgumentException( + "targetTokenStart and targetTokenEnd must be >= 0"); + } + if (targetTokenEnd < targetTokenStart) { + throw new IllegalArgumentException( + "targetTokenEnd must be >= targetTokenStart"); + } + + int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart); + int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd); + + if (targetCharStart < 0 || + targetCharEnd < 0) { + throw new TargetTokenNotFoundException( + "couldn't find character offsets for a target token.\n" + + "Check that your analyzers are configured properly.\n"); + } + + OffsetAttribute preCharOffset = getPreCharOffset(targetTokenStart, + targetCharStart, offsets); + String preString = (preCharOffset == null) ? EMPTY_STRING : + SimpleAnalyzerUtil.substringFromMultiValuedFields( + preCharOffset.startOffset(), preCharOffset.endOffset(), fieldValues, + offsetGap, INTER_MULTIVALUE_FIELD_PADDING); + + OffsetAttribute postCharOffset = getPostCharOffset(targetTokenEnd, + targetCharEnd, offsets); + + String postString = (postCharOffset == null) ? 
EMPTY_STRING : + SimpleAnalyzerUtil.substringFromMultiValuedFields( + postCharOffset.startOffset(), postCharOffset.endOffset(), fieldValues, + offsetGap, INTER_MULTIVALUE_FIELD_PADDING); + + String targString = SimpleAnalyzerUtil.substringFromMultiValuedFields( + targetCharStart, targetCharEnd, fieldValues, + offsetGap, INTER_MULTIVALUE_FIELD_PADDING); + ConcordanceSortKey sortKey = sortKeyBuilder.buildKey(uniqueDocID, + targetTokenStart, targetTokenEnd, offsets, tokensBefore, tokensAfter, metadata); + int charStart = (preCharOffset == null) ? targetCharStart : + preCharOffset.startOffset(); + + int charEnd = (postCharOffset == null) ? targetCharEnd : postCharOffset.endOffset(); + return new ConcordanceWindow(uniqueDocID, charStart, charEnd, preString, targString, + postString, sortKey, metadata); + + } + + + private OffsetAttribute getPreCharOffset(int targetTokenStart, + int targetCharStart, + RandomAccessCharOffsetContainer charOffsets) { + if (tokensBefore == 0) + return null; + + if (targetTokenStart == 0) { + return null; + } + int contextTokenStart = Math.max(0, + targetTokenStart - tokensBefore); + + int contextCharStart = charOffsets.getClosestCharStart(contextTokenStart, targetTokenStart); + //closest start wasn't actually found + //this can happen if there is a large posInc and the target + //lands at the start of a field index + if (contextCharStart < 0) { + return null; + } + int contextCharEnd = Math.max(contextCharStart, targetCharStart - 1); + + return buildOffsetAttribute(contextCharStart, contextCharEnd); + } + + private OffsetAttribute getPostCharOffset(int targetTokenEnd, + int targetCharEnd, + RandomAccessCharOffsetContainer charOffsets) { + + if (tokensAfter == 0) + return null; + + int contextTokenEnd = targetTokenEnd + tokensAfter; + int contextCharStart = targetCharEnd; + int contextCharEnd = charOffsets.getClosestCharEnd( + contextTokenEnd, targetTokenEnd + 1); + + if (contextCharStart >= contextCharEnd) { + return null; + } + return buildOffsetAttribute(contextCharStart, contextCharEnd); + } + + private OffsetAttribute buildOffsetAttribute(int start, int end) { + OffsetAttribute off = new OffsetAttributeImpl(); + off.setOffset(start, end); + return off; + } + + + public Set getFieldSelector() { + Set set = new HashSet<>(); + set.addAll(metadataExtractor.getFieldSelector()); + if (docIdBuilder instanceof FieldBasedDocIdBuilder) { + set.addAll(((FieldBasedDocIdBuilder) docIdBuilder).getFields()); + } + return set; + } + + /** + * Simple wrapper around metadataExtractor + * + * @param document document from which to extract metadata + * @return map + */ + public Map extractMetadata(Document document) { + return metadataExtractor.extract(document); + } + + public String getUniqueDocumentId(Document document, long docId) { + return docIdBuilder.build(document, docId); + } + + public int getTokensBefore() { + return tokensBefore; + } + + public int getTokensAfter() { + return tokensAfter; + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/ConcordanceWindowCollector.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/ConcordanceWindowCollector.java new file mode 100644 index 000000000000..6315339ef13a --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/ConcordanceWindowCollector.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic.impl; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.concordance.classic.AbstractConcordanceWindowCollector; +import org.apache.lucene.concordance.classic.ConcordanceWindow; + +public class ConcordanceWindowCollector extends AbstractConcordanceWindowCollector { + + private List windows = new ArrayList(); + + public ConcordanceWindowCollector(int maxWindows) { + super(maxWindows); + } + + @Override + public void collect(ConcordanceWindow w) { + if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL + && windows.size() >= getMaxWindows()) { + setHitMax(true); + return; + } + windows.add(w); + addDocId(w.getUniqueDocID()); + } + + @Override + public int size() { + return windows.size(); + } + + @Override + public List getWindows() { + return windows; + } + + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DedupingConcordanceWindowCollector.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DedupingConcordanceWindowCollector.java new file mode 100644 index 000000000000..201ef11a73f8 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DedupingConcordanceWindowCollector.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic.impl; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.concordance.classic.AbstractConcordanceWindowCollector; +import org.apache.lucene.concordance.classic.ConcordanceWindow; + +/** + * Like ConcordanceWindowCollector, but this collector + * doesn't store duplicate windows. Windows are defined as duplicates by + * {@link #buildEqualityKey(ConcordanceWindow, StringBuilder)}. 
+ */ +public class DedupingConcordanceWindowCollector extends AbstractConcordanceWindowCollector { + + Map map = new HashMap(); + private StringBuilder sb = new StringBuilder(); + + /** + * @param maxHits maximum number of windows to store. This could potentially + * visit lots more windows than maxHits. + */ + public DedupingConcordanceWindowCollector(int maxHits) { + super(maxHits); + } + + @Override + public void collect(ConcordanceWindow w) { + if (getHitMax() == true) { + return; + } + buildEqualityKey(w, sb); + String key = sb.toString(); + ConcordanceWindow oldWindow = map.get(key); + if (oldWindow == null) { + //we would have added a new window here + if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL && + map.size() >= getMaxWindows()) { + setHitMax(true); + return; + } + oldWindow = w; + } else { + //if the old window existed (i.e. new window is a duplicate) + //keep incrementing the count + oldWindow.incrementCount(); + } + + map.put(key, oldWindow); + addDocId(w.getUniqueDocID()); + } + + + /** + * number of windows collected + */ + @Override + public int size() { + return map.size(); + } + + @Override + public List getWindows() { + List windows = new ArrayList<>(); + windows.addAll(map.values()); + return windows; + } + + /** + * Public for easy overriding. Generate a key to be used to determine + * whether two windows are the same. Some implementations + * might want to lowercase, some might want genuine case folding, + * some might want to strip non-alphanumerics, etc. + *
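+ *
+ * For instance, a subclass that also strips non-alphanumerics might look
+ * like this (a sketch, not part of this patch):
+ * <pre>
+ *   public void buildEqualityKey(ConcordanceWindow w, StringBuilder sb) {
+ *     sb.setLength(0);  // required: the StringBuilder is reused across windows
+ *     sb.append(w.getPre().replaceAll("[^\\p{IsAlphabetic}\\p{Digit}]+", " ").toLowerCase());
+ *     sb.append(">>>").append(w.getTarget().toLowerCase()).append("<<<");
+ *     sb.append(w.getPost().replaceAll("[^\\p{IsAlphabetic}\\p{Digit}]+", " ").toLowerCase());
+ *   }
+ * </pre>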

+ * If you are overriding this, make sure to call sb.setLength(0)! + * + * @param w ConcordanceWindow + * @param sb reuseable StringBuilder; sb.setLength(0) is called before use! + */ + public void buildEqualityKey(ConcordanceWindow w, StringBuilder sb) { + sb.setLength(0); + sb.append(w.getPre().toLowerCase()); + sb.append(">>>"); + sb.append(w.getTarget().toLowerCase()); + sb.append("<<<"); + sb.append(w.getPost().toLowerCase()); + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DefaultSortKeyBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DefaultSortKeyBuilder.java new file mode 100644 index 000000000000..4e9026ccca60 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DefaultSortKeyBuilder.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic.impl; + +import java.util.Map; + +import org.apache.lucene.concordance.charoffsets.RandomAccessCharOffsetContainer; +import org.apache.lucene.concordance.classic.ConcordanceSortKey; +import org.apache.lucene.concordance.classic.ConcordanceSortOrder; +import org.apache.lucene.concordance.classic.SortKeyBuilder; + +/** + * Builds basic sort key for the values available in ConcordanceSortOrder + */ +public class DefaultSortKeyBuilder implements SortKeyBuilder { + + private final static String SPACE = " "; + private final static String EMPTY_STRING = ""; + //what filler to use when a "term" comes back as null from the + //TokenCharOffsetResults + private static String NULL_FILLER = ""; + private final ConcordanceSortOrder sortOrder; + + /** + * Calls {@link #DefaultSortKeyBuilder(ConcordanceSortOrder)} + * with value of: ConcordanceSortOrder.PRE + */ + public DefaultSortKeyBuilder() { + this.sortOrder = ConcordanceSortOrder.PRE; + } + + /** + * @param sortOrder sort order to use + */ + public DefaultSortKeyBuilder(ConcordanceSortOrder sortOrder) { + this.sortOrder = sortOrder; + } + + @Override + public ConcordanceSortKey buildKey(String docKey, + int startTargetTokenOffset, + int endTargetTokenOffset, + RandomAccessCharOffsetContainer charOffsets, + int tokensBefore, int tokensAfter, + Map metadata) { + + if (sortOrder == ConcordanceSortOrder.NONE) { + return new ConcordanceSortKey(EMPTY_STRING); + } + + if (sortOrder == ConcordanceSortOrder.DOC) { + int targCharStart = charOffsets.getCharacterOffsetStart(startTargetTokenOffset); + return new DocumentOrderSortKey(docKey, targCharStart); + } + + StringBuilder sb = new StringBuilder(); + //order is important for appending to sb, target must come before pre/post + if (sortOrder == ConcordanceSortOrder.TARGET_POST + || sortOrder == ConcordanceSortOrder.TARGET_PRE) { + + for 
(int i = startTargetTokenOffset; i <= endTargetTokenOffset; i++) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } else { + sb.append(NULL_FILLER); + } + } + } + if (sortOrder == ConcordanceSortOrder.PRE + || sortOrder == ConcordanceSortOrder.TARGET_PRE) { + int tmpStart = startTargetTokenOffset - 1; + int tmpEnd = Math.max(0, startTargetTokenOffset - tokensBefore); + if (tmpStart < 0) { + sb.append(SPACE); + } + + for (int i = tmpStart; i >= tmpEnd; i--) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } else { + sb.append(NULL_FILLER); + } + } + + } else if (sortOrder == ConcordanceSortOrder.POST + || sortOrder == ConcordanceSortOrder.TARGET_POST) { + + int tmpStart = endTargetTokenOffset + 1; + int tmpEnd = Math.min(charOffsets.getLast(), endTargetTokenOffset + tokensAfter); + + if (tmpStart > charOffsets.getLast()) { + sb.append(SPACE); + } + for (int i = tmpStart; i <= tmpEnd; i++) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } else { + sb.append(NULL_FILLER); + } + } + } + return new ConcordanceSortKey(sb.toString().trim()); + } + + @Override + public boolean requiresAnalysisOfPre() { + if (sortOrder == ConcordanceSortOrder.PRE + || sortOrder == ConcordanceSortOrder.TARGET_PRE) { + return true; + } + return false; + } + + @Override + public boolean requiresAnalysisOfPost() { + if (sortOrder == ConcordanceSortOrder.POST + || sortOrder == ConcordanceSortOrder.TARGET_POST) { + return true; + } + return false; + } + + @Override + public boolean requiresAnalysisOfTarget() { + if (sortOrder == ConcordanceSortOrder.TARGET_PRE + || sortOrder == ConcordanceSortOrder.TARGET_POST) { + return true; + } + return false; + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DocumentOrderSortKey.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DocumentOrderSortKey.java new file mode 100644 index 000000000000..99a70abde9d2 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/DocumentOrderSortKey.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.concordance.classic.impl; + +import org.apache.lucene.concordance.classic.ConcordanceSortKey; + +/** + * This sorts based alphabetically on the document key + * and then numerically on the + */ +public class DocumentOrderSortKey extends ConcordanceSortKey { + + protected final int targetCharStart; + + public DocumentOrderSortKey(String docKey, int targetCharStart) { + super(docKey); + this.targetCharStart = targetCharStart; + } + + @Override + public int compareTo(ConcordanceSortKey o) { + if (o instanceof DocumentOrderSortKey) { + DocumentOrderSortKey other = (DocumentOrderSortKey) o; + int cmp = super.compareTo(o); + if (cmp == 0) { + return Integer.compare(targetCharStart, other.targetCharStart); + } + return cmp; + } else { + return super.compareTo(o); + } + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/FieldBasedDocIdBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/FieldBasedDocIdBuilder.java new file mode 100644 index 000000000000..aeb43eefd441 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/FieldBasedDocIdBuilder.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic.impl; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.concordance.classic.DocIdBuilder; + +/** + * Simple class that grabs the stringValue() of a specified + * field to use as a document's unique key for the ConcordanceWindow + * building process. + *
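+ * For example (the field name is illustrative only):
+ * <pre>
+ *   DocIdBuilder idBuilder = new FieldBasedDocIdBuilder("filename");
+ *   String key = idBuilder.build(document, luceneDocId);
+ *   // key is document.getField("filename").stringValue(), or the string
+ *   // form of luceneDocId if the document has no such field
+ * </pre>
+ *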

+ * Note that this takes only the first value of the field.
+ * If a multi-valued field is selected, values after the first are
+ * silently ignored, which may yield unexpected document keys.
+ *

+ * Also, note that if the field is not found, this returns + * a string representation of the ephemeral Lucene docId. + *

+ * Some users might want to throw an exception instead of this behavior. + */ +public class FieldBasedDocIdBuilder implements DocIdBuilder { + + private final String fieldName; + + /** + * @param fieldName, name of field to be used as a document's unique key + */ + public FieldBasedDocIdBuilder(String fieldName) { + this.fieldName = fieldName; + } + + @Override + public String build(Document d, long docId) { + IndexableField field = d.getField(fieldName); + //should probably throw exception, no?! + if (field == null) { + return Long.toString(docId); + } + return field.stringValue(); + } + + /** + * Instead of getField(String fieldName), this allows for extension + * + * @return fields to use + */ + public Set getFields() { + Set fields = new HashSet(); + fields.add(fieldName); + return fields; + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/IndexIdDocIdBuilder.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/IndexIdDocIdBuilder.java new file mode 100644 index 000000000000..c4a4c42ee824 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/IndexIdDocIdBuilder.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic.impl; + +import java.util.Collections; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.concordance.classic.DocIdBuilder; + +/** + * Simple id builder based on ephemeral Lucene doc ids. + * Use this only if your documents do not have a unique key. + * Then, use only with great care. + */ +public class IndexIdDocIdBuilder implements DocIdBuilder { + + + @Override + public Set getFields() { + return Collections.EMPTY_SET; + } + + @Override + public String build(Document d, long docId) { + return Long.toString(docId); + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/SimpleDocMetadataExtractor.java b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/SimpleDocMetadataExtractor.java new file mode 100644 index 000000000000..6e2bafcf1a9a --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/impl/SimpleDocMetadataExtractor.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.classic.impl; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.concordance.classic.DocMetadataExtractor; + +/** + * Simple class that returns a map of key value pairs + * for the fields specified during initialization. + *
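+ * For example (field names are illustrative only):
+ * <pre>
+ *   DocMetadataExtractor extractor =
+ *       new SimpleDocMetadataExtractor("author", "date");
+ *   Map<String, String> metadata = extractor.extract(document);
+ * </pre>
+ *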

+ * Beware! For multi-valued fields, this will take only the first value. + */ +public class SimpleDocMetadataExtractor implements DocMetadataExtractor { + + private Set fields = new HashSet<>(); + + public SimpleDocMetadataExtractor(String... fields) { + for (String f : fields) { + this.fields.add(f); + } + } + + public SimpleDocMetadataExtractor(Set fields) { + this.fields.addAll(fields); + } + + public void addField(String f) { + fields.add(f); + } + + @Override + public Set getFieldSelector() { + return Collections.unmodifiableSet(fields); + } + + @Override + public Map extract(Document d) { + Map map = new HashMap<>(); + // only takes the first value in a multi-valued field!!! + for (String fieldName : getFieldSelector()) { + String[] fieldValues = d.getValues(fieldName); + + if (fieldValues != null && fieldValues.length > 0) { + map.put(fieldName, fieldValues[0]); + } + } + return map; + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/classic/package.html b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/package.html new file mode 100644 index 000000000000..3635ce638caa --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/classic/package.html @@ -0,0 +1,31 @@ + + + + +

+ ConcordanceSearcher performs a search on an index and returns concordance windows. +
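+
+ A rough end-to-end sketch. Hedged: the exact search signature is defined on
+ ConcordanceSearcher elsewhere in this patch, so the call below is indicative
+ rather than definitive; the field name and query are illustrative.
+ <pre>
+   ConcordanceSearcher searcher = new ConcordanceSearcher(new WindowBuilder());
+   ConcordanceWindowCollector collector = new ConcordanceWindowCollector(1000);
+   searcher.search(indexSearcher, "text",
+       new SpanTermQuery(new Term("text", "concordance")),
+       null, analyzer, collector);
+   for (ConcordanceWindow window : collector.getWindows()) {
+     System.out.println(window);
+   }
+ </pre>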

+ +

+ This currently relies heavily on SpanQueries.  When they are nuked (LUCENE-2878),
+ this will be modified to perform the same behavior with a Scorer.
+

+ + + + diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/package.html b/lucene/concordance/src/java/org/apache/lucene/concordance/package.html new file mode 100644 index 000000000000..ef21389527f1 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/package.html @@ -0,0 +1,30 @@ + + + + +

The concordance package includes two primary areas of functionality: +

    +
+   1. a traditional concordancer to produce concordance results for human use
+      (see: oal.corpus.concordance.classic)
+   2. a concordance window visitor to enable calculations of statistics on target terms or
+      context terms (see: oal.corpus.concordance.windowvisitor)
+

+ + diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/util/ConcordanceSearcherUtil.java b/lucene/concordance/src/java/org/apache/lucene/concordance/util/ConcordanceSearcherUtil.java new file mode 100644 index 000000000000..d5292179f365 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/util/ConcordanceSearcherUtil.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.util; + +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.concordance.charoffsets.TokenCharOffsetRequests; + +/** + * In other applications with variations on the ConcordanceSearcher, it has been + * useful to factor out the getCharOffsetRequests. + *

+ * This class should be used for functionality that is generally useful for + * concordance searching. + */ +public class ConcordanceSearcherUtil { + + + /** + * Simple utility method to build a TokenCharOffsetRequests object + * from a list of desired tokenOffsets, the number of tokensBefore + * and the number of tokensAfter. + * + * @param tokenOffsets the tokenOffsets that are desired + * @param tokensBefore the number of tokens before a desired tokenOffset + * @param tokensAfter the number of tokens after a desired tokenOffset + * @param requests an empty requests to be filled in + */ + public static void getCharOffsetRequests( + List tokenOffsets, + int tokensBefore, int tokensAfter, + TokenCharOffsetRequests requests) { + + for (OffsetAttribute tokenOffset : tokenOffsets) { + int start = tokenOffset.startOffset() - tokensBefore; + start = (start < 0) ? 0 : start; + int end = tokenOffset.endOffset() + tokensAfter + 1; + for (int i = start; i < end; i++) { + requests.add(i); + } + } + } + +} diff --git a/lucene/concordance/src/java/org/apache/lucene/concordance/util/SimpleTargetTermResults.java b/lucene/concordance/src/java/org/apache/lucene/concordance/util/SimpleTargetTermResults.java new file mode 100644 index 000000000000..200cff13bd32 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/concordance/util/SimpleTargetTermResults.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.concordance.util; + +import java.util.Map; + +/** + * Simple class to hold document frequencies and term frequencies + * for terms. + */ +public class SimpleTargetTermResults { + private final Map tfs; + private final Map dfs; + + /** + * @param dfs document frequencies + * @param tfs term frequencies + */ + protected SimpleTargetTermResults(Map dfs, + Map tfs) { + this.dfs = dfs; + this.tfs = tfs; + } + + /** + * @return term frequency map + */ + public Map getTermFreqs() { + return tfs; + } + + /** + * @return document frequency map + */ + public Map getDocFreqs() { + return dfs; + } +} diff --git a/lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java b/lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java new file mode 100644 index 000000000000..fbfa59a37c68 --- /dev/null +++ b/lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.spans; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SynonymQuery; +import org.apache.lucene.search.TermQuery; + +public class SimpleSpanQueryConverter { + /** + * Converts a regular query to a {@link org.apache.lucene.search.spans.SpanQuery} for use in a highlighter. + * Because of subtle differences in {@link org.apache.lucene.search.spans.SpanQuery} and {@link org.apache.lucene.search.Query}, this + * {@link org.apache.lucene.search.spans.SpanQuery} will not necessarily return the same documents as the + * initial Query. For example, the generated SpanQuery will not include + * clauses of type BooleanClause.Occur.MUST_NOT. Also, the + * {@link org.apache.lucene.search.spans.SpanQuery} will only cover a single field, whereas the {@link org.apache.lucene.search.Query} + * might contain multiple fields. + *

+ * Returns an empty SpanQuery if the {@link org.apache.lucene.search.Query} is a class that + * is handled, but for some reason can't be converted from a {@link org.apache.lucene.search.Query} to a + * {@link org.apache.lucene.search.spans.SpanQuery}. This can happen for many reasons: e.g. if the Query + * contains no terms in the requested "field" or the Query is a MatchAllDocsQuery. + *

+ * Throws IllegalArgumentException if the Query is a class that
+ * is not yet handled.
+ *

+ * This class does not rewrite the SpanQuery before returning it. + * Clients are required to rewrite if necessary. + *
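+ *
+ * For example (field and term values are illustrative only):
+ * <pre>
+ *   SimpleSpanQueryConverter converter = new SimpleSpanQueryConverter();
+ *   Query q = new BooleanQuery.Builder()
+ *       .add(new TermQuery(new Term("f1", "apple")), BooleanClause.Occur.SHOULD)
+ *       .add(new TermQuery(new Term("f2", "pear")), BooleanClause.Occur.SHOULD)
+ *       .build();
+ *   SpanQuery spanQuery = converter.convert("f1", q);
+ *   // only the f1 clause survives; rewrite spanQuery before searching with it
+ * </pre>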

+ * Much of this code is copied directly from + * oal.search.highlight.WeightedSpanTermExtractor. There are some subtle + * differences. + *

+   * Throws IllegalArgumentException if an unknown query type is passed in.
+   *
+   * @param field single field to extract SpanQueries for
+   * @param queryToConvert query to convert
+   * @return SpanQuery for use in highlighting; can return an empty SpanQuery
+   * @throws IOException if encountered during conversion
+   */
+  public SpanQuery convert(String field, Query queryToConvert) throws IOException {
+
+    Float boost = null;
+    Query query = queryToConvert;
+    if (queryToConvert instanceof BoostQuery) {
+      // read the boost before unwrapping; the wrapped query is generally not a BoostQuery
+      boost = ((BoostQuery) queryToConvert).getBoost();
+      query = ((BoostQuery) queryToConvert).getQuery();
+    }
+    /*
+     * copied nearly verbatim from
+     * org.apache.lucene.search.highlight.WeightedSpanTermExtractor
+     * TODO: refactor to avoid duplication of code if possible.
+     * Beware: there are some subtle differences.
+     */
+    if (query instanceof SpanQuery) {
+      SpanQuery sq = (SpanQuery) query;
+      if (sq.getField().equals(field)) {
+        return sq;
+      } else {
+        return getEmptySpanQuery();
+      }
+    } else if (query instanceof BooleanQuery) {
+      List<BooleanClause> queryClauses = ((BooleanQuery) query).clauses();
+      List<SpanQuery> spanQs = new ArrayList<>();
+      for (int i = 0; i < queryClauses.size(); i++) {
+        if (!queryClauses.get(i).isProhibited()) {
+          tryToAdd(field, convert(field, queryClauses.get(i).getQuery()), spanQs);
+        }
+      }
+      return addBoost(buildSpanOr(spanQs), boost);
+    } else if (query instanceof PhraseQuery) {
+      PhraseQuery phraseQuery = ((PhraseQuery) query);
+
+      Term[] phraseQueryTerms = phraseQuery.getTerms();
+      if (phraseQueryTerms.length == 0) {
+        return getEmptySpanQuery();
+      } else if (!phraseQueryTerms[0].field().equals(field)) {
+        return getEmptySpanQuery();
+      }
+      SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
+      for (int i = 0; i < phraseQueryTerms.length; i++) {
+        clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
+      }
+      int slop = phraseQuery.getSlop();
+      int[] positions = phraseQuery.getPositions();
+      // sum position increments (>1) and add to slop
+      if (positions.length > 0) {
+        int lastPos = positions[0];
+        int sz = positions.length;
+        for (int i = 1; i < sz; i++) {
+          int pos = positions[i];
+          int inc = pos - lastPos - 1;
+          slop += inc;
+          lastPos = pos;
+        }
+      }
+
+      boolean inorder = false;
+
+      if (phraseQuery.getSlop() == 0) {
+        inorder = true;
+      }
+
+      SpanQuery sp = new SpanNearQuery(clauses, slop, inorder);
+      return addBoost(sp, boost);
+    } else if (query instanceof TermQuery) {
+      TermQuery tq = (TermQuery) query;
+      if (tq.getTerm().field().equals(field)) {
+        return addBoost(new SpanTermQuery(tq.getTerm()), boost);
+      } else {
+        return getEmptySpanQuery();
+      }
+    } else if (query instanceof ConstantScoreQuery) {
+      return convert(field, ((ConstantScoreQuery) query).getQuery());
+    } else if (query instanceof DisjunctionMaxQuery) {
+      List<SpanQuery> spanQs = new ArrayList<>();
+      for (Iterator<Query> iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext(); ) {
+        tryToAdd(field, convert(field, iterator.next()), spanQs);
+      }
+      if (spanQs.size() == 0) {
+        return getEmptySpanQuery();
+      } else if (spanQs.size() == 1) {
+        return addBoost(spanQs.get(0), boost);
+      } else {
+        return addBoost(new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])), boost);
+      }
+    } else if (query instanceof MatchAllDocsQuery) {
+      return getEmptySpanQuery();
+    } else if (query instanceof MultiPhraseQuery) {
+
+      final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
+
+      final Term[][] termArrays = mpq.getTermArrays();
+      // test for empty or wrong field; a query on another field yields an empty result
+      if (termArrays.length == 0) {
+        return getEmptySpanQuery();
+      } else if (termArrays.length > 0) {
+        Term[] ts = termArrays[0];
+        if (ts.length > 0) {
+          Term t = ts[0];
+          if (!t.field().equals(field)) {
+            return getEmptySpanQuery();
+          }
+        }
+      }
+      final int[] positions = mpq.getPositions();
+      if (positions.length > 0) {
+
+        int maxPosition = positions[positions.length - 1];
+        for (int i = 0; i < positions.length - 1; ++i) {
+          if (positions[i] > maxPosition) {
+            maxPosition = positions[i];
+          }
+        }
+
+        @SuppressWarnings("unchecked")
+        final List<SpanQuery>[] disjunctLists = new List[maxPosition + 1];
+        int distinctPositions = 0;
+
+        for (int i = 0; i < termArrays.length; ++i) {
+          final Term[] termArray = termArrays[i];
+          List<SpanQuery> disjuncts = disjunctLists[positions[i]];
+          if (disjuncts == null) {
+            disjuncts = (disjunctLists[positions[i]] = new ArrayList<>(termArray.length));
+            ++distinctPositions;
+          }
+          for (int j = 0; j < termArray.length; ++j) {
+            disjuncts.add(new SpanTermQuery(termArray[j]));
+          }
+        }
+
+        int positionGaps = 0;
+        int position = 0;
+        final SpanQuery[] clauses = new SpanQuery[distinctPositions];
+        for (int i = 0; i < disjunctLists.length; ++i) {
+          List<SpanQuery> disjuncts = disjunctLists[i];
+          if (disjuncts != null) {
+            if (disjuncts.size() == 1) {
+              clauses[position++] = disjuncts.get(0);
+            } else {
+              clauses[position++] = new SpanOrQuery(
+                  disjuncts.toArray(new SpanQuery[disjuncts.size()]));
+            }
+          } else {
+            ++positionGaps;
+          }
+        }
+
+        final int slop = mpq.getSlop();
+        final boolean inorder = (slop == 0);
+
+        SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
+        return addBoost(sp, boost);
+      }
+    } else if (query instanceof MultiTermQuery) {
+      MultiTermQuery tq = (MultiTermQuery) query;
+      if (!tq.getField().equals(field)) {
+        return getEmptySpanQuery();
+      }
+      return addBoost(new SpanMultiTermQueryWrapper<>(tq), boost);
+    } else if (query instanceof SynonymQuery) {
+      SynonymQuery sq = (SynonymQuery) query;
+      List<SpanQuery> spanQs = new ArrayList<>();
+      for (Term t : sq.getTerms()) {
+        spanQs.add(new SpanTermQuery(t));
+      }
+      return addBoost(buildSpanOr(spanQs), boost);
+    }
+    return convertUnknownQuery(field, queryToConvert);
+  }
+
+  private SpanQuery buildSpanOr(List<SpanQuery> spanQs) {
+    if (spanQs.size() == 0) {
+      return getEmptySpanQuery();
+    } else if (spanQs.size() == 1) {
+      return spanQs.get(0);
+    } else {
+      return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+    }
+  }
+
+  private SpanQuery addBoost(SpanQuery sq, Float boost) {
+    if (boost == null) {
+      return sq;
+    }
+    return new SpanBoostQuery(sq, boost);
+  }
+
+  private void tryToAdd(String field, SpanQuery q, List<SpanQuery> qs) {
+    if (q == null || isEmptyQuery(q) || !q.getField().equals(field)) {
+      return;
+    }
+    qs.add(q);
+  }
+
+  /**
+   * Extend this to handle queries that are not currently handled.
+   * Might consider extending SpanQueryConverter in the queries compilation unit;
+   * that includes CommonTermsQuery.
+   * <p>
+   * In this class, this always throws an IllegalArgumentException.
+   *
+   * @param field field to convert
+   * @param query query to convert
+   * @return nothing; this implementation always throws an IllegalArgumentException
+   */
+  protected SpanQuery convertUnknownQuery(String field, Query query) {
+    throw new IllegalArgumentException("SpanQueryConverter is unable to convert this class " +
+        query.getClass().toString());
+  }
+
+  /**
+   * @return an empty SpanQuery (SpanOrQuery with no clauses)
+   */
+  protected SpanQuery getEmptySpanQuery() {
+    return new SpanOrQuery(new SpanTermQuery[0]);
+  }
+
+  /**
+   * Is this a null or empty SpanQuery
+   *
+   * @param q query to test
+   * @return whether this is a null or empty SpanQuery
+   */
+  private boolean isEmptyQuery(SpanQuery q) {
+    if (q == null) {
+      return true;
+    }
+    if (q instanceof SpanOrQuery) {
+      SpanOrQuery soq = (SpanOrQuery) q;
+      for (SpanQuery sq : soq.getClauses()) {
+        if (!isEmptyQuery(sq)) {
+          return false;
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/ConcordanceTestBase.java b/lucene/concordance/src/test/org/apache/lucene/concordance/ConcordanceTestBase.java
new file mode 100644
index 000000000000..7d78d31fbadf
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/ConcordanceTestBase.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.concordance;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+public class ConcordanceTestBase extends LuceneTestCase {
+
+  protected final static String FIELD = "f1";
+
+  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops) {
+    return getAnalyzer(stops, random().nextInt(10000), random().nextInt(10000));
+  }
+
+  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops,
+                                     final int posIncGap, final int charOffsetGap) {
+
+    return new Analyzer() {
+
+      @Override
+      public TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        TokenFilter filter = new MockTokenFilter(tokenizer, stops);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      public int getPositionIncrementGap(String fieldName) {
+        return posIncGap;
+      }
+
+      @Override
+      public int getOffsetGap(String fieldName) {
+        return charOffsetGap;
+      }
+    };
+  }
+
+  public Directory getDirectory(Analyzer analyzer, String[] vals) throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(analyzer)
+            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+            .setMergePolicy(newLogMergePolicy()));
+
+    for (String s : vals) {
+      Document d = new Document();
+      d.add(newTextField(FIELD, s, Field.Store.YES));
+      writer.addDocument(d);
+    }
+    writer.close();
+    return directory;
+  }
+
+  public Directory getDirectory(Analyzer analyzer, List<String[]> input) throws IOException {
+
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(analyzer)
+            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+            .setMergePolicy(newLogMergePolicy()));
+
+    for (String[] vals : input) {
+      Document d = new Document();
+      for (String s : vals) {
+        d.add(newTextField(FIELD, s, Field.Store.YES));
+      }
+      writer.addDocument(d);
+    }
+    writer.close();
+    return directory;
+  }
+
+  Directory buildNeedleIndex(String needle,
+                             Analyzer analyzer, int numFieldValues) throws Exception {
+
+    IndexWriterConfig config = newIndexWriterConfig(random(), analyzer)
+        .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+        .setMergePolicy(newLogMergePolicy());
+
+    Directory directory = newDirectory();
+
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);
+    // create one document with a multivalued field; the needle lands at the start,
+    // middle, or end of each field value
+    String[] fs = new String[numFieldValues];
+    for (int i = 0; i < numFieldValues; i++) {
+      float r = random().nextFloat();
+      String doc = "";
+      if (r <= 0.33) {
+        doc = needle + " " + getRandomWords(29, needle, analyzer);
+      } else if (r <= 0.66) {
+        doc = getRandomWords(13, needle, analyzer) + " " + needle + " " + getRandomWords(17, needle, analyzer);
+      } else {
+        doc = getRandomWords(31, needle, analyzer) + " " + needle;
+      }
+      fs[i] = doc;
+    }
+
+    Document d = new Document();
+    FieldType type = new FieldType();
+    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    type.setStored(true);
+    type.setTokenized(true);
+
+    for (String s : fs) {
+      d.add(newField(random(), FIELD, s, type));
+    }
+    writer.addDocument(d);
+    writer.close();
+    return directory;
+  }
+
+  /**
+   * This assumes no stop filter in the analyzer;
+   * best to use a whitespace tokenizer.
+   */
+  private String getRandomWords(int numWords, String needle, Analyzer analyzer) throws Exception {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < numWords; i++) {
+      sb.append(TestUtil.randomUnicodeString(random(), 31));
+      sb.append(" ");
+    }
+    List<String> terms = SimpleAnalyzerUtil.getTermStrings(sb.toString(), FIELD, analyzer);
+    StringBuilder rsb = new StringBuilder();
+    int words = -1;
+    while (words++ < numWords && words < terms.size()) {
+      String cand = terms.get(words);
+      if (!needle.equals(cand)) {
+        if (words > 0) {
+          rsb.append(" ");
+        }
+        rsb.append(cand);
+      }
+    }
+    return rsb.toString();
+  }
+
+  String getNeedle(Analyzer analyzer) {
+    // try to get a term that would come out of the analyzer
+    for (int i = 0; i < 10; i++) {
+      // start with a random base string
+      String baseString = TestUtil.randomUnicodeString(random(), random().nextInt(10) + 2);
+
+      try {
+        // run it through the analyzer, and take the first thing
+        // that comes out of it if the length > 0
+        List<String> terms = SimpleAnalyzerUtil.getTermStrings(baseString, FIELD, analyzer);
+        for (String t : terms) {
+          if (t.length() > 0) {
+            return t;
+          }
+        }
+      } catch (IOException e) {
+        // swallow
+      }
+    }
+    // if nothing is found in 10 tries,
+    // return the literal string "needle"
+    return "needle";
+  }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/TestConcordanceSearcher.java b/lucene/concordance/src/test/org/apache/lucene/concordance/TestConcordanceSearcher.java
new file mode 100644
index 000000000000..e8b0cc8a0dff
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/TestConcordanceSearcher.java
@@ -0,0 +1,560 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.concordance;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.concordance.classic.AbstractConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.ConcordanceSearcher;
+import org.apache.lucene.concordance.classic.ConcordanceSortOrder;
+import org.apache.lucene.concordance.classic.ConcordanceWindow;
+import org.apache.lucene.concordance.classic.DocIdBuilder;
+import org.apache.lucene.concordance.classic.DocMetadataExtractor;
+import org.apache.lucene.concordance.classic.WindowBuilder;
+import org.apache.lucene.concordance.classic.impl.ConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.impl.DedupingConcordanceWindowCollector;
+import org.apache.lucene.concordance.classic.impl.DefaultSortKeyBuilder;
+import org.apache.lucene.concordance.classic.impl.IndexIdDocIdBuilder;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestConcordanceSearcher extends ConcordanceTestBase {
+
+  private final static DocMetadataExtractor metadataExtractor =
+      new DocMetadataExtractor() {
+        private final Set<String> fields = new HashSet<>();
+        private final Map<String, String> data = new HashMap<>();
+
+        @Override
+        public Set<String> getFieldSelector() {
+          return fields;
+        }
+
+        @Override
+        public Map<String, String> extract(Document d) {
+          return data;
+        }
+      };
+
+  private final static DocIdBuilder docIdBuilder = new IndexIdDocIdBuilder();
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    // NOOP for now
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    // NOOP for now
+  }
+
+  @Test
+  public void testSimple() throws Exception {
+    String[] docs = new String[]{"a b c a b c", "c b a c b a"};
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    WindowBuilder wb = new WindowBuilder(10, 10,
+        analyzer.getOffsetGap(FIELD),
+        new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
+
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    assertEquals(3, collector.size());
+
+    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
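+    // Note: COLLECT_ALL is taken here to mean "no cap on the number of windows"
+    // (an assumption based on its use below and in the deduping tests); re-running
+    // the same query should therefore gather all four hits for "a" across the two
+    // documents, rather than the three collected above.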
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    // test result size
+    assertEquals(4, collector.size());
+
+    // test result with sort order = pre
+    List<ConcordanceWindow> windows = collector.getSortedWindows();
+    String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
+    String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
+
+    for (int i = 0; i < windows.size(); i++) {
+      ConcordanceWindow w = windows.get(i);
+
+      assertEquals(pres[i], w.getPre());
+      assertEquals(posts[i], w.getPost());
+    }
+
+    // test sort order post
+    // sort key is built at search time, so must re-search
+    wb = new WindowBuilder(10, 10,
+        analyzer.getOffsetGap(FIELD),
+        new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
+    searcher = new ConcordanceSearcher(wb);
+
+    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    windows = collector.getSortedWindows();
+
+    posts = new String[]{"", " b c", " b c a b c", " c b a"};
+    for (int i = 0; i < windows.size(); i++) {
+      ConcordanceWindow w = windows.get(i);
+      assertEquals(posts[i], w.getPost());
+    }
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testSimpleMultiValuedField() throws Exception {
+    String[] doc = new String[]{"a b c a b c", "c b a c b a"};
+    List<String[]> docs = new ArrayList<>();
+    docs.add(doc);
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(
+        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
+
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    // test result size
+    assertEquals(4, collector.size());
+
+    // test result with sort order = pre
+    List<ConcordanceWindow> windows = collector.getSortedWindows();
+    String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
+    String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
+
+    for (int i = 0; i < pres.length; i++) {
+      ConcordanceWindow w = windows.get(i);
+
+      assertEquals("pres: " + i, pres[i], w.getPre());
+      assertEquals("posts: " + i, posts[i], w.getPost());
+    }
+
+    // test sort order post
+    // sort key is built at search time, so must re-search
+    WindowBuilder wb = new WindowBuilder(10, 10,
+        analyzer.getOffsetGap(FIELD),
+        new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
+    searcher = new ConcordanceSearcher(wb);
+
+    collector = new ConcordanceWindowCollector(100);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    windows = collector.getSortedWindows();
+
+    posts = new String[]{"", " b c", " b c a b c", " c b a"};
+    for (int i = 0; i < posts.length; i++) {
+      ConcordanceWindow w = windows.get(i);
+      assertEquals(posts[i], w.getPost());
+    }
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testWindowLengths() throws Exception {
+    String[] doc = new String[]{"a b c d e f g"};
+    List<String[]> docs = new ArrayList<>();
+    docs.add(doc);
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+
+    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+    String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"};
+    String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"};
+
+    for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) {
+      for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) {
+        WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter,
+            analyzer.getOffsetGap(FIELD));
+        ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+        ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
+        searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+        ConcordanceWindow w = collector.getSortedWindows().get(0);
+        assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre());
+        assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost());
+      }
+    }
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
+    /*
+     * test handling of target match (or not) over different indices into multivalued
+     * field array
+     */
+    String[] doc = new String[]{"a b c a b the", "clockwork", "orange b a c b a"};
+    List<String[]> docs = new ArrayList<>();
+    docs.add(doc);
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
+
+    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+    SpanQuery q1 = new SpanTermQuery(new Term(FIELD, "the"));
+    SpanQuery q2 = new SpanTermQuery(new Term(FIELD, "clockwork"));
+    SpanQuery q3 = new SpanTermQuery(new Term(FIELD, "orange"));
+    SpanQuery q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 3, true);
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+    assertEquals(1, collector.size());
+
+    ConcordanceWindow w = collector.getSortedWindows().iterator().next();
+    assertEquals("target", "the | clockwork | orange", w.getTarget());
+    assertEquals("pre", "c a b", w.getPre());
+    assertEquals("post", " b a c", w.getPost());
+
+    reader.close();
+    directory.close();
+
+    // test hit even over long inter-field gap
+    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
+    directory = getDirectory(analyzer, docs);
+    reader = DirectoryReader.open(directory);
+    indexSearcher = new IndexSearcher(reader);
+
+    wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
+
+    searcher = new ConcordanceSearcher(wb);
+    q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 120, true);
+    collector = new ConcordanceWindowCollector(100);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    assertEquals(1, collector.size());
+    w = collector.getSortedWindows().iterator().next();
+    assertEquals("target", "the | clockwork | orange", w.getTarget());
+    assertEquals("pre", "c a b", w.getPre());
+    assertEquals("post", " b a c", w.getPost());
+
+    reader.close();
+    directory.close();
+
+    // test miss
+    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
+    directory = getDirectory(analyzer, docs);
+    reader = DirectoryReader.open(directory);
+    indexSearcher = new IndexSearcher(reader);
+
+    wb = new WindowBuilder();
+    searcher = new ConcordanceSearcher(wb);
+    q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 5, true);
+    collector = new ConcordanceWindowCollector(100);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    assertEquals(0, collector.size());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testWithStops() throws Exception {
+    String[] docs = new String[]{"a b the d e the f", "g h the d the j"};
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));
+
+    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+    List<ConcordanceWindow> windows = collector.getSortedWindows();
+    assertEquals(2, windows.size());
+
+    // the second word after the target is a stop word;
+    // the post-component of this window should only go to the first word after the target
+    assertEquals("b the", windows.get(0).getPre());
+    assertEquals("d", windows.get(0).getTarget());
+    assertEquals(" e", windows.get(0).getPost());
+
+    assertEquals("h the", windows.get(1).getPre());
+    assertEquals("d", windows.get(1).getTarget());
+    assertEquals(" the j", windows.get(1).getPost());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testBasicStandardQueryConversion() throws Exception {
+    String[] docs = new String[]{"a b c a b c", "c b a c b a d e a",
+        "c b a c b a e a b c a"};
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(
+        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+    BooleanQuery q = new BooleanQuery.Builder()
+        .add(new TermQuery(new Term(FIELD, "a")), Occur.MUST)
+        .add(new TermQuery(new Term(FIELD, "d")), Occur.MUST_NOT).build();
+
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+    // shouldn't include the document with "d"
+    assertEquals(6, collector.size());
+
+    // should only include the document with "e" and not "d"
+    Query filter = new TermQuery(new Term(FIELD, "e"));
+    collector = new ConcordanceWindowCollector(10);
+
+    searcher.search(indexSearcher, FIELD, q, filter, analyzer, collector);
+    assertEquals(4, collector.size());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
+    // tests what happens if a Query doesn't contain a term in the "span" field
+    // in the searcher... should be no exception and zero documents returned.
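+    // The standard-query path presumably goes through the span converter above:
+    // a term in a different field should convert to an empty SpanOrQuery
+    // (see SimpleSpanQueryConverter.getEmptySpanQuery()), which matches nothing.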
+
+    String[] docs = new String[]{"a b c a b c",};
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+
+    ConcordanceSearcher searcher = new ConcordanceSearcher(
+        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+
+    Query q = new TermQuery(new Term("_" + FIELD, "a"));
+
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+    int windowCount = collector.size();
+    assertEquals(0, windowCount);
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testUniqueCollector() throws Exception {
+    String[] docs = new String[]{"a b c d c b a",
+        "a B C d c b a",
+        "a b C d C B a",
+        "a b c d C B A",
+        "e f g d g f e",
+        "h i j d j i h"
+    };
+
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(
+        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+    // the (Query) cast deliberately exercises the standard-query overload
+    DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2);
+    searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
+    assertEquals(2, collector.size());
+
+    collector =
+        new DedupingConcordanceWindowCollector(AbstractConcordanceWindowCollector.COLLECT_ALL);
+    searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
+    assertEquals(3, collector.size());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testUniqueCollectorWithSameWindowOverflow() throws Exception {
+    String[] docs = new String[]{"a b c d c b a",
+        "a b c d c b a",
+        "a b c d c b a",
+        "a b c d c b a",
+        "e f g d g f e",
+        "h i j d j i h"
+    };
+
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(
+        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+
+    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+    DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3);
+    searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
+    assertEquals(3, collector.size());
+    assertEquals(4, collector.getSortedWindows().get(0).getCount());
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testAllowTargetOverlaps() throws Exception {
+    String[] docs = new String[]{"a b c"};
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    WindowBuilder wb = new WindowBuilder(10, 10,
+        analyzer.getOffsetGap(FIELD),
+        new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+    SpanQuery term = new SpanTermQuery(new Term(FIELD, "a"));
+    SpanQuery phrase = new SpanNearQuery(
+        new SpanQuery[]{
+            new SpanTermQuery(new Term(FIELD, "a")),
+            new SpanTermQuery(new Term(FIELD, "b"))
+        }, 0, true);
+    SpanOrQuery q = new SpanOrQuery(
+        new SpanQuery[]{
+            term,
+            phrase
+        }
+    );
+
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    // default should be: don't allow target overlaps
+    assertEquals(1, collector.size());
+
+    searcher.setAllowTargetOverlaps(true);
+    collector = new ConcordanceWindowCollector(10);
+    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
+
+    // now there should be two windows with allowTargetOverlaps = true
+    assertEquals(2, collector.size());
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testRewrites() throws Exception {
+    // test to make sure that queries are rewritten
+    // first test straight prefix queries
+    String[] docs = new String[]{"aa ba ca aa ba ca", "ca ba aa ca ba aa da ea za",
+        "ca ba aa ca ba aa ea aa ba ca za"};
+    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+    Directory directory = getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(reader);
+    ConcordanceSearcher searcher = new ConcordanceSearcher(
+        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+    BooleanQuery q = new BooleanQuery.Builder()
+        .add(new PrefixQuery(new Term(FIELD, "a")), Occur.MUST)
+        .add(new PrefixQuery(new Term(FIELD, "d")), Occur.MUST_NOT).build();
+
+    // now test straight and span-wrapped filter queries
+    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+    searcher.search(indexSearcher,
+        FIELD, q, new PrefixQuery(new Term(FIELD, "z")),
+        analyzer, collector);
+    // shouldn't include the document with "da", but must include the one with "za"
+    assertEquals(3, collector.size());
+
+    collector = new ConcordanceWindowCollector(10);
+    searcher.search(indexSearcher,
+        FIELD, q, new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term(FIELD, "z"))),
+        analyzer, collector);
+    // shouldn't include the document with "da", but must include the one with "za"
+    assertEquals(3, collector.size());
+
+    reader.close();
+    directory.close();
+  }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/TestSimpleAnalyzerUtil.java b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSimpleAnalyzerUtil.java
new file mode 100644
index 000000000000..7eaf31127ddb
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSimpleAnalyzerUtil.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.store.Directory;
+import org.junit.BeforeClass;
+
+public class TestSimpleAnalyzerUtil extends ConcordanceTestBase {
+
+  private static Analyzer defaultCharOffsetGapAnalyzer;
+
+  private static Analyzer customCharOffsetGapAnalyzer;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    defaultCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 1);
+    customCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50, 213);
+  }
+
+  /*
+  public void testDebug() throws Exception {
+    String[] values = new String[]{
+        "the quick brown fox jumped over the lazy dog",
+        "the fast green toad slid under the slothful rabbit",
+        "the happy blue wolverine devoured the lazy moose",
+        "the depressed purple aardvark the the the the the the the devoured the energetic komodo",
+        "the exasperated lavender lion",
+        "the excited orange tiger the the the the the",
+        "the colorless green idea slept furiously the"
+    };
+    System.out.println(values[0].length());
+    List<String[]> docs = new ArrayList<>();
+    docs.add(values);
+
+    Directory directory = getDirectory(defaultCharOffsetGapAnalyzer, docs);
+
+    String joiner = " | ";
+    int gap = defaultCharOffsetGapAnalyzer.getOffsetGap(FIELD);
+    IndexReader reader = DirectoryReader.open(directory);
+    Document d = reader.document(0);
+    String[] fieldValues = d.getValues(FIELD);
+    //69, 103
+    assertEquals("basic", "", testSimple(42, 45, fieldValues, gap, joiner));
+    reader.close();
+    directory.close();
+  }*/
+
+  public void testHitInGaps() throws Exception {
+    String[] values = new String[]{
+        "abc",
+        "def",
+        "ghi",
+        "jkl"
+    };
+    List<String[]> docs = new ArrayList<>();
+    docs.add(values);
+
+    Directory directory = getDirectory(customCharOffsetGapAnalyzer, docs);
+
+    String joiner = " | ";
+    int gap = customCharOffsetGapAnalyzer.getOffsetGap(FIELD);
+    IndexReader reader = DirectoryReader.open(directory);
+    Document d = reader.document(0);
+    String[] fieldValues = d.getValues(FIELD);
+
+    assertEquals("two negs", "", testSimple(-10, -1, fieldValues, gap, joiner));
+
+    assertEquals("two way beyonds", "", testSimple(1000, 1020, fieldValues, gap, joiner));
+
+    assertEquals("two in betweens", " | ", testSimple(100, 110, fieldValues, gap, joiner));
+
+    assertEquals("one neg", "abc", testSimple(-20, 3, fieldValues, gap, joiner));
+    assertEquals("end < start 1", "", testSimple(3, -20, fieldValues, gap, joiner));
+    assertEquals("end < start 2", "", testSimple(3, 2, fieldValues, gap, joiner));
+    assertEquals("end in between", "abc", testSimple(0, 50, fieldValues, gap, joiner));
+    // TODO: these used to be "def"; need to fix
+    assertEquals("start in between", " | def", testSimple(5, 219, fieldValues, gap, joiner));
+    assertEquals("start in between and end in between1", " | def", testSimple(5, 300, fieldValues, gap, joiner));
+    assertEquals("start in between and end in between2", " | def | ghi", testSimple(5, 600, fieldValues, gap, joiner));
+    assertEquals("", "def | ghi | jkl", testSimple(216, 10000, fieldValues, gap, joiner));
+
+    reader.close();
+    directory.close();
+  }
+
+  public void testRandomWithNeedleOnGaps() throws Exception {
+    try {
+      executeNeedleTests(defaultCharOffsetGapAnalyzer);
+      executeNeedleTests(customCharOffsetGapAnalyzer);
+    } catch (Throwable e) {
+      e.printStackTrace();
+      throw e;
+    }
+  }
+
+  private void executeNeedleTests(Analyzer analyzer) throws Exception {
+
+    String needle = getNeedle(analyzer);
+    int numFieldValues = 23;
+
+    Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues);
+
+    IndexReader reader = DirectoryReader.open(directory);
+
+    LeafReaderContext ctx = reader.leaves().get(0);
+    LeafReader r = ctx.reader();
+
+    PostingsEnum dpe = r.postings(new Term(FIELD, needle), PostingsEnum.ALL);
+    int numTests = 0;
+    try {
+      while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+        int frq = dpe.freq();
+        int advanced = 0;
+
+        String[] fieldValues = r.document(dpe.docID()).getValues(FIELD);
+        while (++advanced < frq) {
+          dpe.nextPosition();
+          String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(),
+              dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | ");
+          assertEquals(needle, rebuilt);
+          numTests++;
+        }
+      }
+    } finally {
+      reader.close();
+      directory.close();
+    }
+    assertEquals("number of tests", numFieldValues - 1, numTests);
+  }
+
+  private String testSimple(int start, int end, String[] fieldValues, int gap, String joiner) {
+    return SimpleAnalyzerUtil.substringFromMultiValuedFields(start, end, fieldValues, gap, joiner);
+  }
+}
diff --git a/lucene/concordance/src/test/org/apache/lucene/concordance/TestSpanQueryConverter.java b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSpanQueryConverter.java
new file mode 100644
index 000000000000..2eb65b64e229
--- /dev/null
+++ b/lucene/concordance/src/test/org/apache/lucene/concordance/TestSpanQueryConverter.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.concordance;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SimpleSpanQueryConverter;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TestSpanQueryConverter {
+
+  @Test
+  public void testMultiTerm() throws IOException {
+    // test to make sure multiterm returns an empty query for a different field
+    String f1 = "f1";
+    String f2 = "f2";
+    Query q = new PrefixQuery(new Term(f1, "f*"));
+    SimpleSpanQueryConverter c = new SimpleSpanQueryConverter();
+    SpanQuery sq = c.convert(f2, q);
+    assertTrue(sq instanceof SpanOrQuery);
+    assertEquals(0, ((SpanOrQuery) sq).getClauses().length);
+  }
+  // TODO: add more tests
+}
diff --git a/lucene/module-build.xml b/lucene/module-build.xml
index d48ae37f89c3..7f952e5feb4b 100644
--- a/lucene/module-build.xml
+++ b/lucene/module-build.xml
@@ -464,6 +464,28 @@
+ + + + + + + + + + + + + + + + + + + + + +
diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
index cd5d4af576ff..3b516fe747ba 100644
--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@@ -261,7 +261,7 @@ public void load(SolrQueryRequest req, SolrQueryResponse rsp,
   }
 
   public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
-    public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
+    public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
 
     /**
      * Keep all elements and their content.