diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java index 01e1f6f7dfc1..13a4085b1fc9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.lucene.analysis.TokenFilter; @@ -66,9 +67,13 @@ private final static class InputNode implements RollingBuffer.Resettable { * to know when we can freeze. */ int maxToNode = -1; - /** Where we currently map to; this changes (can only - * increase as we see more input tokens), until we are finished - * with this position. */ + /** Minimum to input node for all tokens leaving here; we use this to check if holes exist. */ + int minToNode = Integer.MAX_VALUE; + + /** + * Where we currently map to; this changes (can only increase as we see more input tokens), + * until we are finished with this position. + */ int outputNode = -1; /** Which token (index into {@link #tokens}) we will next output. */ @@ -80,6 +85,7 @@ public void reset() { node = -1; outputNode = -1; maxToNode = -1; + minToNode = Integer.MAX_VALUE; nextOut = 0; } } @@ -188,14 +194,21 @@ private boolean releaseBufferedToken() { } if (inputNode.tokens.size() == 0) { assert inputNode.nextOut == 0; - assert output.nextOut == 0; // Hole dest nodes should never be merged since 1) we always // assign them to a new output position, and 2) since they never - // have arriving tokens they cannot be pushed: - assert output.inputNodes.size() == 1: output.inputNodes.size(); - outputFrom++; - inputNodes.freeBefore(output.inputNodes.get(0)); - outputNodes.freeBefore(outputFrom); + // have arriving tokens they cannot be pushed. Skip them but don't free + // input until all are checked. + // Related tests testAltPathLastStepLongHole, testAltPathLastStepHoleFollowedByHole, + // testAltPathLastStepHoleWithoutEndToken + if (output.inputNodes.size() > 1) { + output.nextOut++; + if (output.nextOut < output.inputNodes.size()) { + continue; + } + } + // Don't free from a hole src. Since no edge leaves here book keeping may be incorrect. + // Later output nodes may point to earlier input nodes. So we don't want to free them yet. + freeBefore(output); continue; } @@ -234,9 +247,7 @@ private boolean releaseBufferedToken() { if (inputNode.nextOut == inputNode.tokens.size()) { output.nextOut++; if (output.nextOut == output.inputNodes.size()) { - outputFrom++; - inputNodes.freeBefore(output.inputNodes.get(0)); - outputNodes.freeBefore(outputFrom); + freeBefore(output); } } @@ -250,6 +261,30 @@ private boolean releaseBufferedToken() { return false; } + /** + * Free inputs nodes before the minimum input node for the given output. + * + * @param output target output node + */ + private void freeBefore(OutputNode output) { + /* We've released all of the tokens that end at the current output, so free all output nodes before this. + Input nodes are more complex. The second shingled tokens with alternate paths can appear later in the output graph + than some of their alternate path tokens. Because of this case we can only free from the minimum because + the minimum node will have come from before the second shingled token. 
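+     Freeing past that minimum could release an input node that a later output node still lists among its inputs.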
+ This means we have to hold onto input nodes whose tokens get stacked on previous nodes until + we've completely passed those inputs. + Related tests testShingledGap, testShingledGapWithHoles + */ + outputFrom++; + int freeBefore = Collections.min(output.inputNodes); + // This will catch a node being freed early if it is input to the next output. + // Could a freed early node be input to a later output? + assert outputNodes.get(outputFrom).inputNodes.stream().filter(n -> freeBefore > n).count() == 0 + : "FreeBefore " + freeBefore + " will free in use nodes"; + inputNodes.freeBefore(freeBefore); + outputNodes.freeBefore(outputFrom); + } + @Override public boolean incrementToken() throws IOException { //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom); @@ -267,7 +302,8 @@ public boolean incrementToken() throws IOException { if (input.incrementToken()) { // Input node this token leaves from: - inputFrom += posIncAtt.getPositionIncrement(); + int positionIncrement = posIncAtt.getPositionIncrement(); + inputFrom += positionIncrement; int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset(); @@ -278,27 +314,44 @@ public boolean incrementToken() throws IOException { InputNode src = inputNodes.get(inputFrom); if (src.node == -1) { - // This means the "from" node of this token was never seen as a "to" node, - // which should only happen if we just crossed a hole. This is a challenging - // case for us because we normally rely on the full dependencies expressed - // by the arcs to assign outgoing node IDs. It would be better if tokens - // were never dropped but instead just marked deleted with a new - // TermDeletedAttribute (boolean valued) ... but until that future, we have - // a hack here to forcefully jump the output node ID: - assert src.outputNode == -1; - src.node = inputFrom; - - src.outputNode = outputNodes.getMaxPos() + 1; - //System.out.println(" hole: force to outputNode=" + src.outputNode); - OutputNode outSrc = outputNodes.get(src.outputNode); + recoverFromHole(src, startOffset, positionIncrement); - // Not assigned yet: - assert outSrc.node == -1; - outSrc.node = src.outputNode; - outSrc.inputNodes.add(inputFrom); - outSrc.startOffset = startOffset; } else { OutputNode outSrc = outputNodes.get(src.outputNode); + /* If positionIncrement > 1 and the position we're incrementing from doesn't come to the current node we've crossed a hole. + * The long edge will point too far back and not account for the holes unless it gets fixed. + * example: + * _____abc______ + * | | + * | V + * O-a->O- ->O- ->O-d->O + * + * A long edge may have already made this fix though, if src is more than 1 position ahead in the output there's no additional work to do + * example: + * _____abc______ + * | ....bc....| + * | . VV + * O-a->O- ->O- ->O-d->O + */ + if (positionIncrement > 1 + && src.outputNode - inputNodes.get(inputFrom - positionIncrement).outputNode <= 1 + && inputNodes.get(inputFrom - positionIncrement).minToNode != inputFrom) { + /* If there was a hole at the end of an alternate path then the input and output nodes + * have been created, + * but the offsets and increments have not been maintained correctly. Here we go back + * and fix them. + * Related test testAltPathLastStepHole + * The last node in the alt path didn't arrive to remove this reference. 
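+                * We drop that stale input reference and rebuild src via recoverFromHole below, keeping the previously recorded end offset.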
+ */ + assert inputNodes.get(inputFrom).tokens.isEmpty() : "about to remove non empty edge"; + outSrc.inputNodes.remove(Integer.valueOf(inputFrom)); + src.outputNode = -1; + int prevEndOffset = outSrc.endOffset; + + outSrc = recoverFromHole(src, startOffset, positionIncrement); + outSrc.endOffset = prevEndOffset; + } + if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) { // "shrink wrap" the offsets so the original tokens (with most // restrictive offsets) win: @@ -309,6 +362,7 @@ public boolean incrementToken() throws IOException { // Buffer this token: src.tokens.add(captureState()); src.maxToNode = Math.max(src.maxToNode, inputTo); + src.minToNode = Math.min(src.minToNode, inputTo); maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize()); InputNode dest = inputNodes.get(inputTo); @@ -353,6 +407,55 @@ public boolean incrementToken() throws IOException { } } + private OutputNode recoverFromHole(InputNode src, int startOffset, int posinc) { + // This means the "from" node of this token was never seen as a "to" node, + // which should only happen if we just crossed a hole. This is a challenging + // case for us because we normally rely on the full dependencies expressed + // by the arcs to assign outgoing node IDs. It would be better if tokens + // were never dropped but instead just marked deleted with a new + // TermDeletedAttribute (boolean valued) ... but until that future, we have + // a hack here to forcefully jump the output node ID: + assert src.outputNode == -1; + src.node = inputFrom; + + int outIndex; + int previousInputFrom = inputFrom - posinc; + if (previousInputFrom >= 0) { + InputNode offsetSrc = inputNodes.get(previousInputFrom); + /* Select output src node. Need to make sure the new output node isn't placed too far ahead. + * If a disconnected node is placed at the end of the output graph that may place it after output nodes that map to input nodes that are after src in the input. + * Since it is disconnected there is no path to it, and there could be holes after meaning no paths to following nodes. This "floating" edge will cause problems in FreeBefore. + * In the following section make sure the edge connects to something. + * Related test testLongHole testAltPathLastStepHoleFollowedByHole, testAltPathFirstStepHole, testShingledGapWithHoles + */ + if (offsetSrc.minToNode < inputFrom) { + // There is a possible path to this node. + // place this node one position off from the possible path keeping a 1 inc gap. + // Can't be larger than 1 inc or risk getting disconnected. + outIndex = inputNodes.get(offsetSrc.minToNode).outputNode + 1; + } else { + // no information about how the current node was previously connected. + // Connect it to the end. + outIndex = outputNodes.getMaxPos(); + } + } else { + // in case the first token in the stream is a hole we have no input node to increment from. 
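+      // Fall back to appending a brand new output position at the end of the output graph.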
+ outIndex = outputNodes.getMaxPos() + 1; + } + OutputNode outSrc = outputNodes.get(outIndex); + src.outputNode = outIndex; + + // OutSrc may have other inputs + if (outSrc.node == -1) { + outSrc.node = src.outputNode; + outSrc.startOffset = startOffset; + } else { + outSrc.startOffset = Math.max(startOffset, outSrc.startOffset); + } + outSrc.inputNodes.add(inputFrom); + return outSrc; + } + // Only for debugging: /* private void printStates() { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java index c69bcca9cf89..f86c3b42f327 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java @@ -17,13 +17,33 @@ package org.apache.lucene.analysis.core; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AutomatonToTokenStream; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.Transition; public class TestFlattenGraphFilter extends BaseTokenStreamTestCase { @@ -195,7 +215,6 @@ public void testSimpleHole() throws Exception { TokenStream out = new FlattenGraphFilter(in); - // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"hello", "hole", "fun"}, new int[] {0, 6, 11}, @@ -277,8 +296,689 @@ public void testTwoLongParallelPaths() throws Exception { new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, 11); - } + // b has a posInc of 1, which is correct, but no edge ever visited that node. + // After hole recovery 'b' and 'c' should still be under 'abc' + // assert disabled = pos length of abc = 4 + // assert enabled = AssertionError: outputEndNode=3 vs inputTo=2 + public void testAltPathFirstStepHole() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 3, + new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("c", 1, 1, 2, 3)}); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "b", "c"}, + new int[] {0, 1, 2}, + new int[] {3, 2, 3}, + new int[] {1, 1, 1}, + new int[] {3, 1, 1}, + 3); + } + + // Last node in an alt path fixes outputnode of long path. In this graph the follow up node fixes + // that. 
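+  // Observed before this change: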
+ // incorrect pos length of abc = 1 + public void testAltPathLastStepHole() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 4, + new Token[] { + token("abc", 1, 3, 0, 3), + token("a", 0, 1, 0, 1), + token("b", 1, 1, 1, 2), + token("d", 2, 1, 3, 4) + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "a", "b", "d"}, + new int[] {0, 0, 1, 3}, + new int[] {1, 1, 2, 4}, + new int[] {1, 0, 1, 2}, + new int[] {3, 1, 1, 1}, + 4); + } + + // Check to see how multiple holes in a row are preserved. + public void testLongHole() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 28, + new Token[] { + token("hello", 1, 1, 0, 5), token("hole", 5, 1, 20, 24), token("fun", 1, 1, 25, 28), + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"hello", "hole", "fun"}, + new int[] {0, 20, 25}, + new int[] {5, 24, 28}, + new int[] {1, 2, 1}, + new int[] {1, 1, 1}, + 28); + } + + // multiple nodes missing in the alt path. + // assert disabled = nothing + // assert enabled = AssertionError + public void testAltPathLastStepLongHole() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 4, + new Token[] {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("d", 3, 1, 3, 4)}); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "a", "d"}, + new int[] {0, 0, 3}, + new int[] {1, 1, 4}, + new int[] {1, 0, 2}, + new int[] {2, 1, 1}, + 4); + } + + // LUCENE-8723 + // Token stream ends without any edge to fix the long edge's output node + // assert disabled = dropped token + // assert enabled = AssertionError: 2 + public void testAltPathLastStepHoleWithoutEndToken() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 2, + new Token[] {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("b", 1, 1, 1, 2)}); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "a", "b"}, + new int[] {0, 0, 1}, + new int[] {1, 1, 2}, + new int[] {1, 0, 1}, + new int[] {1, 1, 1}, + 2); + } + + // similar to AltPathLastStepHoleWithoutEndToken, but instead of no token to trigger long path + // resolution, + // the next token has no way to reference to the long path so we have to resolve as if that last + // token wasn't present. + public void testAltPathLastStepHoleFollowedByHole() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 5, + new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("e", 3, 1, 4, 5)}); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "b", "e"}, + new int[] {0, 1, 4}, + new int[] {3, 2, 5}, + new int[] {1, 1, 2}, + new int[] {1, 1, 1}, + 5); + } + + // Two Shingled long paths pass each other which gives a flattened graph with tokens backing up a + // lot. 
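+  // The expected output below interleaves the two shingles ("abc", "cde") with their single-token alternatives.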
+ public void testShingledGap() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 5, + new Token[] { + token("abc", 1, 3, 0, 3), + token("a", 0, 1, 0, 1), + token("b", 1, 1, 1, 2), + token("cde", 1, 3, 2, 5), + token("d", 1, 1, 3, 4), + token("e", 1, 1, 4, 5) + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "a", "d", "b", "cde", "e"}, + new int[] {0, 0, 3, 3, 4, 4}, + new int[] {1, 1, 3, 3, 5, 5}, + new int[] {1, 0, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1, 1}, + 5); + } + + // With shingles, token order may change during flattening. + // We need to be careful not to free input nodes if they still have unreleased edges. + // with/without exceptions ArrayIndexOutOfBoundsException + public void testShingledGapWithHoles() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 5, + new Token[] { + token("abc", 1, 3, 0, 3), + token("b", 1, 1, 1, 2), + token("cde", 1, 3, 2, 5), + token("d", 1, 1, 3, 4), + token("e", 1, 1, 4, 5) + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "d", "b", "cde", "e"}, + new int[] {0, 3, 3, 4, 4}, + new int[] {3, 3, 3, 5, 5}, + new int[] {1, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1}, + 5); + } + + // When the first token is a hole there is no original token to offset from. + public void testFirstTokenHole() throws Exception { + TokenStream in = new CannedTokenStream(0, 9, new Token[] {token("start", 2, 1, 0, 5)}); + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, new String[] {"start"}, new int[] {0}, new int[] {5}, new int[] {2}, new int[] {1}, 9); + } + + // The singled token starts from a hole. + // Hole recovery will cause the shingled token to start later in the output than its alternate + // paths. + // This will result in it being released too early. + public void testShingleFromGap() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 9, + new Token[] { + token("a", 1, 1, 4, 8), + token("abc", 0, 3, 4, 7), + token("cd", 2, 2, 6, 8), + token("d", 1, 1, 7, 8), + token("e", 1, 1, 8, 9) + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"a", "abc", "d", "cd", "e"}, + new int[] {4, 4, 7, 7, 8}, + new int[] {7, 7, 8, 8, 9}, + new int[] {1, 0, 1, 1, 1}, + new int[] {1, 1, 2, 1, 1}, + 9); + } + + public void testShingledGapAltPath() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 4, + new Token[] { + token("abc", 1, 3, 0, 3), token("abcd", 0, 4, 0, 4), token("cd", 2, 2, 2, 4), + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"abc", "abcd", "cd"}, + new int[] {0, 0, 2}, + new int[] {3, 4, 4}, + new int[] {1, 0, 1}, + new int[] {1, 2, 1}, + 4); + } + + // Lots of shingles and alternate paths connecting to each other. 
One edge 'c' missing between + // 'ab' and 'def' + public void testHeavilyConnectedGraphWithGap() throws IOException { + TokenStream in = + new CannedTokenStream( + 0, + 7, + new Token[] { + token("a", 1, 1, 0, 1), + token("ab", 0, 2, 0, 2), + token("abcdef", 0, 6, 0, 6), + token("abcd", 0, 4, 0, 4), + token("bcdef", 1, 5, 1, 7), + token("def", 2, 3, 4, 7), + token("e", 1, 1, 5, 6), + token("f", 1, 1, 6, 7) + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"a", "ab", "abcdef", "abcd", "bcdef", "e", "def", "f"}, + new int[] {0, 0, 0, 0, 5, 5, 6, 6}, + new int[] {1, 1, 7, 1, 7, 6, 7, 7}, + new int[] {1, 0, 0, 0, 1, 0, 1, 0}, + new int[] {1, 1, 3, 1, 2, 1, 1, 1}, + 7); + } + // This graph can create a disconnected input node that is farther ahead in the output than its + // subsequent input node. + // Exceptions: Free too early or dropped tokens. + public void testShingleWithLargeLeadingGap() throws IOException { + TokenStream in = + new CannedTokenStream( + 0, + 6, + new Token[] { + token("abcde", 1, 5, 0, 5), token("ef", 4, 2, 4, 6), token("f", 1, 1, 5, 6), + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"abcde", "f", "ef"}, + new int[] {0, 5, 5}, + new int[] {5, 6, 6}, + new int[] {1, 1, 0}, + new int[] {1, 1, 1}, + 6); + } + + /** + * build CharsRef containing 2-4 tokens + * + * @param tokens vocabulary of tokens + * @param charsRefBuilder CharsRefBuilder + * @param random Random for selecting tokens + * @return Charsref containing 2-4 tokens. + */ + private CharsRef buildMultiTokenCharsRef( + String[] tokens, CharsRefBuilder charsRefBuilder, Random random) { + int srcLen = random.nextInt(2) + 2; + String[] srcTokens = new String[srcLen]; + for (int pos = 0; pos < srcLen; pos++) { + srcTokens[pos] = tokens[random().nextInt(tokens.length)]; + } + SynonymMap.Builder.join(srcTokens, charsRefBuilder); + return charsRefBuilder.toCharsRef(); + } + + // Create a random graph then delete some edges to see if we can trip up FlattenGraphFilter + public void testRandomGraphs() throws Exception { + String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"}; + String[] synTokens = new String[] {"s1", "s2", "s3", "s4"}; + + SynonymMap.Builder mapBuilder = new SynonymMap.Builder(); + CharsRefBuilder charRefBuilder = new CharsRefBuilder(); + Random random = random(); + + // between 10 and 20 synonym entries + int synCount = random.nextInt(10) + 10; + for (int i = 0; i < synCount; i++) { + int type = random.nextInt(4); + CharsRef src; + CharsRef dest; + switch (type) { + case 0: + // 1:1 + src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef(); + charRefBuilder.clear(); + dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef(); + charRefBuilder.clear(); + break; + case 1: + // many:1 + src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random); + charRefBuilder.clear(); + dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef(); + charRefBuilder.clear(); + break; + case 2: + // 1:many + src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef(); + charRefBuilder.clear(); + dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random); + charRefBuilder.clear(); + break; + default: + // many:many + src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random); + charRefBuilder.clear(); + dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random); + 
charRefBuilder.clear(); + } + mapBuilder.add(src, dest, true); + } + + SynonymMap synMap = mapBuilder.build(); + + int stopWordCount = random.nextInt(4) + 1; + CharArraySet stopWords = new CharArraySet(stopWordCount, true); + while (stopWords.size() < stopWordCount) { + int index = random.nextInt(baseTokens.length + synTokens.length); + String[] tokenArray = baseTokens; + if (index >= baseTokens.length) { + index -= baseTokens.length; + tokenArray = synTokens; + } + stopWords.add(tokenArray[index]); + } + + Analyzer withFlattenGraph = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer in = new WhitespaceTokenizer(); + TokenStream result = new SynonymGraphFilter(in, synMap, true); + result = new StopFilter(result, stopWords); + result = new FlattenGraphFilter(result); + return new TokenStreamComponents(in, result); + } + }; + + int tokenCount = random.nextInt(20) + 20; + List stringTokens = new ArrayList<>(); + while (stringTokens.size() < tokenCount) { + stringTokens.add(baseTokens[random.nextInt(baseTokens.length)]); + } + + String text = String.join(" ", stringTokens); + // FlattenGraphFilter can create inconsistent offsets. + // If that is resolved we can check offsets + // Until then converting to automaton will pull text through and check if we hit asserts. + // checkAnalysisConsistency(random, withFlattenGraph, false, text); + TokenStreamToAutomaton tsta = new TokenStreamToAutomaton(); + TokenStream flattenedTokenStream = withFlattenGraph.tokenStream("field", text); + assertFalse(Operations.hasDeadStates(tsta.toAutomaton(flattenedTokenStream))); + flattenedTokenStream.close(); + + /* + CheckGeneralization can get VERY slow as matching holes to tokens or other holes generates a lot of potentially valid paths. + Analyzer withoutFlattenGraph = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer in = new WhitespaceTokenizer(); + TokenStream result = new SynonymGraphFilter(in, synMap, true); + result = new StopFilter(result, stopWords); + return new TokenStreamComponents(in, result); + } + }; + checkGeneralization( + withFlattenGraph.tokenStream("field", text), + withoutFlattenGraph.tokenStream("field", text)); + + */ + } + + /* + * Make some strings, make an automaton that accepts those strings, convert that automaton into a TokenStream, + * flatten it, back to an automaton, and see if the original strings are still accepted. + */ + public void testPathsNotLost() throws IOException { + int wordCount = random().nextInt(5) + 5; + List acceptStrings = new LinkedList<>(); + for (int i = 0; i < wordCount; i++) { + int wordLen = random().nextInt(5) + 5; + BytesRef ref = new BytesRef(wordLen); + ref.length = wordLen; + ref.offset = 0; + for (int j = 0; j < wordLen; j++) { + ref.bytes[j] = (byte) (random().nextInt(5) + 65); + } + acceptStrings.add(ref); + } + acceptStrings.sort(Comparator.naturalOrder()); + + acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList()); + Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + + TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton); + TokenStream flattenedTokenStream = new FlattenGraphFilter(ts); + TokenStreamToAutomaton tsta = new TokenStreamToAutomaton(); + Automaton flattenedAutomaton = tsta.toAutomaton(flattenedTokenStream); + + // TokenStreamToAutomaton adds position increment transitions into the automaton. 
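+    // Rewrite the expected byte sequences with POS_SEP markers so they line up with that representation.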
+ List acceptStringsWithPosSep = createAcceptStringsWithPosSep(acceptStrings); + + for (BytesRef acceptString : acceptStringsWithPosSep) { + assertTrue( + "string not accepted " + acceptString.utf8ToString(), + recursivelyValidate(acceptString, 0, 0, flattenedAutomaton)); + } + } + + /** + * adds POS_SEP bytes between bytes to match TokenStreamToAutomaton format. + * + * @param acceptStrings Byte refs of accepted strings. Each byte is a transition + * @return List of ByteRefs where each byte is separated by a POS_SEP byte. + */ + private List createAcceptStringsWithPosSep(List acceptStrings) { + List acceptStringsWithPosSep = new ArrayList<>(); + for (BytesRef acceptString : acceptStrings) { + BytesRef withPosSep = new BytesRef(acceptString.length * 2 - 1); + withPosSep.length = acceptString.length * 2 - 1; + withPosSep.offset = 0; + for (int i = 0; i < acceptString.length; i++) { + withPosSep.bytes[i * 2] = acceptString.bytes[i]; + if (i * 2 + 1 < withPosSep.length) { + withPosSep.bytes[i * 2 + 1] = TokenStreamToAutomaton.POS_SEP; + } + } + acceptStringsWithPosSep.add(withPosSep); + } + return acceptStringsWithPosSep; + } + + /** + * Checks if acceptString is accepted by the automaton. Automaton may be an NFA. + * + * @param acceptString String to test + * @param acceptStringIndex current index into acceptString, initial value should be 0 + * @param state state to transition from. initial value should be 0 + * @param automaton Automaton to test + * @return true if acceptString is accepted by the automaton. otherwise false. + */ + public boolean recursivelyValidate( + BytesRef acceptString, int acceptStringIndex, int state, Automaton automaton) { + if (acceptStringIndex == acceptString.length) { + return automaton.isAccept(state); + } + + Transition transition = new Transition(); + automaton.initTransition(state, transition); + int numTransitions = automaton.getNumTransitions(state); + boolean accept = false; + // Automaton can be NFA, so we need to check all matching transitions + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + if (transition.min <= acceptString.bytes[acceptStringIndex] + && transition.max >= acceptString.bytes[acceptStringIndex]) { + accept = + recursivelyValidate(acceptString, acceptStringIndex + 1, transition.dest, automaton); + } + if (accept == true) { + break; + } + } + return accept; + } + + /** + * This method checks if strings that lead to the accept state of the not flattened TokenStream + * also lead to the accept state in the flattened TokenStream. This gets complicated when you + * factor in holes. The FlattenGraphFilter will remove alternate paths that are made entirely of + * holes. An alternate path of Holes is indistinguishable from a path that just has long + * lengths(ex: testStrangelyNumberedNodes). Also alternate paths that end in multiple holes could + * be interpreted as sequential holes after the branching has converged during flattening. This + * leads to a lot of weird logic about navigating around holes that may compromise the accuracy of + * this test. 
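+   * The call from testRandomGraphs is currently commented out because matching holes makes this check very slow.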
+ * + * @param flattened flattened TokenStream + * @param notFlattened not flattened TokenStream + * @throws IOException on error creating Automata + */ + /* private void checkGeneralization(TokenStream flattened, TokenStream notFlattened) + throws IOException { + TokenStreamToAutomaton tsta = new TokenStreamToAutomaton(); + + List> acceptStrings = getAcceptStrings(tsta.toAutomaton(notFlattened)); + checkAcceptStrings(acceptStrings, tsta.toAutomaton(flattened)); + flattened.close(); + notFlattened.close(); + }*/ + + /** + * gets up to 10000 strings that lead to accept state in the given automaton. + * + * @param automaton automaton + * @return list of accept sequences + */ + /* private List> getAcceptStrings(Automaton automaton) { + List> acceptedSequences = new LinkedList<>(); + LinkedList prefix = new LinkedList<>(); + // state 0 is always the start node + // Particularly branching automatons can create lots of possible acceptable strings. limit to + // the first 10K + buildAcceptStringRecursive(automaton, 0, prefix, acceptedSequences, 10000); + return acceptedSequences; + }*/ + + /** + * @param automaton automaton to generate strings from + * @param state state to start at + * @param prefix string prefix + * @param acceptedSequences List of strings build so far. + * @param limit maximum number of acceptedSequences. + */ + /*private void buildAcceptStringRecursive( + Automaton automaton, + int state, + LinkedList prefix, + List> acceptedSequences, + int limit) { + if (acceptedSequences.size() == limit) { + return; + } + if (automaton.isAccept(state)) { + acceptedSequences.add(new LinkedList<>(prefix)); + return; + } + int numTransitions = automaton.getNumTransitions(state); + Transition transition = new Transition(); + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + // min and max are the same transitions made from TokenStreamToAutomaton + prefix.addLast(transition.min); + buildAcceptStringRecursive(automaton, transition.dest, prefix, acceptedSequences, limit); + prefix.removeLast(); + } + } + + private void checkAcceptStrings(List> acceptSequence, Automaton automaton) { + for (LinkedList acceptString : acceptSequence) { + assertTrue( + "String did not lead to accept state " + acceptString, + recursivelyValidateWithHoles(acceptString, 0, automaton)); + } + } + + private boolean recursivelyValidateWithHoles( + LinkedList acceptSequence, int state, Automaton automaton) { + if (acceptSequence.isEmpty()) { + return automaton.isAccept(state); + } + + Integer curr = acceptSequence.pop(); + int numTransitions = automaton.getNumTransitions(state); + Transition transition = new Transition(); + + boolean accept = false; + // Automaton can be NFA, so we need to check all matching transitions + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + if (transition.min <= curr && transition.max >= curr) { + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + // Factoring in flattened graphs the space covered by a hole may be bigger in the flattened + // graph. + // Try consuming more steps with holes. 
+ if (accept == false + && transition.min == TokenStreamToAutomaton.HOLE + && transition.max == TokenStreamToAutomaton.HOLE) { + acceptSequence.push(TokenStreamToAutomaton.HOLE); + acceptSequence.push(TokenStreamToAutomaton.POS_SEP); + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + acceptSequence.pop(); + acceptSequence.pop(); + } + } else if (transition.min == TokenStreamToAutomaton.HOLE + && transition.max == TokenStreamToAutomaton.HOLE + && automaton.getNumTransitions(transition.dest) > 0) { + //consume multiple holes in the automaton + // clear POS_INC + automaton.getTransition(transition.dest, 0, transition); + acceptSequence.push(curr); + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + acceptSequence.pop(); + } else if(curr == TokenStreamToAutomaton.HOLE) { + //consume non-holes in the automaton with holes + while (transition.min != TokenStreamToAutomaton.POS_SEP + && automaton.getNumTransitions(transition.dest) > 0) { + automaton.getTransition(transition.dest, 0, transition); + } + acceptSequence.push(curr); + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + acceptSequence.pop(); + } + if (accept) { + break; + } + } + // Flatten graph filter will remove side paths that are only Holes. Gaps may also change size as + // graph is flattened. + // Traverse over them if curr is a hole to make sure the gap is kept + if (accept == false && curr == TokenStreamToAutomaton.HOLE && acceptSequence.size() > 0) { + // get rid of the separator + acceptSequence.pop(); + + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + //advance to the next POS_SEP in automaton + while (transition.min != TokenStreamToAutomaton.POS_SEP + && automaton.getNumTransitions(transition.dest) > 0) { + automaton.getTransition(transition.dest, 0, transition); + } + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + if (accept) { + break; + } + } + + // might be multiple holes squashed under a one step path. Try burning remaining holes + if (accept == false) { + accept = recursivelyValidateWithHoles(acceptSequence, state, automaton); + } + + acceptSequence.push(TokenStreamToAutomaton.POS_SEP); + } + acceptSequence.push(curr); + return accept; + } */ + // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java b/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java new file mode 100644 index 000000000000..ef1bbd20bea5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.Transition; + +/** Converts an Automaton into a TokenStream. */ +public class AutomatonToTokenStream { + + private AutomatonToTokenStream() {} + + /** + * converts an automaton into a TokenStream. This is done by first Topo sorting the nodes in the + * Automaton. Nodes that have the same distance from the start are grouped together to form the + * position nodes for the TokenStream. The resulting TokenStream releases edges from the automaton + * as tokens in order from the position nodes. This requires the automaton be a finite DAG. + * + * @param automaton automaton to convert. Must be a finite DAG. + * @return TokenStream representation of automaton. + */ + public static TokenStream toTokenStream(Automaton automaton) { + if (Operations.isFinite(automaton) == false) { + throw new IllegalArgumentException("Automaton must be finite"); + } + + List> positionNodes = new ArrayList<>(); + + Transition[][] transitions = automaton.getSortedTransitions(); + + int[] indegree = new int[transitions.length]; + + for (int i = 0; i < transitions.length; i++) { + for (int edge = 0; edge < transitions[i].length; edge++) { + indegree[transitions[i][edge].dest] += 1; + } + } + if (indegree[0] != 0) { + throw new IllegalArgumentException("Start node has incoming edges, creating cycle"); + } + + LinkedList noIncomingEdges = new LinkedList<>(); + Map idToPos = new HashMap<>(); + noIncomingEdges.addLast(new RemapNode(0, 0)); + while (noIncomingEdges.isEmpty() == false) { + RemapNode currState = noIncomingEdges.removeFirst(); + for (int i = 0; i < transitions[currState.id].length; i++) { + indegree[transitions[currState.id][i].dest] -= 1; + if (indegree[transitions[currState.id][i].dest] == 0) { + noIncomingEdges.addLast( + new RemapNode(transitions[currState.id][i].dest, currState.pos + 1)); + } + } + if (positionNodes.size() == currState.pos) { + List posIncs = new ArrayList<>(); + posIncs.add(currState.id); + positionNodes.add(posIncs); + } else { + positionNodes.get(currState.pos).add(currState.id); + } + idToPos.put(currState.id, currState.pos); + } + + for (int i = 0; i < indegree.length; i++) { + if (indegree[i] != 0) { + throw new IllegalArgumentException("Cycle found in automaton"); + } + } + + List> edgesByLayer = new ArrayList<>(); + for (List layer : positionNodes) { + List edges = new ArrayList<>(); + for (int state : layer) { + for (Transition t : transitions[state]) { + // each edge in the token stream can only be on value, though a transition takes a range. + for (int val = t.min; val <= t.max; val++) { + int destLayer = idToPos.get(t.dest); + edges.add(new EdgeToken(destLayer, val)); + // If there's an intermediate accept state, add an edge to the terminal state. 
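+            // This keeps early-accepted paths reachable as complete paths ending at the final position.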
+ if (automaton.isAccept(t.dest) && destLayer != positionNodes.size() - 1) { + edges.add(new EdgeToken(positionNodes.size() - 1, val)); + } + } + } + } + edgesByLayer.add(edges); + } + + return new TopoTokenStream(edgesByLayer); + } + + /** Token Stream that outputs tokens from a topo sorted graph. */ + private static class TopoTokenStream extends TokenStream { + + private final List> edgesByPos; + private int currentPos; + private int currentEdgeIndex; + private CharTermAttribute charAttr = addAttribute(CharTermAttribute.class); + private PositionIncrementAttribute incAttr = addAttribute(PositionIncrementAttribute.class); + private PositionLengthAttribute lenAttr = addAttribute(PositionLengthAttribute.class); + private OffsetAttribute offAttr = addAttribute(OffsetAttribute.class); + + public TopoTokenStream(List> edgesByPos) { + this.edgesByPos = edgesByPos; + } + + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + while (currentPos < edgesByPos.size() + && currentEdgeIndex == edgesByPos.get(currentPos).size()) { + currentEdgeIndex = 0; + currentPos += 1; + } + if (currentPos == edgesByPos.size()) { + return false; + } + EdgeToken currentEdge = edgesByPos.get(currentPos).get(currentEdgeIndex); + + charAttr.append((char) currentEdge.value); + + incAttr.setPositionIncrement(currentEdgeIndex == 0 ? 1 : 0); + + lenAttr.setPositionLength(currentEdge.destination - currentPos); + + offAttr.setOffset(currentPos, currentEdge.destination); + + currentEdgeIndex++; + + return true; + } + + @Override + public void reset() throws IOException { + super.reset(); + clearAttributes(); + currentPos = 0; + currentEdgeIndex = 0; + } + + @Override + public void end() throws IOException { + clearAttributes(); + incAttr.setPositionIncrement(0); + // -1 because we don't count the terminal state as a position in the TokenStream + offAttr.setOffset(edgesByPos.size() - 1, edgesByPos.size() - 1); + } + } + + /** Edge between position nodes. These edges will be output as tokens in the TokenStream */ + private static class EdgeToken { + public final int destination; + public final int value; + + public EdgeToken(int destination, int value) { + this.destination = destination; + this.value = value; + } + } + + /** Node that contains original node id and position in TokenStream */ + private static class RemapNode { + public final int id; + public final int pos; + + public RemapNode(int id, int pos) { + this.id = id; + this.pos = pos; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java new file mode 100644 index 000000000000..369856eaf89f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; + +public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase { + + public void testSinglePath() throws IOException { + List acceptStrings = new ArrayList<>(); + acceptStrings.add(new BytesRef("abc")); + + Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); + assertTokenStreamContents( + ts, + new String[] {"a", "b", "c"}, + new int[] {0, 1, 2}, + new int[] {1, 2, 3}, + new int[] {1, 1, 1}, + new int[] {1, 1, 1}, + 3); + } + + public void testParallelPaths() throws IOException { + List acceptStrings = new ArrayList<>(); + acceptStrings.add(new BytesRef("123")); + acceptStrings.add(new BytesRef("abc")); + + Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); + assertTokenStreamContents( + ts, + new String[] {"1", "a", "2", "b", "3", "c"}, + new int[] {0, 0, 1, 1, 2, 2}, + new int[] {1, 1, 2, 2, 3, 3}, + new int[] {1, 0, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1, 1}, + 3); + } + + public void testForkedPath() throws IOException { + List acceptStrings = new ArrayList<>(); + acceptStrings.add(new BytesRef("ab3")); + acceptStrings.add(new BytesRef("abc")); + + Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); + assertTokenStreamContents( + ts, + new String[] {"a", "b", "3", "c"}, + new int[] {0, 1, 2, 2}, + new int[] {1, 2, 3, 3}, + new int[] {1, 1, 1, 0}, + new int[] {1, 1, 1, 1}, + 3); + } + + public void testNonDeterministicGraph() throws IOException { + Automaton.Builder builder = new Automaton.Builder(); + int start = builder.createState(); + int middle1 = builder.createState(); + int middle2 = builder.createState(); + int accept = builder.createState(); + + builder.addTransition(start, middle1, 'a'); + builder.addTransition(start, middle2, 'a'); + builder.addTransition(middle1, accept, 'b'); + builder.addTransition(middle2, accept, 'c'); + builder.setAccept(accept, true); + + Automaton nfa = builder.finish(); + TokenStream ts = AutomatonToTokenStream.toTokenStream(nfa); + assertTokenStreamContents( + ts, + new String[] {"a", "a", "b", "c"}, + new int[] {0, 0, 1, 1}, + new int[] {1, 1, 2, 2}, + new int[] {1, 0, 1, 0}, + new int[] {1, 1, 1, 1}, + 2); + } + + public void testGraphWithStartNodeCycle() { + Automaton.Builder builder = new Automaton.Builder(); + int start = builder.createState(); + int middle = builder.createState(); + int accept = builder.createState(); + + builder.addTransition(start, middle, 'a'); + builder.addTransition(middle, accept, 'b'); + builder.addTransition(middle, start, '1'); + + builder.setAccept(accept, true); + + Automaton cycleGraph = builder.finish(); + expectThrows( + IllegalArgumentException.class, () -> AutomatonToTokenStream.toTokenStream(cycleGraph)); + } + + public void testGraphWithNonStartCycle() { + Automaton.Builder builder = new Automaton.Builder(); + int start = builder.createState(); + int middle = builder.createState(); + int accept = 
builder.createState(); + + builder.addTransition(start, middle, 'a'); + builder.addTransition(middle, accept, 'b'); + builder.addTransition(accept, middle, 'c'); + builder.setAccept(accept, true); + + Automaton cycleGraph = builder.finish(); + expectThrows( + IllegalArgumentException.class, () -> AutomatonToTokenStream.toTokenStream(cycleGraph)); + } +}
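
A minimal usage sketch (not part of this patch) showing how the new AutomatonToTokenStream output can be fed through FlattenGraphFilter, mirroring what testPathsNotLost exercises above. The class name FlattenGraphDemo is illustrative only; it assumes the APIs added in this diff plus Lucene's existing DaciukMihovAutomatonBuilder:

import java.util.Arrays;
import org.apache.lucene.analysis.AutomatonToTokenStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;

public class FlattenGraphDemo {
  public static void main(String[] args) throws Exception {
    // Accept strings must be given in sorted order for DaciukMihovAutomatonBuilder.
    Automaton automaton =
        DaciukMihovAutomatonBuilder.build(
            Arrays.asList(new BytesRef("abc"), new BytesRef("abd")));

    // Convert the automaton to a token stream, then flatten it, as the new tests do.
    TokenStream ts = new FlattenGraphFilter(AutomatonToTokenStream.toTokenStream(automaton));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // single-character terms taken from the automaton edges
    }
    ts.end();
    ts.close();
  }
}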