From c14582dbcd93894f2651ed4f413d5d108a6535a8 Mon Sep 17 00:00:00 2001
From: Lawson
Date: Mon, 16 Aug 2021 11:34:58 +0900
Subject: [PATCH] LUCENE-9963 Improve FlattenGraphFilter's robustness when handling incoming token graphs with holes (#157)

Six main improvements: 1) Iterate through all output.inputNodes, since dest gaps can exist. 2) freeBefore the minimum input node instead of the first input node (which was usually, but not always, the minimum). 3) Don't freeBefore from a hole source node: bookkeeping may not be correct there and could result in an early free. 4) When adding an output node after hole recovery, calculate its new position increment instead of adding it to the end of the output graph. 5) Nodes after holes that have edges to their source will do the output re-mapping that the deleted node would have done. 6) If a disconnected input node swaps order with another node in the output, map them both to the same output node.

--- .../analysis/core/FlattenGraphFilter.java | 165 ++++-- .../analysis/core/TestFlattenGraphFilter.java | 505 +++++++++++++++++- .../analysis/AutomatonToTokenStream.java | 197 +++++++ .../analysis/TestAutomatonToTokenStream.java | 136 +++++ 4 files changed, 954 insertions(+), 49 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java create mode 100644 lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java index 01e1f6f7dfc1..13a4085b1fc9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
@@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.lucene.analysis.TokenFilter;
@@ -66,9 +67,13 @@ private final static class InputNode implements RollingBuffer.Resettable { * to know when we can freeze. */ int maxToNode = -1; - /** Where we currently map to; this changes (can only - * increase as we see more input tokens), until we are finished - * with this position. */ + /** Minimum to-node for all tokens leaving here; we use this to check if holes exist. */ + int minToNode = Integer.MAX_VALUE; + + /** + * Where we currently map to; this changes (can only increase as we see more input tokens), + * until we are finished with this position. + */ int outputNode = -1; /** Which token (index into {@link #tokens}) we will next output. */ int nextOut;
@@ -80,6 +85,7 @@ public void reset() { node = -1; outputNode = -1; maxToNode = -1; + minToNode = Integer.MAX_VALUE; nextOut = 0; } }
@@ -188,14 +194,21 @@ private boolean releaseBufferedToken() { } if (inputNode.tokens.size() == 0) { assert inputNode.nextOut == 0; - assert output.nextOut == 0; // Hole dest nodes should never be merged since 1) we always // assign them to a new output position, and 2) since they never - // have arriving tokens they cannot be pushed: - assert output.inputNodes.size() == 1: output.inputNodes.size(); - outputFrom++; - inputNodes.freeBefore(output.inputNodes.get(0)); - outputNodes.freeBefore(outputFrom); + // have arriving tokens they cannot be pushed. Skip them but don't free + // input until all are checked.
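+ // (An output position can still hold several input nodes here, so step output.nextOut past
+ // each of them before freeing; see the inputNodes.size() > 1 branch below.)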
+ // Related tests testAltPathLastStepLongHole, testAltPathLastStepHoleFollowedByHole, + // testAltPathLastStepHoleWithoutEndToken + if (output.inputNodes.size() > 1) { + output.nextOut++; + if (output.nextOut < output.inputNodes.size()) { + continue; + } + } + // Don't free from a hole src. Since no edge leaves here, bookkeeping may be incorrect. + // Later output nodes may point to earlier input nodes, so we don't want to free them yet. + freeBefore(output); continue; }
@@ -234,9 +247,7 @@ if (inputNode.nextOut == inputNode.tokens.size()) { output.nextOut++; if (output.nextOut == output.inputNodes.size()) { - outputFrom++; - inputNodes.freeBefore(output.inputNodes.get(0)); - outputNodes.freeBefore(outputFrom); + freeBefore(output); } }
@@ -250,6 +261,30 @@ return false; }
+ /** + * Free input nodes before the minimum input node for the given output. + * + * @param output target output node + */ + private void freeBefore(OutputNode output) { + /* We've released all of the tokens that end at the current output, so free all output nodes before this. Input nodes are more complex: a second shingled token with alternate paths can appear later in the output graph than some of its alternate-path tokens. Because of this we can only free from the minimum input node, since the minimum will have come from before the second shingled token. This means we have to hold onto input nodes whose tokens get stacked on previous nodes until we've completely passed those inputs. Related tests testShingledGap, testShingledGapWithHoles */ + outputFrom++; + int freeBefore = Collections.min(output.inputNodes); + // This will catch a node being freed early if it is input to the next output. + // Could a freed early node be input to a later output? + assert outputNodes.get(outputFrom).inputNodes.stream().filter(n -> freeBefore > n).count() == 0 : "FreeBefore " + freeBefore + " will free in-use nodes"; + inputNodes.freeBefore(freeBefore); + outputNodes.freeBefore(outputFrom); + }
@Override public boolean incrementToken() throws IOException { //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);
@@ -267,7 +302,8 @@ if (input.incrementToken()) { // Input node this token leaves from: - inputFrom += posIncAtt.getPositionIncrement(); + int positionIncrement = posIncAtt.getPositionIncrement(); + inputFrom += positionIncrement; int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset();
@@ -278,27 +314,44 @@ InputNode src = inputNodes.get(inputFrom); if (src.node == -1) { - // This means the "from" node of this token was never seen as a "to" node, - // which should only happen if we just crossed a hole. This is a challenging - // case for us because we normally rely on the full dependencies expressed - // by the arcs to assign outgoing node IDs. It would be better if tokens - // were never dropped but instead just marked deleted with a new - // TermDeletedAttribute (boolean valued) ...
but until that future, we have - // a hack here to forcefully jump the output node ID: - assert src.outputNode == -1; - src.node = inputFrom; - - src.outputNode = outputNodes.getMaxPos() + 1; - //System.out.println(" hole: force to outputNode=" + src.outputNode); - OutputNode outSrc = outputNodes.get(src.outputNode); + recoverFromHole(src, startOffset, positionIncrement); - // Not assigned yet: - assert outSrc.node == -1; - outSrc.node = src.outputNode; - outSrc.inputNodes.add(inputFrom); - outSrc.startOffset = startOffset; } else { OutputNode outSrc = outputNodes.get(src.outputNode); + /* If positionIncrement > 1 and the position we're incrementing from doesn't come to the current node we've crossed a hole. + * The long edge will point too far back and not account for the holes unless it gets fixed. + * example: + * _____abc______ + * | | + * | V + * O-a->O- ->O- ->O-d->O + * + * A long edge may have already made this fix though, if src is more than 1 position ahead in the output there's no additional work to do + * example: + * _____abc______ + * | ....bc....| + * | . VV + * O-a->O- ->O- ->O-d->O + */ + if (positionIncrement > 1 + && src.outputNode - inputNodes.get(inputFrom - positionIncrement).outputNode <= 1 + && inputNodes.get(inputFrom - positionIncrement).minToNode != inputFrom) { + /* If there was a hole at the end of an alternate path then the input and output nodes + * have been created, + * but the offsets and increments have not been maintained correctly. Here we go back + * and fix them. + * Related test testAltPathLastStepHole + * The last node in the alt path didn't arrive to remove this reference. + */ + assert inputNodes.get(inputFrom).tokens.isEmpty() : "about to remove non empty edge"; + outSrc.inputNodes.remove(Integer.valueOf(inputFrom)); + src.outputNode = -1; + int prevEndOffset = outSrc.endOffset; + + outSrc = recoverFromHole(src, startOffset, positionIncrement); + outSrc.endOffset = prevEndOffset; + } + if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) { // "shrink wrap" the offsets so the original tokens (with most // restrictive offsets) win: @@ -309,6 +362,7 @@ public boolean incrementToken() throws IOException { // Buffer this token: src.tokens.add(captureState()); src.maxToNode = Math.max(src.maxToNode, inputTo); + src.minToNode = Math.min(src.minToNode, inputTo); maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize()); InputNode dest = inputNodes.get(inputTo); @@ -353,6 +407,55 @@ public boolean incrementToken() throws IOException { } } + private OutputNode recoverFromHole(InputNode src, int startOffset, int posinc) { + // This means the "from" node of this token was never seen as a "to" node, + // which should only happen if we just crossed a hole. This is a challenging + // case for us because we normally rely on the full dependencies expressed + // by the arcs to assign outgoing node IDs. It would be better if tokens + // were never dropped but instead just marked deleted with a new + // TermDeletedAttribute (boolean valued) ... but until that future, we have + // a hack here to forcefully jump the output node ID: + assert src.outputNode == -1; + src.node = inputFrom; + + int outIndex; + int previousInputFrom = inputFrom - posinc; + if (previousInputFrom >= 0) { + InputNode offsetSrc = inputNodes.get(previousInputFrom); + /* Select output src node. Need to make sure the new output node isn't placed too far ahead. 
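* A recovered node is reachable only by a position increment, never by an edge, so nothing else constrains where it lands in the output.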
+ * If a disconnected node is placed at the end of the output graph that may place it after output nodes that map to input nodes that are after src in the input. + * Since it is disconnected there is no path to it, and there could be holes after meaning no paths to following nodes. This "floating" edge will cause problems in FreeBefore. + * In the following section make sure the edge connects to something. + * Related test testLongHole testAltPathLastStepHoleFollowedByHole, testAltPathFirstStepHole, testShingledGapWithHoles + */ + if (offsetSrc.minToNode < inputFrom) { + // There is a possible path to this node. + // place this node one position off from the possible path keeping a 1 inc gap. + // Can't be larger than 1 inc or risk getting disconnected. + outIndex = inputNodes.get(offsetSrc.minToNode).outputNode + 1; + } else { + // no information about how the current node was previously connected. + // Connect it to the end. + outIndex = outputNodes.getMaxPos(); + } + } else { + // in case the first token in the stream is a hole we have no input node to increment from. + outIndex = outputNodes.getMaxPos() + 1; + } + OutputNode outSrc = outputNodes.get(outIndex); + src.outputNode = outIndex; + + // OutSrc may have other inputs + if (outSrc.node == -1) { + outSrc.node = src.outputNode; + outSrc.startOffset = startOffset; + } else { + outSrc.startOffset = Math.max(startOffset, outSrc.startOffset); + } + outSrc.inputNodes.add(inputFrom); + return outSrc; + } + // Only for debugging: /* private void printStates() { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java index 96c940cab64b..75e0da3d2299 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java @@ -17,10 +17,15 @@ package org.apache.lucene.analysis.core; +import java.io.IOException; import java.util.ArrayList; +import java.util.Comparator; +import java.util.LinkedList; import java.util.List; import java.util.Random; +import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AutomatonToTokenStream; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.CharArraySet; @@ -28,11 +33,17 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.Transition; public class TestFlattenGraphFilter extends BaseTokenStreamTestCase { @@ -289,10 +300,10 @@ public void testTwoLongParallelPaths() throws Exception { } - // The end node the long path is supposed to flatten over doesn't exist + // b has a posInc of 1, which is correct, but no edge ever visited that node. 
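+ // (FlattenGraphFilter sees 'b' leave from a from-node that no edge ever arrived at, which is
+ // what triggers recoverFromHole.)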
+ // After hole recovery 'b' and 'c' should still be under 'abc' // assert disabled = pos length of abc = 4 // assert enabled = AssertionError: outputEndNode=3 vs inputTo=2 - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963") public void testAltPathFirstStepHole() throws Exception { TokenStream in = new CannedTokenStream( @@ -312,9 +323,9 @@ public void testAltPathFirstStepHole() throws Exception { 3); }
- // Last node in an alt path releases the long path. but it doesn't exist in this graph - // pos length of abc = 1 - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963") + // Last node in an alt path fixes the output node of the long path. In this graph the follow-up + // node performs that fix. + // incorrect pos length of abc = 1 public void testAltPathLastStepHole() throws Exception { TokenStream in = new CannedTokenStream( @@ -339,7 +350,7 @@ public void testAltPathLastStepHole() throws Exception { 4); }
- // Posinc >2 gets squashed to 2 + // Check to see how multiple holes in a row are preserved. public void testLongHole() throws Exception { TokenStream in = new CannedTokenStream( @@ -361,11 +372,9 @@ public void testLongHole() throws Exception { 28); }
- // multiple nodes missing in the alt path. Last edge shows up after long edge and short edge, - // which looks good but the output graph isn't flat. + // Multiple nodes missing in the alt path. // assert disabled = nothing // assert enabled = AssertionError - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963") public void testAltPathLastStepLongHole() throws Exception { TokenStream in = new CannedTokenStream( @@ -380,16 +389,15 @@ public void testAltPathLastStepLongHole() throws Exception { new String[] {"abc", "a", "d"}, new int[] {0, 0, 3}, new int[] {1, 1, 4}, - new int[] {1, 0, 1}, - new int[] {1, 1, 1}, + new int[] {1, 0, 2}, + new int[] {2, 1, 1}, 4); }
// LUCENE-8723 - // Token stream ends without last node showing up + // Token stream ends without any edge to fix the long edge's output node // assert disabled = dropped token // assert enabled = AssertionError: 2 - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963") public void testAltPathLastStepHoleWithoutEndToken() throws Exception { TokenStream in = new CannedTokenStream( @@ -409,6 +417,189 @@ public void testAltPathLastStepHoleWithoutEndToken() throws Exception { 2); }
+ // Similar to AltPathLastStepHoleWithoutEndToken, but instead of there being no token to trigger + // long-path resolution, the next token has no way to reference the long path, so we have to + // resolve as if that last token weren't present. + public void testAltPathLastStepHoleFollowedByHole() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 5, + new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("e", 3, 1, 4, 5)}); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "b", "e"}, + new int[] {0, 1, 4}, + new int[] {3, 2, 5}, + new int[] {1, 1, 2}, + new int[] {1, 1, 1}, + 5); + }
+ + // Two shingled long paths pass each other, which gives a flattened graph with tokens backing + // up a lot.
+ public void testShingledGap() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 5, + new Token[] { + token("abc", 1, 3, 0, 3), + token("a", 0, 1, 0, 1), + token("b", 1, 1, 1, 2), + token("cde", 1, 3, 2, 5), + token("d", 1, 1, 3, 4), + token("e", 1, 1, 4, 5) + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "a", "d", "b", "cde", "e"}, + new int[] {0, 0, 3, 3, 4, 4}, + new int[] {1, 1, 3, 3, 5, 5}, + new int[] {1, 0, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1, 1}, + 5); + } + + // With shingles, token order may change during flattening. + // We need to be careful not to free input nodes if they still have unreleased edges. + // with/without exceptions ArrayIndexOutOfBoundsException + public void testShingledGapWithHoles() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 5, + new Token[] { + token("abc", 1, 3, 0, 3), + token("b", 1, 1, 1, 2), + token("cde", 1, 3, 2, 5), + token("d", 1, 1, 3, 4), + token("e", 1, 1, 4, 5) + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, + new String[] {"abc", "d", "b", "cde", "e"}, + new int[] {0, 3, 3, 4, 4}, + new int[] {3, 3, 3, 5, 5}, + new int[] {1, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1}, + 5); + } + + // When the first token is a hole there is no original token to offset from. + public void testFirstTokenHole() throws Exception { + TokenStream in = new CannedTokenStream(0, 9, new Token[] {token("start", 2, 1, 0, 5)}); + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents( + out, new String[] {"start"}, new int[] {0}, new int[] {5}, new int[] {2}, new int[] {1}, 9); + } + + // The singled token starts from a hole. + // Hole recovery will cause the shingled token to start later in the output than its alternate + // paths. + // This will result in it being released too early. + public void testShingleFromGap() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 9, + new Token[] { + token("a", 1, 1, 4, 8), + token("abc", 0, 3, 4, 7), + token("cd", 2, 2, 6, 8), + token("d", 1, 1, 7, 8), + token("e", 1, 1, 8, 9) + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"a", "abc", "d", "cd", "e"}, + new int[] {4, 4, 7, 7, 8}, + new int[] {7, 7, 8, 8, 9}, + new int[] {1, 0, 1, 1, 1}, + new int[] {1, 1, 2, 1, 1}, + 9); + } + + public void testShingledGapAltPath() throws Exception { + TokenStream in = + new CannedTokenStream( + 0, + 4, + new Token[] { + token("abc", 1, 3, 0, 3), token("abcd", 0, 4, 0, 4), token("cd", 2, 2, 2, 4), + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"abc", "abcd", "cd"}, + new int[] {0, 0, 2}, + new int[] {3, 4, 4}, + new int[] {1, 0, 1}, + new int[] {1, 2, 1}, + 4); + } + + // Lots of shingles and alternate paths connecting to each other. 
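+ // Stresses freeBefore's minimum-input-node bookkeeping under heavy token stacking.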
One edge 'c' missing between + // 'ab' and 'def' + public void testHeavilyConnectedGraphWithGap() throws IOException { + TokenStream in = + new CannedTokenStream( + 0, + 7, + new Token[] { + token("a", 1, 1, 0, 1), + token("ab", 0, 2, 0, 2), + token("abcdef", 0, 6, 0, 6), + token("abcd", 0, 4, 0, 4), + token("bcdef", 1, 5, 1, 7), + token("def", 2, 3, 4, 7), + token("e", 1, 1, 5, 6), + token("f", 1, 1, 6, 7) + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"a", "ab", "abcdef", "abcd", "bcdef", "e", "def", "f"}, + new int[] {0, 0, 0, 0, 5, 5, 6, 6}, + new int[] {1, 1, 7, 1, 7, 6, 7, 7}, + new int[] {1, 0, 0, 0, 1, 0, 1, 0}, + new int[] {1, 1, 3, 1, 2, 1, 1, 1}, + 7); + } + // This graph can create a disconnected input node that is farther ahead in the output than its + // subsequent input node. + // Exceptions: Free too early or dropped tokens. + public void testShingleWithLargeLeadingGap() throws IOException { + TokenStream in = + new CannedTokenStream( + 0, + 6, + new Token[] { + token("abcde", 1, 5, 0, 5), token("ef", 4, 2, 4, 6), token("f", 1, 1, 5, 6), + }); + TokenStream out = new FlattenGraphFilter(in); + assertTokenStreamContents( + out, + new String[] {"abcde", "f", "ef"}, + new int[] {0, 5, 5}, + new int[] {5, 6, 6}, + new int[] {1, 1, 0}, + new int[] {1, 1, 1}, + 6); + } + /** * build CharsRef containing 2-4 tokens * @@ -429,8 +620,6 @@ private CharsRef buildMultiTokenCharsRef( } // Create a random graph then delete some edges to see if we can trip up FlattenGraphFilter - // Is there some way we can do this and validate output nodes? - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963") public void testRandomGraphs() throws Exception { String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"}; String[] synTokens = new String[] {"s1", "s2", "s3", "s4"}; @@ -439,7 +628,7 @@ public void testRandomGraphs() throws Exception { CharsRefBuilder charRefBuilder = new CharsRefBuilder(); Random random = random(); - // between 20 and 20 synonym entries + // between 10 and 20 synonym entries int synCount = random.nextInt(10) + 10; for (int i = 0; i < synCount; i++) { int type = random.nextInt(4); @@ -491,7 +680,7 @@ public void testRandomGraphs() throws Exception { stopWords.add(tokenArray[index]); } - Analyzer a = + Analyzer withFlattenGraph = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { @@ -510,8 +699,288 @@ protected TokenStreamComponents createComponents(String fieldName) { } String text = String.join(" ", stringTokens); - checkAnalysisConsistency(random, a, false, text); + // FlattenGraphFilter can create inconsistent offsets. + // If that is resolved we can check offsets + // Until then converting to automaton will pull text through and check if we hit asserts. + // checkAnalysisConsistency(random, withFlattenGraph, false, text); + TokenStreamToAutomaton tsta = new TokenStreamToAutomaton(); + TokenStream flattenedTokenStream = withFlattenGraph.tokenStream("field", text); + assertFalse(Operations.hasDeadStates(tsta.toAutomaton(flattenedTokenStream))); + flattenedTokenStream.close(); + + /* + CheckGeneralization can get VERY slow as matching holes to tokens or other holes generates a lot of potentially valid paths. 
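+ Until that is workable, the active check above only asserts that the flattened automaton has no dead states.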
+ Analyzer withoutFlattenGraph = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer in = new WhitespaceTokenizer(); + TokenStream result = new SynonymGraphFilter(in, synMap, true); + result = new StopFilter(result, stopWords); + return new TokenStreamComponents(in, result); + } + }; + checkGeneralization( + withFlattenGraph.tokenStream("field", text), + withoutFlattenGraph.tokenStream("field", text)); + + */ }
+ /* + * Make some strings, make an automaton that accepts those strings, convert that automaton into a TokenStream, + * flatten it, convert back to an automaton, and see if the original strings are still accepted. + */ + public void testPathsNotLost() throws IOException { + int wordCount = random().nextInt(5) + 5; + List<BytesRef> acceptStrings = new LinkedList<>(); + for (int i = 0; i < wordCount; i++) { + int wordLen = random().nextInt(5) + 5; + BytesRef ref = new BytesRef(wordLen); + ref.length = wordLen; + ref.offset = 0; + for (int j = 0; j < wordLen; j++) { + ref.bytes[j] = (byte) (random().nextInt(5) + 65); + } + acceptStrings.add(ref); + } + acceptStrings.sort(Comparator.naturalOrder()); + + acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList()); + Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + + TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton); + TokenStream flattenedTokenStream = new FlattenGraphFilter(ts); + TokenStreamToAutomaton tsta = new TokenStreamToAutomaton(); + Automaton flattenedAutomaton = tsta.toAutomaton(flattenedTokenStream); + + // TokenStreamToAutomaton adds position increment transitions into the automaton. + List<BytesRef> acceptStringsWithPosSep = createAcceptStringsWithPosSep(acceptStrings); + + for (BytesRef acceptString : acceptStringsWithPosSep) { + assertTrue( + "string not accepted " + acceptString.utf8ToString(), + recursivelyValidate(acceptString, 0, 0, flattenedAutomaton)); + } + }
+ + /** + * Adds POS_SEP bytes between token bytes to match the TokenStreamToAutomaton format. + * + * @param acceptStrings Byte refs of accepted strings. Each byte is a transition. + * @return List of BytesRefs where each byte is separated by a POS_SEP byte. + */ + private List<BytesRef> createAcceptStringsWithPosSep(List<BytesRef> acceptStrings) { + List<BytesRef> acceptStringsWithPosSep = new ArrayList<>(); + for (BytesRef acceptString : acceptStrings) { + BytesRef withPosSep = new BytesRef(acceptString.length * 2 - 1); + withPosSep.length = acceptString.length * 2 - 1; + withPosSep.offset = 0; + for (int i = 0; i < acceptString.length; i++) { + withPosSep.bytes[i * 2] = acceptString.bytes[i]; + if (i * 2 + 1 < withPosSep.length) { + withPosSep.bytes[i * 2 + 1] = TokenStreamToAutomaton.POS_SEP; + } + } + acceptStringsWithPosSep.add(withPosSep); + } + return acceptStringsWithPosSep; + }
+ + /** + * Checks if acceptString is accepted by the automaton. Automaton may be an NFA. + * + * @param acceptString String to test + * @param acceptStringIndex current index into acceptString, initial value should be 0 + * @param state state to transition from. initial value should be 0 + * @param automaton Automaton to test + * @return true if acceptString is accepted by the automaton. Otherwise false.
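+ * Does a depth-first search that tries every matching transition, so an NFA is handled
+ * without determinizing it first.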
+ */ + public boolean recursivelyValidate( + BytesRef acceptString, int acceptStringIndex, int state, Automaton automaton) { + if (acceptStringIndex == acceptString.length) { + return automaton.isAccept(state); + } + + Transition transition = new Transition(); + automaton.initTransition(state, transition); + int numTransitions = automaton.getNumTransitions(state); + boolean accept = false; + // Automaton can be NFA, so we need to check all matching transitions + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + if (transition.min <= acceptString.bytes[acceptStringIndex] + && transition.max >= acceptString.bytes[acceptStringIndex]) { + accept = + recursivelyValidate(acceptString, acceptStringIndex + 1, transition.dest, automaton); + } + if (accept == true) { + break; + } + } + return accept; + } + + /** + * This method checks if strings that lead to the accept state of the not flattened TokenStream + * also lead to the accept state in the flattened TokenStream. This gets complicated when you + * factor in holes. The FlattenGraphFilter will remove alternate paths that are made entirely of + * holes. An alternate path of Holes is indistinguishable from a path that just has long + * lengths(ex: testStrangelyNumberedNodes). Also alternate paths that end in multiple holes could + * be interpreted as sequential holes after the branching has converged during flattening. This + * leads to a lot of weird logic about navigating around holes that may compromise the accuracy of + * this test. + * + * @param flattened flattened TokenStream + * @param notFlattened not flattened TokenStream + * @throws IOException on error creating Automata + */ + /* private void checkGeneralization(TokenStream flattened, TokenStream notFlattened) + throws IOException { + TokenStreamToAutomaton tsta = new TokenStreamToAutomaton(); + + List> acceptStrings = getAcceptStrings(tsta.toAutomaton(notFlattened)); + checkAcceptStrings(acceptStrings, tsta.toAutomaton(flattened)); + flattened.close(); + notFlattened.close(); + }*/ + + /** + * gets up to 10000 strings that lead to accept state in the given automaton. + * + * @param automaton automaton + * @return list of accept sequences + */ + /* private List> getAcceptStrings(Automaton automaton) { + List> acceptedSequences = new LinkedList<>(); + LinkedList prefix = new LinkedList<>(); + // state 0 is always the start node + // Particularly branching automatons can create lots of possible acceptable strings. limit to + // the first 10K + buildAcceptStringRecursive(automaton, 0, prefix, acceptedSequences, 10000); + return acceptedSequences; + }*/ + + /** + * @param automaton automaton to generate strings from + * @param state state to start at + * @param prefix string prefix + * @param acceptedSequences List of strings build so far. + * @param limit maximum number of acceptedSequences. 
+ */ + /*private void buildAcceptStringRecursive( + Automaton automaton, + int state, + LinkedList prefix, + List> acceptedSequences, + int limit) { + if (acceptedSequences.size() == limit) { + return; + } + if (automaton.isAccept(state)) { + acceptedSequences.add(new LinkedList<>(prefix)); + return; + } + int numTransitions = automaton.getNumTransitions(state); + Transition transition = new Transition(); + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + // min and max are the same transitions made from TokenStreamToAutomaton + prefix.addLast(transition.min); + buildAcceptStringRecursive(automaton, transition.dest, prefix, acceptedSequences, limit); + prefix.removeLast(); + } + } + + private void checkAcceptStrings(List> acceptSequence, Automaton automaton) { + for (LinkedList acceptString : acceptSequence) { + assertTrue( + "String did not lead to accept state " + acceptString, + recursivelyValidateWithHoles(acceptString, 0, automaton)); + } + } + + private boolean recursivelyValidateWithHoles( + LinkedList acceptSequence, int state, Automaton automaton) { + if (acceptSequence.isEmpty()) { + return automaton.isAccept(state); + } + + Integer curr = acceptSequence.pop(); + int numTransitions = automaton.getNumTransitions(state); + Transition transition = new Transition(); + + boolean accept = false; + // Automaton can be NFA, so we need to check all matching transitions + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + if (transition.min <= curr && transition.max >= curr) { + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + // Factoring in flattened graphs the space covered by a hole may be bigger in the flattened + // graph. + // Try consuming more steps with holes. + if (accept == false + && transition.min == TokenStreamToAutomaton.HOLE + && transition.max == TokenStreamToAutomaton.HOLE) { + acceptSequence.push(TokenStreamToAutomaton.HOLE); + acceptSequence.push(TokenStreamToAutomaton.POS_SEP); + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + acceptSequence.pop(); + acceptSequence.pop(); + } + } else if (transition.min == TokenStreamToAutomaton.HOLE + && transition.max == TokenStreamToAutomaton.HOLE + && automaton.getNumTransitions(transition.dest) > 0) { + //consume multiple holes in the automaton + // clear POS_INC + automaton.getTransition(transition.dest, 0, transition); + acceptSequence.push(curr); + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + acceptSequence.pop(); + } else if(curr == TokenStreamToAutomaton.HOLE) { + //consume non-holes in the automaton with holes + while (transition.min != TokenStreamToAutomaton.POS_SEP + && automaton.getNumTransitions(transition.dest) > 0) { + automaton.getTransition(transition.dest, 0, transition); + } + acceptSequence.push(curr); + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + acceptSequence.pop(); + } + if (accept) { + break; + } + } + // Flatten graph filter will remove side paths that are only Holes. Gaps may also change size as + // graph is flattened. 
+ // Traverse over them if curr is a hole to make sure the gap is kept + if (accept == false && curr == TokenStreamToAutomaton.HOLE && acceptSequence.size() > 0) { + // get rid of the separator + acceptSequence.pop(); + + for (int i = 0; i < numTransitions; i++) { + automaton.getTransition(state, i, transition); + //advance to the next POS_SEP in automaton + while (transition.min != TokenStreamToAutomaton.POS_SEP + && automaton.getNumTransitions(transition.dest) > 0) { + automaton.getTransition(transition.dest, 0, transition); + } + accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton); + if (accept) { + break; + } + } + + // might be multiple holes squashed under a one step path. Try burning remaining holes + if (accept == false) { + accept = recursivelyValidateWithHoles(acceptSequence, state, automaton); + } + + acceptSequence.push(TokenStreamToAutomaton.POS_SEP); + } + acceptSequence.push(curr); + return accept; + } */ + // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java b/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java new file mode 100644 index 000000000000..ef1bbd20bea5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/analysis/AutomatonToTokenStream.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.Transition; + +/** Converts an Automaton into a TokenStream. */ +public class AutomatonToTokenStream { + + private AutomatonToTokenStream() {} + + /** + * converts an automaton into a TokenStream. This is done by first Topo sorting the nodes in the + * Automaton. Nodes that have the same distance from the start are grouped together to form the + * position nodes for the TokenStream. The resulting TokenStream releases edges from the automaton + * as tokens in order from the position nodes. This requires the automaton be a finite DAG. + * + * @param automaton automaton to convert. Must be a finite DAG. + * @return TokenStream representation of automaton. 
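+ * @throws IllegalArgumentException if the automaton is not finite or contains a cycle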
*/ + public static TokenStream toTokenStream(Automaton automaton) { + if (Operations.isFinite(automaton) == false) { + throw new IllegalArgumentException("Automaton must be finite"); + } + + List<List<Integer>> positionNodes = new ArrayList<>(); + + Transition[][] transitions = automaton.getSortedTransitions(); + + int[] indegree = new int[transitions.length]; + + for (int i = 0; i < transitions.length; i++) { + for (int edge = 0; edge < transitions[i].length; edge++) { + indegree[transitions[i][edge].dest] += 1; + } + } + if (indegree[0] != 0) { + throw new IllegalArgumentException("Start node has incoming edges, creating cycle"); + }
+ + LinkedList<RemapNode> noIncomingEdges = new LinkedList<>(); + Map<Integer, Integer> idToPos = new HashMap<>(); + noIncomingEdges.addLast(new RemapNode(0, 0)); + while (noIncomingEdges.isEmpty() == false) { + RemapNode currState = noIncomingEdges.removeFirst(); + for (int i = 0; i < transitions[currState.id].length; i++) { + indegree[transitions[currState.id][i].dest] -= 1; + if (indegree[transitions[currState.id][i].dest] == 0) { + noIncomingEdges.addLast( + new RemapNode(transitions[currState.id][i].dest, currState.pos + 1)); + } + } + if (positionNodes.size() == currState.pos) { + List<Integer> posIncs = new ArrayList<>(); + posIncs.add(currState.id); + positionNodes.add(posIncs); + } else { + positionNodes.get(currState.pos).add(currState.id); + } + idToPos.put(currState.id, currState.pos); + }
+ + for (int i = 0; i < indegree.length; i++) { + if (indegree[i] != 0) { + throw new IllegalArgumentException("Cycle found in automaton"); + } + }
+ + List<List<EdgeToken>> edgesByLayer = new ArrayList<>(); + for (List<Integer> layer : positionNodes) { + List<EdgeToken> edges = new ArrayList<>(); + for (int state : layer) { + for (Transition t : transitions[state]) { + // Each edge in the token stream can carry only one value, though a transition takes a range. + for (int val = t.min; val <= t.max; val++) { + int destLayer = idToPos.get(t.dest); + edges.add(new EdgeToken(destLayer, val)); + // If there's an intermediate accept state, add an edge to the terminal state. + if (automaton.isAccept(t.dest) && destLayer != positionNodes.size() - 1) { + edges.add(new EdgeToken(positionNodes.size() - 1, val)); + } + } + } + } + edgesByLayer.add(edges); + }
+ + return new TopoTokenStream(edgesByLayer); + }
+ + /** TokenStream that outputs tokens from a topo-sorted graph. */ + private static class TopoTokenStream extends TokenStream { + + private final List<List<EdgeToken>> edgesByPos; + private int currentPos; + private int currentEdgeIndex; + private CharTermAttribute charAttr = addAttribute(CharTermAttribute.class); + private PositionIncrementAttribute incAttr = addAttribute(PositionIncrementAttribute.class); + private PositionLengthAttribute lenAttr = addAttribute(PositionLengthAttribute.class); + private OffsetAttribute offAttr = addAttribute(OffsetAttribute.class); + + public TopoTokenStream(List<List<EdgeToken>> edgesByPos) { + this.edgesByPos = edgesByPos; + }
+ + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + while (currentPos < edgesByPos.size() + && currentEdgeIndex == edgesByPos.get(currentPos).size()) { + currentEdgeIndex = 0; + currentPos += 1; + } + if (currentPos == edgesByPos.size()) { + return false; + } + EdgeToken currentEdge = edgesByPos.get(currentPos).get(currentEdgeIndex); + + charAttr.append((char) currentEdge.value); + + incAttr.setPositionIncrement(currentEdgeIndex == 0 ?
1 : 0); + + lenAttr.setPositionLength(currentEdge.destination - currentPos); + + offAttr.setOffset(currentPos, currentEdge.destination); + + currentEdgeIndex++; + + return true; + } + + @Override + public void reset() throws IOException { + super.reset(); + clearAttributes(); + currentPos = 0; + currentEdgeIndex = 0; + } + + @Override + public void end() throws IOException { + clearAttributes(); + incAttr.setPositionIncrement(0); + // -1 because we don't count the terminal state as a position in the TokenStream + offAttr.setOffset(edgesByPos.size() - 1, edgesByPos.size() - 1); + } + } + + /** Edge between position nodes. These edges will be output as tokens in the TokenStream */ + private static class EdgeToken { + public final int destination; + public final int value; + + public EdgeToken(int destination, int value) { + this.destination = destination; + this.value = value; + } + } + + /** Node that contains original node id and position in TokenStream */ + private static class RemapNode { + public final int id; + public final int pos; + + public RemapNode(int id, int pos) { + this.id = id; + this.pos = pos; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java new file mode 100644 index 000000000000..369856eaf89f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; + +public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase { + + public void testSinglePath() throws IOException { + List acceptStrings = new ArrayList<>(); + acceptStrings.add(new BytesRef("abc")); + + Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); + assertTokenStreamContents( + ts, + new String[] {"a", "b", "c"}, + new int[] {0, 1, 2}, + new int[] {1, 2, 3}, + new int[] {1, 1, 1}, + new int[] {1, 1, 1}, + 3); + } + + public void testParallelPaths() throws IOException { + List acceptStrings = new ArrayList<>(); + acceptStrings.add(new BytesRef("123")); + acceptStrings.add(new BytesRef("abc")); + + Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); + assertTokenStreamContents( + ts, + new String[] {"1", "a", "2", "b", "3", "c"}, + new int[] {0, 0, 1, 1, 2, 2}, + new int[] {1, 1, 2, 2, 3, 3}, + new int[] {1, 0, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1, 1}, + 3); + } + + public void testForkedPath() throws IOException { + List acceptStrings = new ArrayList<>(); + acceptStrings.add(new BytesRef("ab3")); + acceptStrings.add(new BytesRef("abc")); + + Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); + assertTokenStreamContents( + ts, + new String[] {"a", "b", "3", "c"}, + new int[] {0, 1, 2, 2}, + new int[] {1, 2, 3, 3}, + new int[] {1, 1, 1, 0}, + new int[] {1, 1, 1, 1}, + 3); + } + + public void testNonDeterministicGraph() throws IOException { + Automaton.Builder builder = new Automaton.Builder(); + int start = builder.createState(); + int middle1 = builder.createState(); + int middle2 = builder.createState(); + int accept = builder.createState(); + + builder.addTransition(start, middle1, 'a'); + builder.addTransition(start, middle2, 'a'); + builder.addTransition(middle1, accept, 'b'); + builder.addTransition(middle2, accept, 'c'); + builder.setAccept(accept, true); + + Automaton nfa = builder.finish(); + TokenStream ts = AutomatonToTokenStream.toTokenStream(nfa); + assertTokenStreamContents( + ts, + new String[] {"a", "a", "b", "c"}, + new int[] {0, 0, 1, 1}, + new int[] {1, 1, 2, 2}, + new int[] {1, 0, 1, 0}, + new int[] {1, 1, 1, 1}, + 2); + } + + public void testGraphWithStartNodeCycle() { + Automaton.Builder builder = new Automaton.Builder(); + int start = builder.createState(); + int middle = builder.createState(); + int accept = builder.createState(); + + builder.addTransition(start, middle, 'a'); + builder.addTransition(middle, accept, 'b'); + builder.addTransition(middle, start, '1'); + + builder.setAccept(accept, true); + + Automaton cycleGraph = builder.finish(); + expectThrows( + IllegalArgumentException.class, () -> AutomatonToTokenStream.toTokenStream(cycleGraph)); + } + + public void testGraphWithNonStartCycle() { + Automaton.Builder builder = new Automaton.Builder(); + int start = builder.createState(); + int middle = builder.createState(); + int accept = builder.createState(); + + builder.addTransition(start, middle, 'a'); + builder.addTransition(middle, accept, 
'b'); + builder.addTransition(accept, middle, 'c'); + builder.setAccept(accept, true); + + Automaton cycleGraph = builder.finish(); + expectThrows( + IllegalArgumentException.class, () -> AutomatonToTokenStream.toTokenStream(cycleGraph)); + } +}
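Usage sketch for the new API, in the spirit of testPathsNotLost above. This is illustrative only, not part of the diff; the variable names and literal strings are made up, and it assumes a context that may throw IOException:

    // Build an automaton accepting "abc" and "abd" (DaciukMihovAutomatonBuilder needs sorted input).
    List<BytesRef> terms = new ArrayList<>();
    terms.add(new BytesRef("abc"));
    terms.add(new BytesRef("abd"));
    Automaton automaton = DaciukMihovAutomatonBuilder.build(terms);

    // Replay the automaton as a token graph, flatten it, and convert it back.
    TokenStream graph = AutomatonToTokenStream.toTokenStream(automaton);
    TokenStream flattened = new FlattenGraphFilter(graph);
    Automaton roundTripped = new TokenStreamToAutomaton().toAutomaton(flattened);

    // Flattening may generalize (accept extra paths) but should not strand any state.
    assertFalse(Operations.hasDeadStates(roundTripped));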