Skip to content

Commit

Permalink
LUCENE-9963 Add tests for alternate path failures in FlattenGraphFilt…
Browse files Browse the repository at this point in the history
…er (apache#146)
  • Loading branch information
Lawson committed Aug 10, 2021
1 parent 3a7387d commit f3e0a86
Showing 1 changed file with 233 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,22 @@

package org.apache.lucene.analysis.core;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {

Expand Down Expand Up @@ -280,5 +289,229 @@ public void testTwoLongParallelPaths() throws Exception {

}

// The end node the long path is supposed to flatten over doesn't exist.
// assert disabled = pos length of abc = 4
// assert enabled = AssertionError: outputEndNode=3 vs inputTo=2
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathFirstStepHole() throws Exception {
  // "abc" spans three positions, but the alt path ("b", "c") starts one position in.
  Token[] tokens =
      new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("c", 1, 1, 2, 3)};
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 3, tokens));

  assertTokenStreamContents(
      flattened,
      new String[] {"abc", "b", "c"},
      new int[] {0, 1, 2}, // start offsets
      new int[] {3, 2, 3}, // end offsets
      new int[] {1, 1, 1}, // position increments
      new int[] {3, 1, 1}, // position lengths
      3); // final offset
}

// Last node in an alt path releases the long path, but it doesn't exist in this graph.
// pos length of abc = 1
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathLastStepHole() throws Exception {
  // Alt path "a"/"b" stops at position 2; "d" skips ahead (increment 2) to position 3.
  Token[] tokens = {
    token("abc", 1, 3, 0, 3),
    token("a", 0, 1, 0, 1),
    token("b", 1, 1, 1, 2),
    token("d", 2, 1, 3, 4)
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 4, tokens));

  assertTokenStreamContents(
      flattened,
      new String[] {"abc", "a", "b", "d"},
      new int[] {0, 0, 1, 3}, // start offsets
      new int[] {1, 1, 2, 4}, // end offsets
      new int[] {1, 0, 1, 2}, // position increments
      new int[] {3, 1, 1, 1}, // position lengths
      4); // final offset
}

// A position increment greater than 2 gets squashed down to 2.
public void testLongHole() throws Exception {
  // "hole" arrives with a position increment of 5 (a long gap after "hello").
  Token[] tokens = {
    token("hello", 1, 1, 0, 5), token("hole", 5, 1, 20, 24), token("fun", 1, 1, 25, 28),
  };
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 28, tokens));

  assertTokenStreamContents(
      flattened,
      new String[] {"hello", "hole", "fun"},
      new int[] {0, 20, 25}, // start offsets
      new int[] {5, 24, 28}, // end offsets
      new int[] {1, 2, 1}, // position increments: the gap of 5 is squashed to 2
      new int[] {1, 1, 1}, // position lengths
      28); // final offset
}

// Multiple nodes are missing in the alt path. The last edge shows up after the long edge and
// the short edge, which looks good, but the output graph isn't flat.
// assert disabled = nothing
// assert enabled = AssertionError
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathLastStepLongHole() throws Exception {
  // "d" jumps ahead with increment 3, leaving positions 1 and 2 of the alt path unfilled.
  Token[] tokens = {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("d", 3, 1, 3, 4)};
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 4, tokens));

  assertTokenStreamContents(
      flattened,
      new String[] {"abc", "a", "d"},
      new int[] {0, 0, 3}, // start offsets
      new int[] {1, 1, 4}, // end offsets
      new int[] {1, 0, 1}, // position increments
      new int[] {1, 1, 1}, // position lengths
      4); // final offset
}

// LUCENE-8723
// Token stream ends without the last node showing up.
// assert disabled = dropped token
// assert enabled = AssertionError: 2
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testAltPathLastStepHoleWithoutEndToken() throws Exception {
  // "abc" points at position 3, but the stream ends at position 2 with no closing token.
  Token[] tokens = {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("b", 1, 1, 1, 2)};
  TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, 2, tokens));

  assertTokenStreamContents(
      flattened,
      new String[] {"abc", "a", "b"},
      new int[] {0, 0, 1}, // start offsets
      new int[] {1, 1, 2}, // end offsets
      new int[] {1, 0, 1}, // position increments
      new int[] {1, 1, 1}, // position lengths
      2); // final offset
}

/**
 * Builds a CharsRef containing 2 or 3 tokens randomly drawn from the given vocabulary and joined
 * into a single synonym-map entry.
 *
 * @param tokens vocabulary of tokens to draw from
 * @param charsRefBuilder builder used to assemble the result; the caller is responsible for
 *     clearing it between calls
 * @param random source of randomness for selecting tokens
 * @return CharsRef containing 2 or 3 joined tokens
 */
private CharsRef buildMultiTokenCharsRef(
    String[] tokens, CharsRefBuilder charsRefBuilder, Random random) {
  // nextInt(2) + 2 yields 2 or 3 tokens per entry.
  int srcLen = random.nextInt(2) + 2;
  String[] srcTokens = new String[srcLen];
  for (int pos = 0; pos < srcLen; pos++) {
    // Use the supplied Random (previously this re-fetched the static random(), silently
    // ignoring the parameter) so the caller's Random actually controls the selection.
    srcTokens[pos] = tokens[random.nextInt(tokens.length)];
  }
  SynonymMap.Builder.join(srcTokens, charsRefBuilder);
  return charsRefBuilder.toCharsRef();
}

// Create a random graph then delete some edges to see if we can trip up FlattenGraphFilter.
// A random synonym map plus a random stop-word set is applied to a random token string, and
// checkAnalysisConsistency verifies the analyzer doesn't throw or produce broken streams.
// Is there some way we can do this and validate output nodes?
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
public void testRandomGraphs() throws Exception {
String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"};
String[] synTokens = new String[] {"s1", "s2", "s3", "s4"};

SynonymMap.Builder mapBuilder = new SynonymMap.Builder();
CharsRefBuilder charRefBuilder = new CharsRefBuilder();
Random random = random();

// between 10 and 19 synonym entries (nextInt(10) + 10)
int synCount = random.nextInt(10) + 10;
for (int i = 0; i < synCount; i++) {
// Pick one of four entry shapes: 1:1, many:1, 1:many, many:many.
int type = random.nextInt(4);
CharsRef src;
CharsRef dest;
switch (type) {
case 0:
// 1:1
src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
charRefBuilder.clear();
dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
charRefBuilder.clear();
break;
case 1:
// many:1
src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
charRefBuilder.clear();
dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
charRefBuilder.clear();
break;
case 2:
// 1:many
src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
charRefBuilder.clear();
dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
charRefBuilder.clear();
break;
default:
// many:many
src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
charRefBuilder.clear();
dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
charRefBuilder.clear();
}
mapBuilder.add(src, dest, true);
}

SynonymMap synMap = mapBuilder.build();

// Pick 1-4 distinct stop words from the combined base/synonym vocabulary; removing them
// later punches holes (deleted edges) in the token graph.
int stopWordCount = random.nextInt(4) + 1;
CharArraySet stopWords = new CharArraySet(stopWordCount, true);
while (stopWords.size() < stopWordCount) {
// Index into the concatenation of baseTokens and synTokens.
int index = random.nextInt(baseTokens.length + synTokens.length);
String[] tokenArray = baseTokens;
if (index >= baseTokens.length) {
index -= baseTokens.length;
tokenArray = synTokens;
}
stopWords.add(tokenArray[index]);
}

// Pipeline under test: synonyms build a graph, stop-word removal deletes edges from it,
// then FlattenGraphFilter must cope with the resulting (possibly non-flat) graph.
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer in = new WhitespaceTokenizer();
TokenStream result = new SynonymGraphFilter(in, synMap, true);
result = new StopFilter(result, stopWords);
result = new FlattenGraphFilter(result);
return new TokenStreamComponents(in, result);
}
};

// 20-39 whitespace-separated tokens of input text.
int tokenCount = random.nextInt(20) + 20;
List<String> stringTokens = new ArrayList<>();
while (stringTokens.size() < tokenCount) {
stringTokens.add(baseTokens[random.nextInt(baseTokens.length)]);
}

String text = String.join(" ", stringTokens);
checkAnalysisConsistency(random, a, false, text);
}

// NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
}

0 comments on commit f3e0a86

Please sign in to comment.