From 5c2648aff8258472105fd1e85df806f4871d8c98 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 6 Feb 2017 22:48:41 +0000 Subject: [PATCH 1/3] [LUCENE-7498] initial patch --- .../KNearestNeighborClassifier.java | 51 +- .../KNearestNeighborDocumentClassifier.java | 37 +- .../DocumentClassificationTestBase.java | 28 +- .../utils/ConfusionMatrixGeneratorTest.java | 2 +- .../search/similarities/BM25Similarity.java | 10 +- .../lucene/queries/mlt/MoreLikeThis.java | 892 ++---------------- .../queries/mlt/MoreLikeThisParameters.java | 437 +++++++++ .../mlt/{ => query}/MoreLikeThisQuery.java | 88 +- .../mlt/query/MoreLikeThisQueryBuilder.java | 85 ++ .../mlt/terms/DocumentTermFrequencies.java | 110 +++ .../mlt/terms/InterestingTermsRetriever.java | 223 +++++ .../terms/LocalDocumentTermsRetriever.java | 130 +++ .../terms/LuceneDocumentTermsRetriever.java | 68 ++ .../queries/mlt/terms/scorer/BM25Scorer.java | 89 ++ .../queries/mlt/terms/scorer/ScoredTerm.java | 48 + .../queries/mlt/terms/scorer/TFIDFScorer.java | 67 ++ .../queries/mlt/terms/scorer/TermScorer.java | 47 + .../queries/mlt/MoreLikeThisTestBase.java | 239 +++++ .../lucene/queries/mlt/TestMoreLikeThis.java | 117 +-- .../query/MoreLikeThisQueryBuilderTest.java | 92 ++ .../mlt/query/MoreLikeThisQueryTest.java | 61 ++ .../terms/DocumentTermFrequenciesTest.java | 63 ++ .../terms/InterestingTermsRetrieverTest.java | 341 +++++++ .../LocalDocumentTermsRetrieverTest.java | 335 +++++++ .../LuceneDocumentTermsRetrieverTest.java | 354 +++++++ .../mlt/terms/scorer/bm25/BM25ScorerTest.java | 128 +++ .../terms/scorer/tfidf/TFIDFScorerTest.java | 51 + .../xml/builders/LikeThisQueryBuilder.java | 4 +- .../solr/handler/MoreLikeThisHandler.java | 91 +- .../component/MoreLikeThisComponent.java | 3 +- .../solr/search/mlt/CloudMLTQParser.java | 84 +- .../solr/search/mlt/SimpleMLTQParser.java | 55 +- 32 files changed, 3313 insertions(+), 1117 deletions(-) create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisParameters.java rename lucene/queries/src/java/org/apache/lucene/queries/mlt/{ => query}/MoreLikeThisQuery.java (68%) create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilder.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/DocumentTermFrequencies.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/InterestingTermsRetriever.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetriever.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetriever.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/BM25Scorer.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/ScoredTerm.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TFIDFScorer.java create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TermScorer.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/MoreLikeThisTestBase.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilderTest.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryTest.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/DocumentTermFrequenciesTest.java create mode 100644 
lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/InterestingTermsRetrieverTest.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetrieverTest.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetrieverTest.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/bm25/BM25ScorerTest.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/tfidf/TFIDFScorerTest.java diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java index 77f04164cc3a..92af34fd6748 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java @@ -17,7 +17,6 @@ package org.apache.lucene.classification; import java.io.IOException; -import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -26,11 +25,15 @@ import java.util.Map; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.queries.mlt.MoreLikeThis; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; @@ -90,8 +93,8 @@ public class KNearestNeighborClassifier implements Classifier { * @param query a {@link Query} to eventually filter the docs used for training the classifier, or {@code null} * if all the indexed docs should be used * @param k the no. of docs to select in the MLT results to find the nearest neighbor - * @param minDocsFreq {@link MoreLikeThis#minDocFreq} parameter - * @param minTermFreq {@link MoreLikeThis#minTermFreq} parameter + * @param minDocsFreq {@link MoreLikeThisParameters#minDocFreq} parameter + * @param minTermFreq {@link MoreLikeThisParameters#minTermFreq} parameter * @param classFieldName the name of the field used as the output for the classifier * @param textFieldNames the name of the fields used as the inputs for the classifier, they can contain boosting indication e.g. 
title^10 */ @@ -100,8 +103,10 @@ public KNearestNeighborClassifier(IndexReader indexReader, Similarity similarity this.textFieldNames = textFieldNames; this.classFieldName = classFieldName; this.mlt = new MoreLikeThis(indexReader); - this.mlt.setAnalyzer(analyzer); - this.mlt.setFieldNames(textFieldNames); + MoreLikeThisParameters mltParameters = new MoreLikeThisParameters(); + this.mlt.setParameters(mltParameters); + mltParameters.setAnalyzer(analyzer); + mltParameters.setFieldNames(textFieldNames); this.indexSearcher = new IndexSearcher(indexReader); if (similarity != null) { this.indexSearcher.setSimilarity(similarity); @@ -109,10 +114,10 @@ public KNearestNeighborClassifier(IndexReader indexReader, Similarity similarity this.indexSearcher.setSimilarity(new ClassicSimilarity()); } if (minDocsFreq > 0) { - mlt.setMinDocFreq(minDocsFreq); + mltParameters.setMinDocFreq(minDocsFreq); } if (minTermFreq > 0) { - mlt.setMinTermFreq(minTermFreq); + mltParameters.setMinTermFreq(minTermFreq); } this.query = query; this.k = k; @@ -160,21 +165,45 @@ public List> getClasses(String text, int max) thr } private TopDocs knnSearch(String text) throws IOException { + Document textDocument = new Document(); + for(String fieldName: textFieldNames){ + textDocument.add(new TextField(fieldName,text, Field.Store.YES)); + } + return knnSearch(textDocument); + } + + /** + * Returns the top k results from a More Like This query based on the input document + * + * @param document the document to use for More Like This search + * @return the top results for the MLT query + * @throws IOException If there is a low-level I/O error + */ + protected TopDocs knnSearch(Document document) throws IOException { + MoreLikeThisParameters classificationMltParameters = mlt.getParameters(); BooleanQuery.Builder mltQuery = new BooleanQuery.Builder(); + Map fieldToQueryTimeBoostFactor = classificationMltParameters.getFieldToQueryTimeBoostFactor(); + ArrayList fieldNamesWithNoBoost = new ArrayList<>(); for (String fieldName : textFieldNames) { String boost = null; - mlt.setBoost(true); //terms boost actually helps in MLT queries if (fieldName.contains("^")) { String[] field2boost = fieldName.split("\\^"); fieldName = field2boost[0]; boost = field2boost[1]; } + fieldNamesWithNoBoost.add(fieldName); + classificationMltParameters.enableBoost(true); // we want always to use the boost coming from TF * IDF of the term if (boost != null) { - mlt.setBoostFactor(Float.parseFloat(boost));//if we have a field boost, we add it + if(fieldToQueryTimeBoostFactor == null){ + fieldToQueryTimeBoostFactor = new HashMap<>(); + classificationMltParameters.setFieldToQueryTimeBoostFactor(fieldToQueryTimeBoostFactor); + } + fieldToQueryTimeBoostFactor.put(fieldName,Float.parseFloat(boost)); // this is an additional multiplicative boost coming from the field boost } - mltQuery.add(new BooleanClause(mlt.like(fieldName, new StringReader(text)), BooleanClause.Occur.SHOULD)); - mlt.setBoostFactor(1);// restore neutral boost for next field } + classificationMltParameters.setFieldNames(fieldNamesWithNoBoost.toArray(textFieldNames)); + mltQuery.add(mlt.like(document), BooleanClause.Occur.MUST); + Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*")); mltQuery.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST)); if (query != null) { diff --git a/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java 
b/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java index e01090a9cac3..13bf5970c539 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java @@ -19,7 +19,9 @@ import java.io.IOException; import java.io.StringReader; +import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -29,6 +31,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; @@ -60,8 +63,8 @@ public class KNearestNeighborDocumentClassifier extends KNearestNeighborClassifi * @param query a {@link org.apache.lucene.search.Query} to eventually filter the docs used for training the classifier, or {@code null} * if all the indexed docs should be used * @param k the no. of docs to select in the MLT results to find the nearest neighbor - * @param minDocsFreq {@link org.apache.lucene.queries.mlt.MoreLikeThis#minDocFreq} parameter - * @param minTermFreq {@link org.apache.lucene.queries.mlt.MoreLikeThis#minTermFreq} parameter + * @param minDocsFreq {@link MoreLikeThisParameters#minDocFreq} parameter + * @param minTermFreq {@link MoreLikeThisParameters#minTermFreq} parameter * @param classFieldName the name of the field used as the output for the classifier * @param field2analyzer map with key a field name and the related {org.apache.lucene.analysis.Analyzer} * @param textFieldNames the name of the fields used as the inputs for the classifier, they can contain boosting indication e.g. 
title^10 @@ -120,31 +123,9 @@ public List> getClasses(Document document, int ma * @return the top results for the MLT query * @throws IOException If there is a low-level I/O error */ - private TopDocs knnSearch(Document document) throws IOException { - BooleanQuery.Builder mltQuery = new BooleanQuery.Builder(); - - for (String fieldName : textFieldNames) { - String boost = null; - if (fieldName.contains("^")) { - String[] field2boost = fieldName.split("\\^"); - fieldName = field2boost[0]; - boost = field2boost[1]; - } - String[] fieldValues = document.getValues(fieldName); - mlt.setBoost(true); // we want always to use the boost coming from TF * IDF of the term - if (boost != null) { - mlt.setBoostFactor(Float.parseFloat(boost)); // this is an additional multiplicative boost coming from the field boost - } - mlt.setAnalyzer(field2analyzer.get(fieldName)); - for (String fieldContent : fieldValues) { - mltQuery.add(new BooleanClause(mlt.like(fieldName, new StringReader(fieldContent)), BooleanClause.Occur.SHOULD)); - } - } - Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*")); - mltQuery.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST)); - if (query != null) { - mltQuery.add(query, BooleanClause.Occur.MUST); - } - return indexSearcher.search(mltQuery.build(), k); + protected TopDocs knnSearch(Document document) throws IOException { + MoreLikeThisParameters classificationMltParameters = mlt.getParameters(); + classificationMltParameters.setFieldToAnalyzer(field2analyzer); + return super.knnSearch(document); } } diff --git a/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java b/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java index 3848151c5f87..87fa56bebb97 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java @@ -194,6 +194,33 @@ protected IndexReader populateDocumentClassificationIndex(Analyzer analyzer) thr doc.add(new Field(booleanFieldName, "false", ft)); indexWriter.addDocument(doc); + // following docs mean to create a more realistic corpus + doc = new Document(); + title = "Dark nights across time : returning trend"; + text = "History tells you that the night is often associated to fear and darkness." + + "It is not new to see this kind of association, together with the fire, a concept of relief in dark periods."; + author = "author2"; + doc.add(new Field(textFieldName, text, ft)); + doc.add(new Field(titleFieldName, title, ft)); + doc.add(new Field(authorFieldName, author, ft)); + doc.add(new Field(categoryFieldName, "", ft)); + doc.add(new Field(booleanFieldName, "false", ft)); + for (int i = 0; i < 10; i++) { + indexWriter.addDocument(doc); + } + doc = new Document(); + title = "The time of new troubles"; + text = "Because sometimes success involves passing across troubles but it is completely worth." + + "Many persons faced this kind of path, across different generations. 
A number of three ( 3 ) studies from Cambridge has confirmed the trend."; + author = "author1"; + doc.add(new Field(textFieldName, text, ft)); + doc.add(new Field(titleFieldName, title, ft)); + doc.add(new Field(authorFieldName, author, ft)); + doc.add(new Field(categoryFieldName, "", ft)); + doc.add(new Field(booleanFieldName, "false", ft)); + for (int i = 0; i < 10; i++) { + indexWriter.addDocument(doc); + } doc = new Document(); text = "unlabeled doc"; @@ -229,7 +256,6 @@ protected Document getBatmanDocument() { " The three-dimensional images of the game are stunning, because it uses the Unreal Engine 3." + " The systems available are PS4, X-Box and personal computer." + " Will the simulate missile that is going to be fired, success ?\" +\n" + - " Will this video game make the history" + " Help you favourite super hero to defeat all his enemies. The Dark Knight has returned !"; String author = "Rocksteady Studios"; doc.add(new Field(textFieldName, text, ft)); diff --git a/lucene/classification/src/test/org/apache/lucene/classification/utils/ConfusionMatrixGeneratorTest.java b/lucene/classification/src/test/org/apache/lucene/classification/utils/ConfusionMatrixGeneratorTest.java index 63cce2a1c8e5..328be50a45b5 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/utils/ConfusionMatrixGeneratorTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/utils/ConfusionMatrixGeneratorTest.java @@ -155,7 +155,7 @@ public void testGetConfusionMatrixWithKNN() throws Exception { try { MockAnalyzer analyzer = new MockAnalyzer(random()); reader = getSampleIndex(analyzer); - Classifier classifier = new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, categoryFieldName, textFieldName); + Classifier classifier = new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 1, 1, categoryFieldName, textFieldName); ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader, classifier, categoryFieldName, textFieldName, -1); assertNotNull(confusionMatrix); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 676311806d75..6fe9278b7091 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -224,7 +224,7 @@ public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) thr return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field)); } - private class BM25DocScorer extends SimScorer { + public class BM25DocScorer extends SimScorer { private final BM25Stats stats; private final float weightValue; // boost * idf * (k1 + 1) private final NumericDocValues norms; @@ -258,6 +258,10 @@ public Explanation explain(int doc, Explanation freq) throws IOException { return explainScore(doc, freq, stats, norms); } + public float score(float freq, float norm) throws IOException { + return weightValue * freq / (freq + norm); + } + @Override public float computeSlopFactor(int distance) { return sloppyFreq(distance); @@ -295,6 +299,10 @@ private static class BM25Stats extends SimWeight { } + public final BM25DocScorer instantiateSimilarityScorer(SimWeight stats, NumericDocValues norms) throws IOException { + return new BM25DocScorer((BM25Stats)stats, norms); + } + private Explanation explainTFNorm(int doc, Explanation freq, 
BM25Stats stats, NumericDocValues norms) throws IOException { List subs = new ArrayList<>(); subs.add(freq); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java index ea02af3f8f43..032b18b6c755 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java @@ -17,34 +17,18 @@ package org.apache.lucene.queries.mlt; import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.ArrayList; import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; -import org.apache.lucene.index.Fields; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.queries.mlt.query.MoreLikeThisQueryBuilder; +import org.apache.lucene.queries.mlt.terms.LuceneDocumentTermsRetriever; +import org.apache.lucene.queries.mlt.terms.LocalDocumentTermsRetriever; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.similarities.ClassicSimilarity; -import org.apache.lucene.search.similarities.TFIDFSimilarity; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.PriorityQueue; /** @@ -87,7 +71,7 @@ *
  * IndexReader ir = ...
  * IndexSearcher is = ...
- *
+ * 

 * MoreLikeThis mlt = new MoreLikeThis(ir);
 * Reader target = ... // orig source of doc you want to find similarities to
 * Query query = mlt.like( target);
@@ -95,7 +79,7 @@
 * Hits hits = is.search(query);
 * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
 * //you ignore the doc if it matches your 'target' document, as it should be similar to itself
- *
+ *

*

*

 * Thus you:
@@ -107,28 +91,6 @@
 *

  • call the searcher to find the similar docs * *
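(The example above reflects the pre-patch API. Under this patch the tuning setters move onto MoreLikeThisParameters, and a query is generated from a doc id or a Lucene Document. A rough sketch of the equivalent post-patch flow follows, reusing the ir/is variables and the default "contents" field from the example above; analyzer and docId stand in for your own analyzer and a target document's id, and are not part of the patch.)

import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queries.mlt.MoreLikeThisParameters;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

MoreLikeThisParameters params = new MoreLikeThisParameters();
params.setFieldNames(new String[] {"contents"});
params.setAnalyzer(analyzer);     // only needed when the fields have no stored term vectors
params.setMinTermFreq(2);         // explicit here; these match the class defaults
params.setMinDocFreq(5);

MoreLikeThis mlt = new MoreLikeThis(ir, params);
Query query = mlt.like(docId);    // or mlt.like(someLuceneDocument)
TopDocs hits = is.search(query, 10);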
    - *

    More Advanced Usage

    - *

    - * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine - * multiple fields (e.g. body and title) for similarity. - *

    - * Depending on the size of your index and the size and makeup of your documents you - * may want to call the other set methods to control how the similarity queries are - * generated: - *

      - *
    • {@link #setMinTermFreq setMinTermFreq(...)} - *
    • {@link #setMinDocFreq setMinDocFreq(...)} - *
    • {@link #setMaxDocFreq setMaxDocFreq(...)} - *
    • {@link #setMaxDocFreqPct setMaxDocFreqPct(...)} - *
    • {@link #setMinWordLen setMinWordLen(...)} - *
    • {@link #setMaxWordLen setMaxWordLen(...)} - *
    • {@link #setMaxQueryTerms setMaxQueryTerms(...)} - *
    • {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)} - *
    • {@link #setStopWords setStopWord(...)} - *
    - *
    - *
    - *
      * Changes: Mark Harwood 29/02/04
      * Some bugfixing, some refactoring, some optimisation.
      * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
    @@ -139,433 +101,55 @@
      * 
    */ public final class MoreLikeThis { - - /** - * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. - * - * @see #getMaxNumTokensParsed - */ - public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; - - /** - * Ignore terms with less than this frequency in the source doc. - * - * @see #getMinTermFreq - * @see #setMinTermFreq - */ - public static final int DEFAULT_MIN_TERM_FREQ = 2; - - /** - * Ignore words which do not occur in at least this many docs. - * - * @see #getMinDocFreq - * @see #setMinDocFreq - */ - public static final int DEFAULT_MIN_DOC_FREQ = 5; - - /** - * Ignore words which occur in more than this many docs. - * - * @see #getMaxDocFreq - * @see #setMaxDocFreq - * @see #setMaxDocFreqPct - */ - public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; - - /** - * Boost terms in query based on score. - * - * @see #isBoost - * @see #setBoost - */ - public static final boolean DEFAULT_BOOST = false; - - /** - * Default field names. Null is used to specify that the field names should be looked - * up at runtime from the provided reader. - */ - public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; - - /** - * Ignore words less than this length or if 0 then this has no effect. - * - * @see #getMinWordLen - * @see #setMinWordLen - */ - public static final int DEFAULT_MIN_WORD_LENGTH = 0; - - /** - * Ignore words greater than this length or if 0 then this has no effect. - * - * @see #getMaxWordLen - * @see #setMaxWordLen - */ - public static final int DEFAULT_MAX_WORD_LENGTH = 0; - /** - * Default set of stopwords. - * If null means to allow stop words. - * - * @see #setStopWords - * @see #getStopWords - */ - public static final Set DEFAULT_STOP_WORDS = null; - - /** - * Current set of stop words. - */ - private Set stopWords = DEFAULT_STOP_WORDS; - - /** - * Return a Query with no more than this many terms. - * - * @see BooleanQuery#getMaxClauseCount - * @see #getMaxQueryTerms - * @see #setMaxQueryTerms - */ - public static final int DEFAULT_MAX_QUERY_TERMS = 25; - - /** - * Analyzer that will be used to parse the doc. + * Parameter and default */ - private Analyzer analyzer = null; + private MoreLikeThisParameters params; - /** - * Ignore words less frequent that this. - */ - private int minTermFreq = DEFAULT_MIN_TERM_FREQ; - - /** - * Ignore words which do not occur in at least this many docs. - */ - private int minDocFreq = DEFAULT_MIN_DOC_FREQ; + private LocalDocumentTermsRetriever localDocumentTermsRetriever; - /** - * Ignore words which occur in more than this many docs. - */ - private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; + private LuceneDocumentTermsRetriever luceneDocumentTermsRetriever; - /** - * Should we apply a boost to the Query based on the scores? - */ - private boolean boost = DEFAULT_BOOST; - - /** - * Field name we'll analyze. - */ - private String[] fieldNames = DEFAULT_FIELD_NAMES; - - /** - * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; - - /** - * Ignore words if less than this len. - */ - private int minWordLen = DEFAULT_MIN_WORD_LENGTH; - - /** - * Ignore words if greater than this len. - */ - private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; - - /** - * Don't return a query longer than this. - */ - private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; - - /** - * For idf() calculations. 
- */ - private TFIDFSimilarity similarity;// = new DefaultSimilarity(); + private MoreLikeThisQueryBuilder queryBuilder; /** * IndexReader to use */ private final IndexReader ir; - /** - * Boost factor to use when boosting the terms - */ - private float boostFactor = 1; - - /** - * Returns the boost factor used when boosting terms - * - * @return the boost factor used when boosting terms - * @see #setBoostFactor(float) - */ - public float getBoostFactor() { - return boostFactor; - } - - /** - * Sets the boost factor to use when boosting terms - * - * @see #getBoostFactor() - */ - public void setBoostFactor(float boostFactor) { - this.boostFactor = boostFactor; - } - /** * Constructor requiring an IndexReader. */ public MoreLikeThis(IndexReader ir) { - this(ir, new ClassicSimilarity()); - } - - public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { - this.ir = ir; - this.similarity = sim; - } - - - public TFIDFSimilarity getSimilarity() { - return similarity; - } - - public void setSimilarity(TFIDFSimilarity similarity) { - this.similarity = similarity; + this(ir, new MoreLikeThisParameters()); } /** - * Returns an analyzer that will be used to parse source doc with. The default analyzer - * is not set. - * - * @return the analyzer that will be used to parse source doc with. - */ - public Analyzer getAnalyzer() { - return analyzer; - } - - /** - * Sets the analyzer to use. An analyzer is not required for generating a query with the - * {@link #like(int)} method, all other 'like' methods require an analyzer. - * - * @param analyzer the analyzer to use to tokenize text. - */ - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } - - /** - * Returns the frequency below which terms will be ignored in the source doc. The default - * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. - * - * @return the frequency below which terms will be ignored in the source doc. - */ - public int getMinTermFreq() { - return minTermFreq; - } - - /** - * Sets the frequency below which terms will be ignored in the source doc. - * - * @param minTermFreq the frequency below which terms will be ignored in the source doc. - */ - public void setMinTermFreq(int minTermFreq) { - this.minTermFreq = minTermFreq; - } - - /** - * Returns the frequency at which words will be ignored which do not occur in at least this - * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. - * - * @return the frequency at which words will be ignored which do not occur in at least this - * many docs. - */ - public int getMinDocFreq() { - return minDocFreq; - } - - /** - * Sets the frequency at which words will be ignored which do not occur in at least this - * many docs. - * - * @param minDocFreq the frequency at which words will be ignored which do not occur in at - * least this many docs. - */ - public void setMinDocFreq(int minDocFreq) { - this.minDocFreq = minDocFreq; - } - - /** - * Returns the maximum frequency in which words may still appear. - * Words that appear in more than this many docs will be ignored. The default frequency is - * {@link #DEFAULT_MAX_DOC_FREQ}. - * - * @return get the maximum frequency at which words are still allowed, - * words which occur in more docs than this are ignored. - */ - public int getMaxDocFreq() { - return maxDocFreq; - } - - /** - * Set the maximum frequency in which words may still appear. Words that appear - * in more than this many docs will be ignored. 
- * - * @param maxFreq the maximum count of documents that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreq(int maxFreq) { - this.maxDocFreq = maxFreq; - } - - /** - * Set the maximum percentage in which words may still appear. Words that appear - * in more than this many percent of all docs will be ignored. - * - * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreqPct(int maxPercentage) { - this.maxDocFreq = maxPercentage * ir.numDocs() / 100; - } - - - /** - * Returns whether to boost terms in query based on "score" or not. The default is - * {@link #DEFAULT_BOOST}. - * - * @return whether to boost terms in query based on "score" or not. - * @see #setBoost - */ - public boolean isBoost() { - return boost; - } - - /** - * Sets whether to boost terms in query based on "score" or not. - * - * @param boost true to boost terms in query based on "score", false otherwise. - * @see #isBoost - */ - public void setBoost(boolean boost) { - this.boost = boost; - } - - /** - * Returns the field names that will be used when generating the 'More Like This' query. - * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. - * - * @return the field names that will be used when generating the 'More Like This' query. + * Constructor requiring an IndexReader. */ - public String[] getFieldNames() { - return fieldNames; - } + public MoreLikeThis(IndexReader ir, MoreLikeThisParameters params) { + this.params = params; + this.ir = ir; - /** - * Sets the field names that will be used when generating the 'More Like This' query. - * Set this to null for the field names to be determined at runtime from the IndexReader - * provided in the constructor. - * - * @param fieldNames the field names that will be used when generating the 'More Like This' - * query. - */ - public void setFieldNames(String[] fieldNames) { - this.fieldNames = fieldNames; - } + this.localDocumentTermsRetriever = new LocalDocumentTermsRetriever(ir, params); + this.luceneDocumentTermsRetriever = new LuceneDocumentTermsRetriever(ir, params); - /** - * Returns the minimum word length below which words will be ignored. Set this to 0 for no - * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. - * - * @return the minimum word length below which words will be ignored. - */ - public int getMinWordLen() { - return minWordLen; + this.queryBuilder = new MoreLikeThisQueryBuilder(params); } - /** - * Sets the minimum word length below which words will be ignored. - * - * @param minWordLen the minimum word length below which words will be ignored. - */ - public void setMinWordLen(int minWordLen) { - this.minWordLen = minWordLen; + public MoreLikeThisParameters getParameters() { + return this.params; } - /** - * Returns the maximum word length above which words will be ignored. Set this to 0 for no - * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. - * - * @return the maximum word length above which words will be ignored. - */ - public int getMaxWordLen() { - return maxWordLen; - } + public void setParameters(MoreLikeThisParameters params) { + this.params = params; - /** - * Sets the maximum word length above which words will be ignored. - * - * @param maxWordLen the maximum word length above which words will be ignored. - */ - public void setMaxWordLen(int maxWordLen) { - this.maxWordLen = maxWordLen; - } - - /** - * Set the set of stopwords. 
- * Any word in this set is considered "uninteresting" and ignored. - * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as - * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". - * - * @param stopWords set of stopwords, if null it means to allow stop words - * @see #getStopWords - */ - public void setStopWords(Set stopWords) { - this.stopWords = stopWords; - } + this.localDocumentTermsRetriever.setParameters(params); + this.luceneDocumentTermsRetriever.setParameters(params); - /** - * Get the current stop words being used. - * - * @see #setStopWords - */ - public Set getStopWords() { - return stopWords; + this.queryBuilder.setParameters(params); } - - /** - * Returns the maximum number of query terms that will be included in any generated query. - * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. - * - * @return the maximum number of query terms that will be included in any generated query. - */ - public int getMaxQueryTerms() { - return maxQueryTerms; - } - - /** - * Sets the maximum number of query terms that will be included in any generated query. - * - * @param maxQueryTerms the maximum number of query terms that will be included in any - * generated query. - */ - public void setMaxQueryTerms(int maxQueryTerms) { - this.maxQueryTerms = maxQueryTerms; - } - - /** - * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - * @see #DEFAULT_MAX_NUM_TOKENS_PARSED - */ - public int getMaxNumTokensParsed() { - return maxNumTokensParsed; - } - - /** - * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - public void setMaxNumTokensParsed(int i) { - maxNumTokensParsed = i; - } - - /** * Return a query that will return docs like the passed lucene document ID. * @@ -573,424 +157,58 @@ public void setMaxNumTokensParsed(int i) { * @return a query that will return docs like the passed lucene document ID. */ public Query like(int docNum) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = MultiFields.getIndexedFields(ir); - fieldNames = fields.toArray(new String[fields.size()]); - } + String[] fieldNames = params.getFieldNames(); + initMoreLikeThisQueryFields(fieldNames); + PriorityQueue scoredTerms = localDocumentTermsRetriever.retrieveTermsFromLocalDocument(docNum); - return createQuery(retrieveTerms(docNum)); - } - - /** - * - * @param filteredDocument Document with field values extracted for selected fields. - * @return More Like This query for the passed document. - */ - public Query like(Map> filteredDocument) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = MultiFields.getIndexedFields(ir); - fieldNames = fields.toArray(new String[fields.size()]); - } - return createQuery(retrieveTerms(filteredDocument)); + return queryBuilder.createQuery(scoredTerms); } - /** - * Return a query that will return docs like the passed Readers. - * This was added in order to treat multi-value fields. - * - * @return a query that will return docs like the passed Readers. - */ - public Query like(String fieldName, Reader... 
readers) throws IOException { - Map> perFieldTermFrequencies = new HashMap<>(); - for (Reader r : readers) { - addTermFrequencies(r, perFieldTermFrequencies, fieldName); - } - return createQuery(createQueue(perFieldTermFrequencies)); + public Query like(Document luceneDocument) throws IOException { + initMoreLikeThisQueryFields(params.getFieldNames()); + PriorityQueue scoredTerms = luceneDocumentTermsRetriever.retrieveTermsFromDocument(luceneDocument); + return queryBuilder.createQuery(scoredTerms); } - /** - * Create the More like query from a PriorityQueue - */ - private Query createQuery(PriorityQueue q) { - BooleanQuery.Builder query = new BooleanQuery.Builder(); - ScoreTerm scoreTerm; - float bestScore = -1; - - while ((scoreTerm = q.pop()) != null) { - Query tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word)); + public Query like(String fieldName, String... seedText) throws IOException { + initMoreLikeThisQueryFields(params.getFieldNames()); - if (boost) { - if (bestScore == -1) { - bestScore = (scoreTerm.score); - } - float myScore = (scoreTerm.score); - tq = new BoostQuery(tq, boostFactor * myScore / bestScore); - } - - try { - query.add(tq, BooleanClause.Occur.SHOULD); - } - catch (BooleanQuery.TooManyClauses ignore) { - break; - } + Document luceneDocument = new Document(); + for (String seedTextValue : seedText) { + luceneDocument.add(new TextField(fieldName, seedTextValue, Field.Store.YES)); } - return query.build(); - } - - /** - * Create a PriorityQueue from a word->tf map. - * - * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values. - */ - private PriorityQueue createQueue(Map> perFieldTermFrequencies) throws IOException { - // have collected all words in doc and their freqs - int numDocs = ir.numDocs(); - final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies)); - FreqQ queue = new FreqQ(limit); // will order words by score - for (Map.Entry> entry : perFieldTermFrequencies.entrySet()) { - Map perWordTermFrequencies = entry.getValue(); - String fieldName = entry.getKey(); - - for (Map.Entry tfEntry : perWordTermFrequencies.entrySet()) { // for every word - String word = tfEntry.getKey(); - int tf = tfEntry.getValue().x; // term freq in the source doc - if (minTermFreq > 0 && tf < minTermFreq) { - continue; // filter out words that don't occur enough times in the source - } - - int docFreq = ir.docFreq(new Term(fieldName, word)); - - if (minDocFreq > 0 && docFreq < minDocFreq) { - continue; // filter out words that don't occur in enough docs - } - if (docFreq > maxDocFreq) { - continue; // filter out words that occur in too many docs - } - - if (docFreq == 0) { - continue; // index update problem? - } - - float idf = similarity.idf(docFreq, numDocs); - float score = tf * idf; - - if (queue.size() < limit) { - // there is still space in the queue - queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf)); - } else { - ScoreTerm term = queue.top(); - if (term.score < score) { // update the smallest in the queue in place and update the queue. 
- term.update(word, fieldName, score, idf, docFreq, tf); - queue.updateTop(); - } - } - } - } - return queue; + return this.like(luceneDocument); } - private int getTermsCount(Map> perFieldTermFrequencies) { - int totalTermsCount = 0; - Collection> values = perFieldTermFrequencies.values(); - for (Map perWordTermFrequencies : values) { - totalTermsCount += perWordTermFrequencies.size(); + private void initMoreLikeThisQueryFields(String[] fieldNames) { + if (fieldNames == null) { + // gather list of valid fields from lucene + Collection fields = MultiFields.getIndexedFields(ir); + params.setFieldNames(fields.toArray(new String[fields.size()])); } - return totalTermsCount; } /** * Describe the parameters that control how the "more like this" query is formed. */ - public String describeParams() { + public String describeParameters() { StringBuilder sb = new StringBuilder(); - sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n"); - sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n"); - sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n"); + sb.append("\t").append("maxQueryTerms : ").append(params.getMaxQueryTerms()).append("\n"); + sb.append("\t").append("minWordLen : ").append(params.getMinWordLen()).append("\n"); + sb.append("\t").append("maxWordLen : ").append(params.getMaxWordLen()).append("\n"); sb.append("\t").append("fieldNames : "); String delim = ""; - for (String fieldName : fieldNames) { + for (String fieldName : params.getFieldNames()) { sb.append(delim).append(fieldName); delim = ", "; } sb.append("\n"); - sb.append("\t").append("boost : ").append(boost).append("\n"); - sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n"); - sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n"); + sb.append("\t").append("boost : ").append(params.isBoostEnabled()).append("\n"); + sb.append("\t").append("minTermFreq : ").append(params.getMinTermFreq()).append("\n"); + sb.append("\t").append("minDocFreq : ").append(params.getMinDocFreq()).append("\n"); return sb.toString(); } - /** - * Find words for a more-like-this query former. 
- * - * @param docNum the id of the lucene document from which to find terms - */ - private PriorityQueue retrieveTerms(int docNum) throws IOException { - Map> field2termFreqMap = new HashMap<>(); - for (String fieldName : fieldNames) { - final Fields vectors = ir.getTermVectors(docNum); - final Terms vector; - if (vectors != null) { - vector = vectors.terms(fieldName); - } else { - vector = null; - } - - // field does not store term vector info - if (vector == null) { - Document d = ir.document(docNum); - IndexableField[] fields = d.getFields(fieldName); - for (IndexableField field : fields) { - final String stringValue = field.stringValue(); - if (stringValue != null) { - addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName); - } - } - } else { - addTermFrequencies(field2termFreqMap, vector, fieldName); - } - } - - return createQueue(field2termFreqMap); - } - - private PriorityQueue retrieveTerms(Map> field2fieldValues) throws - IOException { - Map> field2termFreqMap = new HashMap<>(); - for (String fieldName : fieldNames) { - for (String field : field2fieldValues.keySet()) { - Collection fieldValues = field2fieldValues.get(field); - if(fieldValues == null) - continue; - for(Object fieldValue:fieldValues) { - if (fieldValue != null) { - addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap, - fieldName); - } - } - } - } - return createQueue(field2termFreqMap); - } - /** - * Adds terms and frequencies found in vector into the Map termFreqMap - * - * @param field2termFreqMap a Map of terms and their frequencies per field - * @param vector List of terms and their frequencies for a doc/field - */ - private void addTermFrequencies(Map> field2termFreqMap, Terms vector, String fieldName) throws IOException { - Map termFreqMap = field2termFreqMap.get(fieldName); - if (termFreqMap == null) { - termFreqMap = new HashMap<>(); - field2termFreqMap.put(fieldName, termFreqMap); - } - final TermsEnum termsEnum = vector.iterator(); - final CharsRefBuilder spare = new CharsRefBuilder(); - BytesRef text; - while((text = termsEnum.next()) != null) { - spare.copyUTF8Bytes(text); - final String term = spare.toString(); - if (isNoiseWord(term)) { - continue; - } - final int freq = (int) termsEnum.totalTermFreq(); - - // increment frequency - Int cnt = termFreqMap.get(term); - if (cnt == null) { - cnt = new Int(); - termFreqMap.put(term, cnt); - cnt.x = freq; - } else { - cnt.x += freq; - } - } - } - - /** - * Adds term frequencies found by tokenizing text from reader into the Map words - * - * @param r a source of text to be tokenized - * @param perFieldTermFrequencies a Map of terms and their frequencies per field - * @param fieldName Used by analyzer for any special per-field analysis - */ - private void addTermFrequencies(Reader r, Map> perFieldTermFrequencies, String fieldName) - throws IOException { - if (analyzer == null) { - throw new UnsupportedOperationException("To use MoreLikeThis without " + - "term vectors, you must provide an Analyzer"); - } - Map termFreqMap = perFieldTermFrequencies.get(fieldName); - if (termFreqMap == null) { - termFreqMap = new HashMap<>(); - perFieldTermFrequencies.put(fieldName, termFreqMap); - } - try (TokenStream ts = analyzer.tokenStream(fieldName, r)) { - int tokenCount = 0; - // for every token - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - while (ts.incrementToken()) { - String word = termAtt.toString(); - tokenCount++; - if (tokenCount > maxNumTokensParsed) { - break; - } 
- if (isNoiseWord(word)) { - continue; - } - - // increment frequency - Int cnt = termFreqMap.get(word); - if (cnt == null) { - termFreqMap.put(word, new Int()); - } else { - cnt.x++; - } - } - ts.end(); - } - } - - - /** - * determines if the passed term is likely to be of interest in "more like" comparisons - * - * @param term The word being considered - * @return true if should be ignored, false if should be used in further analysis - */ - private boolean isNoiseWord(String term) { - int len = term.length(); - if (minWordLen > 0 && len < minWordLen) { - return true; - } - if (maxWordLen > 0 && len > maxWordLen) { - return true; - } - return stopWords != null && stopWords.contains(term); - } - - - /** - * Find words for a more-like-this query former. - * The result is a priority queue of arrays with one entry for every word in the document. - * Each array has 6 elements. - * The elements are: - *
      - *
    1. The word (String) - *
    2. The top field that this word comes from (String) - *
    3. The score for this word (Float) - *
    4. The IDF value (Float) - *
    5. The frequency of this word in the index (Integer) - *
    6. The frequency of this word in the source document (Integer) - *
    - * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. - * This method is exposed so that you can identify the "interesting words" in a document. - * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. - * - * @param r the reader that has the content of the document - * @param fieldName field passed to the analyzer to use when analyzing the content - * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first - * @see #retrieveInterestingTerms - */ - private PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { - Map> field2termFreqMap = new HashMap<>(); - addTermFrequencies(r, field2termFreqMap, fieldName); - return createQueue(field2termFreqMap); - } - - /** - * @see #retrieveInterestingTerms(java.io.Reader, String) - */ - public String[] retrieveInterestingTerms(int docNum) throws IOException { - ArrayList al = new ArrayList<>(maxQueryTerms); - PriorityQueue pq = retrieveTerms(docNum); - ScoreTerm scoreTerm; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { - al.add(scoreTerm.word); // the 1st entry is the interesting word - } - String[] res = new String[al.size()]; - return al.toArray(res); - } - - /** - * Convenience routine to make it easy to return the most interesting words in a document. - * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. - * - * @param r the source document - * @param fieldName field passed to analyzer to use when analyzing the content - * @return the most interesting words in the document - * @see #retrieveTerms(java.io.Reader, String) - * @see #setMaxQueryTerms - */ - public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { - ArrayList al = new ArrayList<>(maxQueryTerms); - PriorityQueue pq = retrieveTerms(r, fieldName); - ScoreTerm scoreTerm; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { - al.add(scoreTerm.word); // the 1st entry is the interesting word - } - String[] res = new String[al.size()]; - return al.toArray(res); - } - - /** - * PriorityQueue that orders words by score. - */ - private static class FreqQ extends PriorityQueue { - FreqQ(int maxSize) { - super(maxSize); - } - - @Override - protected boolean lessThan(ScoreTerm a, ScoreTerm b) { - return a.score < b.score; - } - } - - private static class ScoreTerm { - // only really need 1st 3 entries, other ones are for troubleshooting - String word; - String topField; - float score; - float idf; - int docFreq; - int tf; - - ScoreTerm(String word, String topField, float score, float idf, int docFreq, int tf) { - this.word = word; - this.topField = topField; - this.score = score; - this.idf = idf; - this.docFreq = docFreq; - this.tf = tf; - } - - void update(String word, String topField, float score, float idf, int docFreq, int tf) { - this.word = word; - this.topField = topField; - this.score = score; - this.idf = idf; - this.docFreq = docFreq; - this.tf = tf; - } - } - - /** - * Use for frequencies and to avoid renewing Integers. 
- */ - private static class Int { - int x; - - Int() { - x = 1; - } - } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisParameters.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisParameters.java new file mode 100644 index 000000000000..d83d9858a204 --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisParameters.java @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queries.mlt; + +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.BooleanQuery; + +public final class MoreLikeThisParameters { + + /** + * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. + * + * @see #getMaxNumTokensParsed + */ + public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; + + /** + * Ignore terms with less than this frequency in the source doc. + * + * @see #getMinTermFreq + * @see #setMinTermFreq + */ + public static final int DEFAULT_MIN_TERM_FREQ = 2; + + /** + * Ignore words which do not occur in at least this many docs. + * + * @see #getMinDocFreq + * @see #setMinDocFreq + */ + public static final int DEFAULT_MIN_DOC_FREQ = 5; + + /** + * Ignore words which occur in more than this many docs. + * + * @see #getMaxDocFreq + * @see #setMaxDocFreq + * @see #setMaxDocFreqPct + */ + public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; + + /** + * Default field names. Null is used to specify that the field names should be looked + * up at runtime from the provided reader. + */ + public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; + + /** + * Ignore words less than this length or if 0 then this has no effect. + * + * @see #getMinWordLen + * @see #setMinWordLen + */ + public static final int DEFAULT_MIN_WORD_LENGTH = 0; + + /** + * Ignore words greater than this length or if 0 then this has no effect. + * + * @see #getMaxWordLen + * @see #setMaxWordLen + */ + public static final int DEFAULT_MAX_WORD_LENGTH = 0; + + /** + * Default set of stopwords. + * If null means to allow stop words. + * + * @see #setStopWords + * @see #getStopWords + */ + public static final Set DEFAULT_STOP_WORDS = null; + + /** + * Current set of stop words. + */ + private Set stopWords = DEFAULT_STOP_WORDS; + + /** + * Return a Query with no more than this many terms. + * + * @see BooleanQuery#getMaxClauseCount + * @see #getMaxQueryTerms + * @see #setMaxQueryTerms + */ + public static final int DEFAULT_MAX_QUERY_TERMS = 25; + + /** + * Analyzer that will be used to parse the doc. 
+   * This analyzer will be used for all the fields in the document.
+   */
+  private Analyzer analyzer = null;
+
+  /**
+   * Advanced: pass a specific Analyzer per field.
+   */
+  private Map<String, Analyzer> fieldToAnalyzer = null;
+
+  /**
+   * Ignore words less frequent than this.
+   */
+  private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+
+  /**
+   * Ignore words which do not occur in at least this many docs.
+   */
+  private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
+
+  /**
+   * Ignore words which occur in more than this many docs.
+   */
+  private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
+
+  /**
+   * If enabled, a queryTimeBoostFactor will be applied to each query term.
+   * This queryTimeBoostFactor is the term score.
+   * The more interesting a term is considered, the stronger its query-time boost.
+   */
+  private boolean boostEnabled = false;
+
+  /**
+   * Generic queryTimeBoostFactor that will affect all the fields.
+   * This can be overridden by specifying a boost factor per field.
+   */
+  private float queryTimeBoostFactor = 1.0f;
+
+  /**
+   * Boost factor per field; it overrides the generic queryTimeBoostFactor.
+   */
+  private Map<String, Float> fieldToQueryTimeBoostFactor = null;
+
+  /**
+   * Field names we'll analyze.
+   */
+  private String[] fieldNames = DEFAULT_FIELD_NAMES;
+
+  /**
+   * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
+   */
+  private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+
+  /**
+   * Ignore words if less than this length.
+   */
+  private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+
+  /**
+   * Ignore words if greater than this length.
+   */
+  private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+
+  /**
+   * Don't return a query longer than this.
+   */
+  private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+
+
+  public float getQueryTimeBoostFactor() {
+    return queryTimeBoostFactor;
+  }
+
+  public void setQueryTimeBoostFactor(float queryTimeBoostFactor) {
+    this.queryTimeBoostFactor = queryTimeBoostFactor;
+  }
+
+  public Map<String, Float> getFieldToQueryTimeBoostFactor() {
+    return fieldToQueryTimeBoostFactor;
+  }
+
+  public void setFieldToQueryTimeBoostFactor(Map<String, Float> fieldToQueryTimeBoostFactor) {
+    this.fieldToQueryTimeBoostFactor = fieldToQueryTimeBoostFactor;
+  }
+
+  /**
+   * Returns the analyzer that will be used to parse source docs. The default analyzer
+   * is not set.
+   *
+   * @return the analyzer that will be used to parse source docs.
+   */
+  public Analyzer getAnalyzer() {
+    return analyzer;
+  }
+
+  /**
+   * Sets the analyzer to use. An analyzer is not required when generating a query
+   * with {@link MoreLikeThis} like(int docId) and a term vector is available
+   * for the fields of interest; all other 'like' methods require an analyzer.
+   *
+   * @param analyzer the analyzer to use to tokenize text.
+   */
+  public void setAnalyzer(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
+
+  public Map<String, Analyzer> getFieldToAnalyzer() {
+    return fieldToAnalyzer;
+  }
+
+  public void setFieldToAnalyzer(Map<String, Analyzer> fieldToAnalyzer) {
+    this.fieldToAnalyzer = fieldToAnalyzer;
+  }
+
+  /**
+   * Returns the frequency below which terms will be ignored in the source doc. The default
+   * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
+   *
+   * @return the frequency below which terms will be ignored in the source doc.
+   */
+  public int getMinTermFreq() {
+    return minTermFreq;
+  }
+
+  /**
+   * Sets the frequency below which terms will be ignored in the source doc.
+   *
+   * @param minTermFreq the frequency below which terms will be ignored in the source doc.
+   */
+  public void setMinTermFreq(int minTermFreq) {
+    this.minTermFreq = minTermFreq;
+  }
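(The fieldToAnalyzer and fieldToQueryTimeBoostFactor maps above replace the old single-valued setAnalyzer/setBoostFactor knobs of MoreLikeThis. A minimal configuration sketch using the setters introduced in this class; the "title" and "body" field names and the chosen analyzers are illustrative only, not part of the patch.)

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

Map<String, Analyzer> analyzers = new HashMap<>();
analyzers.put("title", new StandardAnalyzer());
analyzers.put("body", new EnglishAnalyzer());

Map<String, Float> boosts = new HashMap<>();
boosts.put("title", 10.0f);                 // extra multiplicative boost for title terms

MoreLikeThisParameters params = new MoreLikeThisParameters();
params.setFieldNames(new String[] {"title", "body"});
params.enableBoost(true);                   // boost each generated term query by its score
params.setFieldToAnalyzer(analyzers);
params.setFieldToQueryTimeBoostFactor(boosts);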
+
+  /**
+   * Returns the frequency at which words will be ignored which do not occur in at least this
+   * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
+   *
+   * @return the frequency at which words will be ignored which do not occur in at least this
+   *         many docs.
+   */
+  public int getMinDocFreq() {
+    return minDocFreq;
+  }
+
+  /**
+   * Sets the frequency at which words will be ignored which do not occur in at least this
+   * many docs.
+   *
+   * @param minDocFreq the frequency at which words will be ignored which do not occur in at
+   *                   least this many docs.
+   */
+  public void setMinDocFreq(int minDocFreq) {
+    this.minDocFreq = minDocFreq;
+  }
+
+  /**
+   * Returns the maximum document frequency at which words may still appear.
+   * Words that appear in more than this many docs will be ignored. The default frequency is
+   * {@link #DEFAULT_MAX_DOC_FREQ}.
+   *
+   * @return the maximum frequency at which words are still allowed;
+   *         words which occur in more docs than this are ignored.
+   */
+  public int getMaxDocFreq() {
+    return maxDocFreq;
+  }
+
+  /**
+   * Set the maximum document frequency at which words may still appear. Words that appear
+   * in more than this many docs will be ignored.
+   *
+   * @param maxFreq the maximum count of documents that a term may appear
+   *                in to be still considered relevant
+   */
+  public void setMaxDocFreq(int maxFreq) {
+    this.maxDocFreq = maxFreq;
+  }
+
+  /**
+   * Set the maximum percentage of documents in which words may still appear. Words that appear
+   * in more than this percentage of all docs will be ignored.
+   *
+   * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
+   *                      in to be still considered relevant
+   */
+  public void setMaxDocFreqPct(IndexReader ir, int maxPercentage) {
+    this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
+  }
+
+
+  /**
+   * Returns whether to boost terms in the query based on "score" or not. The default is
+   * false.
+   *
+   * @return whether to boost terms in the query based on "score" or not.
+   * @see #enableBoost
+   */
+  public boolean isBoostEnabled() {
+    return boostEnabled;
+  }
+
+  /**
+   * Sets whether to boost terms in the query based on "score" or not.
+   *
+   * @param boostEnabled true to boost terms in the query based on "score", false otherwise.
+   * @see #isBoostEnabled
+   */
+  public void enableBoost(boolean boostEnabled) {
+    this.boostEnabled = boostEnabled;
+  }
+
+  /**
+   * Returns the field names that will be used when generating the 'More Like This' query.
+   * The default field names that will be used are {@link #DEFAULT_FIELD_NAMES}.
+   *
+   * @return the field names that will be used when generating the 'More Like This' query.
+   */
+  public String[] getFieldNames() {
+    return fieldNames;
+  }
+
+  /**
+   * Sets the field names that will be used when generating the 'More Like This' query.
+   * Set this to null for the field names to be determined at runtime from the IndexReader
+   * provided in the constructor.
+   *
+   * @param fieldNames the field names that will be used when generating the 'More Like This'
+   *                   query.
+   */
+  public void setFieldNames(String[] fieldNames) {
+    this.fieldNames = fieldNames;
+  }
+
+  /**
+   * Returns the minimum term length below which words will be ignored. Set this to 0 for no
+   * minimum term length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
+   *
+   * @return the minimum term length below which words will be ignored.
+ */ + public int getMinWordLen() { + return minWordLen; + } + + /** + * Sets the minimum term length below which words will be ignored. + * + * @param minWordLen the minimum term length below which words will be ignored. + */ + public void setMinWordLen(int minWordLen) { + this.minWordLen = minWordLen; + } + + /** + * Returns the maximum term length above which words will be ignored. Set this to 0 for no + * maximum term length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. + * + * @return the maximum term length above which words will be ignored. + */ + public int getMaxWordLen() { + return maxWordLen; + } + + /** + * Sets the maximum term length above which words will be ignored. + * + * @param maxWordLen the maximum term length above which words will be ignored. + */ + public void setMaxWordLen(int maxWordLen) { + this.maxWordLen = maxWordLen; + } + + /** + * Set the set of stopwords. + * Any term in this set is considered "uninteresting" and ignored. + * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as + * for the purposes of document similarity it seems reasonable to assume that "a stop term is never interesting". + * + * @param stopWords set of stopwords, if null it means to allow stop words + * @see #getStopWords + */ + public void setStopWords(Set stopWords) { + this.stopWords = stopWords; + } + + /** + * Get the current stop words being used. + * + * @see #setStopWords + */ + public Set getStopWords() { + return stopWords; + } + + + /** + * Returns the maximum number of query terms that will be included in any generated query. + * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. + * + * @return the maximum number of query terms that will be included in any generated query. + */ + public int getMaxQueryTerms() { + return maxQueryTerms; + } + + /** + * Sets the maximum number of query terms that will be included in any generated query. + * + * @param maxQueryTerms the maximum number of query terms that will be included in any + * generated query. + */ + public void setMaxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + } + + /** + * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + * @see #DEFAULT_MAX_NUM_TOKENS_PARSED + */ + public int getMaxNumTokensParsed() { + return maxNumTokensParsed; + } + + /** + * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + public void setMaxNumTokensParsed(int i) { + maxNumTokensParsed = i; + } + +} diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQuery.java similarity index 68% rename from lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java rename to lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQuery.java index 9f3310c7a2c8..ff18e6bda307 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQuery.java @@ -14,31 +14,35 @@ * See the License for the specific language governing permissions and * limitations under the License. 
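For illustration, a minimal usage sketch of the new parameters object introduced above (not part of the patch; field names and values are hypothetical):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queries.mlt.MoreLikeThisParameters;

    MoreLikeThisParameters params = new MoreLikeThisParameters();
    params.setFieldNames(new String[]{"title", "body"}); // hypothetical field names
    params.setAnalyzer(new StandardAnalyzer());          // any Analyzer works here
    params.setMinTermFreq(2);    // ignore terms appearing only once in the seed doc
    params.setMinDocFreq(5);     // ignore terms appearing in fewer than 5 docs
    params.setMaxQueryTerms(25); // cap the size of the generated query
    params.enableBoost(true);    // boost each query term by its score
    Map<String, Float> fieldBoosts = new HashMap<>();
    fieldBoosts.put("title", 2.0f); // interesting terms found in title weigh double
    params.setFieldToQueryTimeBoostFactor(fieldBoosts);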
 */
-package org.apache.lucene.queries.mlt;
+package org.apache.lucene.queries.mlt.query;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queries.mlt.MoreLikeThis;
+import org.apache.lucene.queries.mlt.MoreLikeThisParameters;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
 
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Arrays;
-import java.util.Set;
-import java.util.Objects;
-
 /**
  * A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg
  * in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
  * actual MoreLikeThis object and obtain the real Query object.
+ * TODO: fix this class
  */
 public class MoreLikeThisQuery extends Query {
-  private String likeText;
-  private String[] moreLikeFields;
+  private String seedText;
   private Analyzer analyzer;
-  private final String fieldName;
+  private String[] moreLikeFields;
   private float percentTermsToMatch = 0.3f;
   private int minTermFrequency = 1;
   private int maxQueryTerms = 5;
@@ -48,26 +52,22 @@ public class MoreLikeThisQuery extends Query {
   /**
    * @param moreLikeFields fields used for similarity measure
    */
-  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, String fieldName) {
-    this.likeText = Objects.requireNonNull(likeText);
-    this.moreLikeFields = Objects.requireNonNull(moreLikeFields);
+  public MoreLikeThisQuery(String seedText, String[] moreLikeFields, Analyzer analyzer) {
+    this.seedText = Objects.requireNonNull(seedText);
     this.analyzer = Objects.requireNonNull(analyzer);
-    this.fieldName = Objects.requireNonNull(fieldName);
+    this.moreLikeFields = Objects.requireNonNull(moreLikeFields);
   }
 
   @Override
   public Query rewrite(IndexReader reader) throws IOException {
-    MoreLikeThis mlt = new MoreLikeThis(reader);
+    MoreLikeThisParameters mltParams = initMoreLikeThisParams();
+    MoreLikeThis mlt = new MoreLikeThis(reader, mltParams);
 
-    mlt.setFieldNames(moreLikeFields);
-    mlt.setAnalyzer(analyzer);
-    mlt.setMinTermFreq(minTermFrequency);
-    if (minDocFreq >= 0) {
-      mlt.setMinDocFreq(minDocFreq);
+    Document textDocument = new Document();
+    for (String fieldName : moreLikeFields) {
+      textDocument.add(new TextField(fieldName, seedText, Field.Store.YES));
     }
-    mlt.setMaxQueryTerms(maxQueryTerms);
-    mlt.setStopWords(stopWords);
-    BooleanQuery bq = (BooleanQuery) mlt.like(fieldName, new StringReader(likeText));
+    BooleanQuery bq = (BooleanQuery) mlt.like(textDocument);
     BooleanQuery.Builder newBq = new BooleanQuery.Builder();
     for (BooleanClause clause : bq) {
       newBq.add(clause);
@@ -77,12 +77,25 @@ public Query rewrite(IndexReader reader) throws IOException {
     return newBq.build();
   }
 
+  private MoreLikeThisParameters initMoreLikeThisParams() {
+    MoreLikeThisParameters mltParams = new MoreLikeThisParameters();
+    mltParams.setFieldNames(moreLikeFields);
+    mltParams.setAnalyzer(analyzer);
+    mltParams.setMinTermFreq(minTermFrequency);
+    if (minDocFreq >= 0) {
+      mltParams.setMinDocFreq(minDocFreq);
+    }
+    mltParams.setMaxQueryTerms(maxQueryTerms);
+    mltParams.setStopWords(stopWords);
+    return mltParams;
+  }
+
   /* (non-Javadoc)
   * @see
org.apache.lucene.search.Query#toString(java.lang.String) */ @Override public String toString(String field) { - return "like:" + likeText; + return "like:" + seedText; } public float getPercentTermsToMatch() { @@ -101,12 +114,12 @@ public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } - public String getLikeText() { - return likeText; + public String getSeedText() { + return seedText; } - public void setLikeText(String likeText) { - this.likeText = likeText; + public void setSeedText(String seedText) { + this.seedText = seedText; } public int getMaxQueryTerms() { @@ -153,7 +166,7 @@ public void setMinDocFreq(int minDocFreq) { public int hashCode() { final int prime = 31; int result = classHash(); - result = prime * result + Objects.hash(analyzer, fieldName, likeText, stopWords); + result = prime * result + Objects.hash(analyzer, seedText, stopWords); result = prime * result + maxQueryTerms; result = prime * result + minDocFreq; result = prime * result + minTermFrequency; @@ -165,18 +178,17 @@ public int hashCode() { @Override public boolean equals(Object other) { return sameClassAs(other) && - equalsTo(getClass().cast(other)); + equalsTo(getClass().cast(other)); } private boolean equalsTo(MoreLikeThisQuery other) { return maxQueryTerms == other.maxQueryTerms && - minDocFreq == other.minDocFreq && - minTermFrequency == other.minTermFrequency && - Float.floatToIntBits(percentTermsToMatch) == Float.floatToIntBits(other.percentTermsToMatch) && - analyzer.equals(other.analyzer) && - fieldName.equals(other.fieldName) && - likeText.equals(other.likeText) && - Arrays.equals(moreLikeFields, other.moreLikeFields) && - Objects.equals(stopWords, other.stopWords); + minDocFreq == other.minDocFreq && + minTermFrequency == other.minTermFrequency && + Float.floatToIntBits(percentTermsToMatch) == Float.floatToIntBits(other.percentTermsToMatch) && + analyzer.equals(other.analyzer) && + seedText.equals(other.seedText) && + Arrays.equals(moreLikeFields, other.moreLikeFields) && + Objects.equals(stopWords, other.stopWords); } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilder.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilder.java new file mode 100644 index 000000000000..0361eeacaa00 --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilder.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
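A usage sketch for the relocated MoreLikeThisQuery (illustrative only; assumes an open IndexSearcher named `searcher`, and the field names are hypothetical):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queries.mlt.query.MoreLikeThisQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;

    // The query only carries the seed text and its settings; at rewrite() time
    // it builds the real More Like This query against the IndexReader.
    Query mltQuery = new MoreLikeThisQuery("apache lucene is a search library",
        new String[]{"title", "body"}, new StandardAnalyzer());
    TopDocs hits = searcher.search(mltQuery, 10); // rewrite happens inside search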
+ */
+
+package org.apache.lucene.queries.mlt.query;
+
+import java.util.Map;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.mlt.MoreLikeThisParameters;
+import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * This class has the responsibility of building the More Like This boolean query.
+ * It takes as input the interesting terms and builds the term queries based on the score of each.
+ *
+ * Query time boosting is supported.
+ * If enabled, each term will be boosted by its score.
+ */
+public class MoreLikeThisQueryBuilder {
+
+  private MoreLikeThisParameters parameters;
+
+  public MoreLikeThisQueryBuilder(MoreLikeThisParameters params) {
+    this.parameters = params;
+  }
+
+  public BooleanQuery createQuery(PriorityQueue interestingTerms) {
+    BooleanQuery.Builder moreLikeThisQuery = new BooleanQuery.Builder();
+    ScoredTerm interestingTerm;
+    float minScore = -1;
+
+    while ((interestingTerm = interestingTerms.pop()) != null) {
+      Query interestingTermQuery = new TermQuery(new Term(interestingTerm.field, interestingTerm.term));
+
+      if (parameters.isBoostEnabled()) {
+        float currentScore = (interestingTerm.score);
+        if (minScore == -1) {
+          float fieldBoost = 1.0f;
+          Map fieldToQueryTimeBoostFactor = parameters.getFieldToQueryTimeBoostFactor();
+          if (fieldToQueryTimeBoostFactor != null && fieldToQueryTimeBoostFactor.get(interestingTerm.field) != null) {
+            fieldBoost = fieldToQueryTimeBoostFactor.get(interestingTerm.field);
+          }
+          minScore = currentScore / fieldBoost; // boost was already applied when finding interesting terms
+        }
+        interestingTermQuery = new BoostQuery(interestingTermQuery, currentScore / minScore);
+      }
+
+      try {
+        moreLikeThisQuery.add(interestingTermQuery, BooleanClause.Occur.SHOULD);
+      } catch (BooleanQuery.TooManyClauses ignore) {
+        break;
+      }
+    }
+    return moreLikeThisQuery.build();
+  }
+
+  public MoreLikeThisParameters getParameters() {
+    return parameters;
+  }
+
+  public void setParameters(MoreLikeThisParameters parameters) {
+    this.parameters = parameters;
+  }
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/DocumentTermFrequencies.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/DocumentTermFrequencies.java
new file mode 100644
index 000000000000..84d9685ebbfe
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/DocumentTermFrequencies.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
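To make the builder's boost normalization concrete, a sketch (illustrative values; assumes a MoreLikeThisParameters instance `params` with boost enabled, and mirrors the queue ordering used internally by the retriever):

    import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.util.PriorityQueue;

    PriorityQueue<ScoredTerm> terms = new PriorityQueue<ScoredTerm>(2) {
      @Override
      protected boolean lessThan(ScoredTerm a, ScoredTerm b) {
        return a.score < b.score; // lowest score on top, as in the retriever's FreqQ
      }
    };
    terms.add(new ScoredTerm("lucene", "body", 0.4f, null));
    terms.add(new ScoredTerm("search", "body", 0.8f, null));

    MoreLikeThisQueryBuilder builder = new MoreLikeThisQueryBuilder(params);
    BooleanQuery query = builder.createQuery(terms);
    // The first popped (lowest) score becomes the baseline minScore, so each boost
    // is score/minScore: body:lucene ends up with ^1.0 and body:search with ^2.0.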
+ */
+
+package org.apache.lucene.queries.mlt.terms;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * This class has the responsibility of storing the term frequency counts for all the terms in each document field.
+ * It is an auxiliary data structure used by the Lucene More Like This.
+ */
+public class DocumentTermFrequencies {
+  private Map perFieldTermFrequencies;
+
+  public DocumentTermFrequencies() {
+    perFieldTermFrequencies = new HashMap<>();
+  }
+
+  public FieldTermFrequencies get(String fieldName) {
+    FieldTermFrequencies requestedTermFrequencies = perFieldTermFrequencies.get(fieldName);
+    if (requestedTermFrequencies == null) {
+      requestedTermFrequencies = new FieldTermFrequencies(fieldName);
+      perFieldTermFrequencies.put(fieldName, requestedTermFrequencies);
+    }
+    return requestedTermFrequencies;
+  }
+
+  public Collection getAll() {
+    return perFieldTermFrequencies.values();
+  }
+
+  public void increment(String fieldName, String term, int frequency) {
+    FieldTermFrequencies fieldTermFrequencies = this.get(fieldName);
+    fieldTermFrequencies.incrementFrequency(term, frequency);
+  }
+
+  public class FieldTermFrequencies {
+    private String fieldName;
+    private Map perTermFrequency;
+
+    public FieldTermFrequencies(String fieldName) {
+      this.fieldName = fieldName;
+      this.perTermFrequency = new HashMap<>();
+    }
+
+    public Int get(String term) {
+      return perTermFrequency.get(term);
+    }
+
+    private void incrementFrequency(String term, int frequency) {
+      Int freqWrapper = perTermFrequency.get(term);
+      if (freqWrapper == null) {
+        freqWrapper = new Int();
+        perTermFrequency.put(term, freqWrapper);
+        freqWrapper.frequency = frequency;
+      } else {
+        freqWrapper.frequency += frequency;
+      }
+    }
+
+    public Set<Map.Entry<String, Int>> getAll() {
+      return perTermFrequency.entrySet();
+    }
+
+    public Collection getAllFrequencies() {
+      return perTermFrequency.values();
+    }
+
+    public int size() {
+      return perTermFrequency.size();
+    }
+
+    public String getFieldName() {
+      return fieldName;
+    }
+
+    public void setFieldName(String fieldName) {
+      this.fieldName = fieldName;
+    }
+
+  }
+
+  /**
+   * Used for frequency counts, to avoid allocating new Integers.
+   */
+  static class Int {
+    int frequency;
+
+    Int() {
+      frequency = 1;
+    }
+  }
+
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/InterestingTermsRetriever.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/InterestingTermsRetriever.java
new file mode 100644
index 000000000000..dd283d655f23
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/InterestingTermsRetriever.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
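A short sketch of how the auxiliary structure above accumulates counts (illustrative; reading the `frequency` field directly assumes same-package access):

    DocumentTermFrequencies freqs = new DocumentTermFrequencies();
    freqs.increment("body", "lucene", 2);
    freqs.increment("body", "lucene", 1);  // merged into the existing Int wrapper
    freqs.increment("title", "lucene", 1); // counted independently per field
    int inBody = freqs.get("body").get("lucene").frequency;   // 3
    int inTitle = freqs.get("title").get("lucene").frequency; // 1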
+ */ + +package org.apache.lucene.queries.mlt.terms; + +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; +import org.apache.lucene.queries.mlt.terms.scorer.BM25Scorer; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; +import org.apache.lucene.queries.mlt.terms.scorer.TermScorer; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermStatistics; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.SmallFloat; + +public abstract class InterestingTermsRetriever { + + protected MoreLikeThisParameters parameters; + protected TermScorer interestingTermsScorer = new BM25Scorer(); + protected IndexReader ir; + + /** + * Extract term frequencies from the field in input. + * This is used when no term vector is stored for the specific field + * + * @param perFieldTermFrequencies a Map of terms and their frequencies per field + */ + protected void updateTermFrequenciesCount(IndexableField field, DocumentTermFrequencies perFieldTermFrequencies) + throws IOException { + String fieldName = field.name(); + String fieldStringContent = field.stringValue(); + + if (fieldStringContent != null) { + Analyzer analyzer = parameters.getAnalyzer(); + if(parameters.getFieldToAnalyzer()!=null && parameters.getFieldToAnalyzer().get(fieldName)!=null){ + analyzer = parameters.getFieldToAnalyzer().get(fieldName); + } + final int maxNumTokensParsed = parameters.getMaxNumTokensParsed(); + + if (analyzer == null) { + throw new UnsupportedOperationException("To use MoreLikeThis without " + + "term vectors, you must provide an Analyzer"); + } + + try (TokenStream analysedTextStream = analyzer.tokenStream(fieldName, fieldStringContent)) { + int tokenCount = 0; + // for every token + CharTermAttribute termAtt = analysedTextStream.addAttribute(CharTermAttribute.class); + analysedTextStream.reset(); + while (analysedTextStream.incrementToken()) { + String word = termAtt.toString(); + tokenCount++; + if (tokenCount > maxNumTokensParsed) { + break; + } + if (isNoiseWord(word)) { + continue; + } + perFieldTermFrequencies.increment(fieldName,word,1); + } + analysedTextStream.end(); + } + } + } + + /** + * Given the term frequencies per field, this method creates a PriorityQueue based on Score. + * + * @param perFieldTermFrequencies a per field map of words keyed on the term(String) with Int objects representing frequencies as the values. 
+   */
+  public PriorityQueue retrieveInterestingTerms(DocumentTermFrequencies perFieldTermFrequencies) throws IOException {
+    final int minTermFreq = parameters.getMinTermFreq();
+    final int maxQueryTerms = parameters.getMaxQueryTerms();
+    final int minDocFreq = parameters.getMinDocFreq();
+    final int maxDocFreq = parameters.getMaxDocFreq();
+    final int queueSize = Math.min(maxQueryTerms, this.getTotalTermsCount(perFieldTermFrequencies));
+
+    FreqQ interestingTerms = new FreqQ(queueSize); // will order words by score
+    for (DocumentTermFrequencies.FieldTermFrequencies fieldTermFrequencies : perFieldTermFrequencies.getAll()) {
+      String fieldName = fieldTermFrequencies.getFieldName();
+      float fieldBoost = getQueryTimeBoost(fieldName);
+      CollectionStatistics fieldStats = new IndexSearcher(ir).collectionStatistics(fieldName);
+      for (Map.Entry termFrequencyEntry : fieldTermFrequencies.getAll()) { // for every term
+        String word = termFrequencyEntry.getKey();
+        int tf = termFrequencyEntry.getValue().frequency; // term freq in the source doc
+
+        if (minTermFreq > 0 && tf < minTermFreq) {
+          continue; // filter out words that don't occur enough times in the source
+        }
+
+        final Term currentTerm = new Term(fieldName, word);
+        int docFreq = ir.docFreq(currentTerm);
+        final TermStatistics currentTermStat = new TermStatistics(currentTerm.bytes(), docFreq, ir.totalTermFreq(currentTerm));
+
+        if (minDocFreq > 0 && docFreq < minDocFreq) {
+          continue; // filter out words that don't occur in enough docs
+        }
+
+        if (docFreq > maxDocFreq) {
+          continue; // filter out words that occur in too many docs
+        }
+
+        if (docFreq == 0) {
+          continue; // index update problem?
+        }
+
+        float score = interestingTermsScorer.score(fieldName, fieldStats, currentTermStat, tf);
+        // the boost should affect which terms end up being interesting
+        score = fieldBoost * score;
+
+        Similarity.SimWeight currentSimilarityStats = interestingTermsScorer.getSimilarityStats(fieldName, fieldStats, currentTermStat, tf);
+
+        if (interestingTerms.size() < queueSize) {
+          // there is still space in the interestingTerms
+          interestingTerms.add(new ScoredTerm(word, fieldName, score, currentSimilarityStats)); // there was idf, possibly we want the stats there
+        } else {
+          ScoredTerm minScoredTerm = interestingTerms.top();
+          if (minScoredTerm.score < score) { // the current term deserves a place, as it is more interesting than the top
+            minScoredTerm.update(word, fieldName, score, currentSimilarityStats);
+            interestingTerms.updateTop();
+          }
+        }
+      }
+    }
+    return interestingTerms;
+  }
+
+  private float getQueryTimeBoost(String fieldName) {
+    float queryTimeBoost = parameters.getQueryTimeBoostFactor();
+    Map fieldToQueryTimeBoost = parameters.getFieldToQueryTimeBoostFactor();
+    if (fieldToQueryTimeBoost != null) {
+      Float currentFieldQueryTimeBoost = fieldToQueryTimeBoost.get(fieldName);
+      if (currentFieldQueryTimeBoost != null) {
+        queryTimeBoost = currentFieldQueryTimeBoost;
+      }
+    }
+    return queryTimeBoost;
+  }
+
+  protected int getTotalTermsCount(DocumentTermFrequencies perFieldTermFrequencies) {
+    int totalTermsCount = 0;
+    Collection termFrequencies = perFieldTermFrequencies.getAll();
+    for (DocumentTermFrequencies.FieldTermFrequencies singleFieldTermFrequencies : termFrequencies) {
+      totalTermsCount += singleFieldTermFrequencies.size();
+    }
+    return totalTermsCount;
+  }
+
+  protected float getNorm(DocumentTermFrequencies perFieldTermFrequencies, String fieldName, float fieldIndexBoost) {
+    DocumentTermFrequencies.FieldTermFrequencies term2frequencies = perFieldTermFrequencies.get(fieldName);
+    int fieldLength = term2frequencies.getAllFrequencies().stream().mapToInt(i -> i.frequency).sum();
+    return (float) SmallFloat.floatToByte315(fieldIndexBoost / (float) Math.sqrt(fieldLength));
+  }
+
+  /**
+   * Determines if the passed term is likely to be of interest in "more like" comparisons.
+   *
+   * @param term The term being considered
+   * @return true if it should be ignored, false if it should be used in further analysis
+   */
+  protected boolean isNoiseWord(String term) {
+    int maxWordLen = parameters.getMaxWordLen();
+    int minWordLen = parameters.getMinWordLen();
+    final Set stopWords = parameters.getStopWords();
+
+    int len = term.length();
+    if (minWordLen > 0 && len < minWordLen) {
+      return true;
+    }
+    if (maxWordLen > 0 && len > maxWordLen) {
+      return true;
+    }
+    return stopWords != null && stopWords.contains(term);
+  }
+
+  /**
+   * PriorityQueue that orders words by score.
+   */
+  protected static class FreqQ extends PriorityQueue {
+    FreqQ(int maxSize) {
+      super(maxSize);
+    }
+
+    @Override
+    protected boolean lessThan(ScoredTerm a, ScoredTerm b) {
+      return a.score < b.score;
+    }
+  }
+
+  public MoreLikeThisParameters getParameters() {
+    return parameters;
+  }
+
+  public void setParameters(MoreLikeThisParameters parameters) {
+    this.parameters = parameters;
+  }
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetriever.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetriever.java
new file mode 100644
index 000000000000..d36da8cd1f10
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetriever.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.MultiDocValues;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.queries.mlt.MoreLikeThisParameters;
+import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * This class has the responsibility of extracting interesting terms from a document already indexed.
+ * Each term will have a score assigned, indicating how important it is in the field.
+ * + * This class is currently used in : + * - MoreLikeThis Request Handler + * - Simple More Like This query parser + */ +public class LocalDocumentTermsRetriever extends InterestingTermsRetriever{ + + public LocalDocumentTermsRetriever(IndexReader ir) { + this.ir = ir; + } + + public LocalDocumentTermsRetriever(IndexReader ir, MoreLikeThisParameters params) { + this.ir = ir; + this.parameters =params; + } + + /** + * Find words for a more-like-this query former. + * + * @param docNum the id of the lucene document from which to find terms + */ + public PriorityQueue retrieveTermsFromLocalDocument(int docNum) throws IOException { + DocumentTermFrequencies perFieldTermFrequencies =new DocumentTermFrequencies(); + Map fieldToNorms = new HashMap<>(); + + for (String fieldName : parameters.getFieldNames()) { + fieldToNorms.put(fieldName,MultiDocValues.getNormValues(ir,fieldName)); + final Fields vectors = ir.getTermVectors(docNum); + final Terms vector; + + if (vectors != null) { + vector = vectors.terms(fieldName); + } else { + vector = null; + } + // field does not store term vector info + if (vector == null) { + Document localDocument = ir.document(docNum); + IndexableField[] fields = localDocument.getFields(fieldName); + for (IndexableField field : fields) { + updateTermFrequenciesCount(field,perFieldTermFrequencies); + } + } else { + updateTermFrequenciesCount(perFieldTermFrequencies, vector, fieldName); + } + } + super.interestingTermsScorer.setField2normsFromIndex(fieldToNorms); + super.interestingTermsScorer.setDocId(docNum); + + return retrieveInterestingTerms(perFieldTermFrequencies); + } + + /** + * Adds terms and frequencies found in vector into the Map termFreqMap + * + * @param perFieldTermFrequencies a Map of terms and their frequencies per field + * @param vector List of terms and their frequencies for a doc/field + */ + protected void updateTermFrequenciesCount(DocumentTermFrequencies perFieldTermFrequencies, Terms vector, String fieldName) throws IOException { + final TermsEnum termsEnum = vector.iterator(); + final CharsRefBuilder spare = new CharsRefBuilder(); + BytesRef text; + while((text = termsEnum.next()) != null) { + spare.copyUTF8Bytes(text); + final String term = spare.toString(); + if (isNoiseWord(term)) { + continue; + } + final int freq = (int) termsEnum.totalTermFreq(); + perFieldTermFrequencies.increment(fieldName,term,freq); + } + } + + public String[] retrieveInterestingTerms(int docNum) throws IOException { + final int maxQueryTerms = parameters.getMaxQueryTerms(); + + ArrayList al = new ArrayList<>(maxQueryTerms); + PriorityQueue pq = retrieveTermsFromLocalDocument(docNum); + ScoredTerm scoredTerm; + int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... 
+    // we just want to return the top words
+    while (((scoredTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoredTerm.term); // the 1st entry is the interesting term
+    }
+    String[] res = new String[al.size()];
+    return al.toArray(res);
+  }
+
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetriever.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetriever.java
new file mode 100644
index 000000000000..34c389699c6f
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetriever.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queries.mlt.MoreLikeThisParameters;
+import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * This class has the responsibility of extracting interesting terms from an input Lucene document.
+ * Each term will have a score assigned, indicating how important it is in the field.
+ *
+ * This class is currently used in:
+ * - CloudMLTQParser
+ */
+public class LuceneDocumentTermsRetriever extends InterestingTermsRetriever {
+
+  public LuceneDocumentTermsRetriever(IndexReader ir, MoreLikeThisParameters params) {
+    this.ir = ir;
+    this.parameters = params;
+  }
+
+  public LuceneDocumentTermsRetriever(IndexReader ir) {
+    this.ir = ir;
+  }
+
+  public PriorityQueue retrieveTermsFromDocument(Document luceneDocument) throws
+      IOException {
+    DocumentTermFrequencies perFieldTermFrequencies = new DocumentTermFrequencies();
+    Map fieldToNorm = new HashMap<>();
+    for (String fieldName : parameters.getFieldNames()) {
+      for (IndexableField field : luceneDocument.getFields(fieldName)) {
+        updateTermFrequenciesCount(field, perFieldTermFrequencies);
+        float indexTimeBoost = 1.0f; // for now we keep this simplification
+        float norm = getNorm(perFieldTermFrequencies, fieldName, indexTimeBoost);
+        fieldToNorm.put(fieldName, norm);
+      }
+    }
+    super.interestingTermsScorer.setField2norm(fieldToNorm);
+
+    return retrieveInterestingTerms(perFieldTermFrequencies);
+  }
+
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/BM25Scorer.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/BM25Scorer.java
new file mode 100644
index 000000000000..de66148e35b3
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/BM25Scorer.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms.scorer;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.BM25Similarity.BM25DocScorer;
+import org.apache.lucene.search.similarities.Similarity;
+
+/**
+ * Implementation using the BM25 {@link BM25Similarity} to calculate the term score.
+ */
+public class BM25Scorer implements TermScorer {
+
+  private BM25Similarity similarity = new BM25Similarity();
+
+  private Map field2normsFromIndex;
+  private Map field2norm;
+  private int docId;
+  private float textNorm;
+
+  @Override
+  public float score(String fieldName, CollectionStatistics fieldStats, TermStatistics termStats, float termFrequency) throws IOException {
+    float termScore = 0;
+    Similarity.SimWeight bm25SimilarityStats = similarity.computeWeight(1.0f, fieldStats, termStats);
+
+    BM25DocScorer similarityScorer;
+
+    boolean scoringLocalTerm = field2normsFromIndex != null;
+    boolean scoringCloudTerm = field2norm != null;
+
+    if (scoringLocalTerm) {
+      similarityScorer = similarity.instantiateSimilarityScorer(bm25SimilarityStats, field2normsFromIndex.get(fieldName));
+      termScore = similarityScorer.score(docId, termFrequency);
+    } else if (scoringCloudTerm) {
+      similarityScorer = similarity.instantiateSimilarityScorer(bm25SimilarityStats, null);
+      termScore = similarityScorer.score(termFrequency, field2norm.get(fieldName));
+    } else {
+      similarityScorer = similarity.instantiateSimilarityScorer(bm25SimilarityStats, null);
+      termScore = similarityScorer.score(termFrequency, textNorm);
+    }
+    return termScore;
+  }
+
+  public Similarity.SimWeight getSimilarityStats(String fieldName, CollectionStatistics fieldStats, TermStatistics termStats, float termFrequency) throws IOException {
+    return similarity.computeWeight(1.0f, fieldStats, termStats);
+  }
+
+  public void setField2normsFromIndex(Map field2normsFromIndex) {
+    this.field2normsFromIndex = field2normsFromIndex;
+  }
+
+  public void setField2norm(Map field2norm) {
+    this.field2norm = field2norm;
+  }
+
+  public int getDocId() {
+    return docId;
+  }
+
+  public void setDocId(int docId) {
+    this.docId = docId;
+  }
+
+  public void setTextNorm(float textNorm) {
+    this.textNorm = textNorm;
+  }
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/ScoredTerm.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/ScoredTerm.java
new file mode 100644
index 000000000000..2e1027bdd7d2
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/ScoredTerm.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms.scorer;
+
+import org.apache.lucene.search.similarities.Similarity;
+
+/**
+ * This class represents a scored term.
+ * The score represents how interesting the term is for the input document.
+ * The higher the score, the more relevant the term is for the document.
+ */
+public class ScoredTerm {
+  // only really need the first 3 entries, the other ones are for troubleshooting
+  public String term;
+  public String field;
+  public float score;
+
+  public Similarity.SimWeight stats;
+
+  public ScoredTerm(String term, String field, float score, Similarity.SimWeight stats) {
+    this.term = term;
+    this.field = field;
+    this.score = score;
+    this.stats = stats;
+  }
+
+  public void update(String term, String field, float score, Similarity.SimWeight stats) {
+    this.term = term;
+    this.field = field;
+    this.score = score;
+    this.stats = stats;
+  }
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TFIDFScorer.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TFIDFScorer.java
new file mode 100644
index 000000000000..e408a1c9814d
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TFIDFScorer.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms.scorer;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+
+/**
+ * Implementation using TF-IDF {@link TFIDFSimilarity} to calculate the term score.
+ */ +public class TFIDFScorer implements TermScorer { + TFIDFSimilarity similarity = new ClassicSimilarity(); + + @Override + public float score(String fieldName, CollectionStatistics fieldStats, TermStatistics termStats, float termFrequency) throws IOException { + float idf = similarity.idf(termStats.docFreq(), fieldStats.docCount()); + float score = termFrequency * idf; + return score; + } + + public Similarity.SimWeight getSimilarityStats(String fieldName, CollectionStatistics fieldStats, TermStatistics termStats, float termFrequency) throws IOException { + return similarity.computeWeight(1.0f, fieldStats, termStats); + } + + @Override + public void setField2normsFromIndex(Map field2normsFromIndex) { + + } + + @Override + public void setField2norm(Map field2norm) { + + } + + @Override + public void setDocId(int docId) { + + } + + @Override + public void setTextNorm(float textNorm) { + + } + +} diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TermScorer.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TermScorer.java new file mode 100644 index 000000000000..8d9c650e2d23 --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/terms/scorer/TermScorer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.queries.mlt.terms.scorer; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.TermStatistics; +import org.apache.lucene.search.similarities.Similarity; + +/** +* This class has the responsibility of calculating a score for a term. 
+ * The score measures how interesting the term is in the field, given:
+ * - term stats local to the field content
+ * - field stats global to the index
+ */
+public interface TermScorer {
+  float score(String fieldName, CollectionStatistics fieldStats, TermStatistics termStats, float termFrequency) throws IOException;
+
+  Similarity.SimWeight getSimilarityStats(String fieldName, CollectionStatistics fieldStats, TermStatistics termStats, float termFrequency) throws IOException;
+
+  void setField2normsFromIndex(Map field2normsFromIndex);
+
+  void setField2norm(Map field2norm);
+
+  void setDocId(int docId);
+
+  void setTextNorm(float textNorm);
+
+}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/MoreLikeThisTestBase.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/MoreLikeThisTestBase.java
new file mode 100644
index 000000000000..2d80b37c9d82
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/MoreLikeThisTestBase.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
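The TermScorer interface above makes the "interestingness" measure pluggable. A trivial alternative implementation, purely for illustration (the class name is hypothetical and it is not part of the patch), that favours rare terms by raw document frequency:

    import java.util.Map;
    import org.apache.lucene.index.NumericDocValues;
    import org.apache.lucene.queries.mlt.terms.scorer.TermScorer;
    import org.apache.lucene.search.CollectionStatistics;
    import org.apache.lucene.search.TermStatistics;
    import org.apache.lucene.search.similarities.Similarity;

    public class RawDocFreqScorer implements TermScorer {
      @Override
      public float score(String fieldName, CollectionStatistics fieldStats,
                         TermStatistics termStats, float termFrequency) {
        return 1.0f / (1.0f + termStats.docFreq()); // rarer terms score higher
      }

      @Override
      public Similarity.SimWeight getSimilarityStats(String fieldName, CollectionStatistics fieldStats,
                                                     TermStatistics termStats, float termFrequency) {
        return null; // this strategy needs no similarity stats
      }

      // norms and doc id are irrelevant to this strategy
      @Override public void setField2normsFromIndex(Map<String, NumericDocValues> norms) {}
      @Override public void setField2norm(Map<String, Float> norms) {}
      @Override public void setDocId(int docId) {}
      @Override public void setTextNorm(float textNorm) {}
    }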
+ */ + +package org.apache.lucene.queries.mlt; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.PriorityQueue; + +import static org.hamcrest.CoreMatchers.is; + +public class MoreLikeThisTestBase extends LuceneTestCase { + + protected static final String FIELD1 = "field1"; + protected static final String FIELD2 = "field2"; + public static final String SUFFIX_A = "a"; + public static final String SUFFIX_B = "b"; + + protected int numDocs = 100; + + protected Directory directory; + protected IndexReader reader; + protected IndexSearcher searcher; + protected Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + + @Override + public void setUp() throws Exception { + super.setUp(); + } + + @Override + public void tearDown() throws Exception { + if (reader != null) { + reader.close(); + } + if (directory != null) { + directory.close(); + } + if (analyzer != null) { + analyzer.close(); + } + super.tearDown(); + } + + protected MoreLikeThisParameters getDefaultParams() { + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setMaxQueryTerms(25); + params.setFieldNames(new String[]{FIELD1}); + return params; + } + + /** + * This method will prepare an index on {@link #numDocs} total docs. + * Each doc will have a single field with terms up to its sequential number : + *
+   * Doc 4
+   * 1a
+   * 2a
+   * 3a
+   * ...
+   * na
+   *
+   * This means that '1a' will have the max docFrequency (= numDocs)
+   * while 'na' will have the min docFrequency (= 1).
+   *
+   * @return the id of the last indexed document
+   * @throws IOException
+   */
+  protected int initIndexWithSingleFieldDocuments() throws IOException {
+    // add series of docs with terms of decreasing df
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+    for (int i = 1; i <= numDocs; i++) {
+      addDocumentWithSingleField(writer, getArithmeticSeriesWithSuffix(1, i, SUFFIX_A));
+    }
+    reader = writer.getReader();
+    int lastDocId = writer.numDocs() - 1;
+    writer.close();
+    searcher = newSearcher(reader);
+    return lastDocId;
+  }
+
+  /**
+   * This method will prepare an index on {@link #numDocs} total docs.
+   * Each doc will have multiple fields with terms up to its sequential number:
+   *
+   * Doc 4
+   * 1a
+   * 2a
+   * 3a
+   * ...
+   * na
+   *
+   * This means that '1a' will have the max docFrequency (= numDocs)
+   * while 'na' will have the min docFrequency (= 1).
+   *
+   * Each field will have a different suffix.
+   *
+   * @return the id of the last indexed document
+   * @throws IOException
+   */
+  protected int initIndex() throws IOException {
+    // add series of docs with terms of decreasing document frequency
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+    for (int i = 1; i <= numDocs; i++) {
+      addDocument(writer, getArithmeticSeriesWithSuffix(1, i, SUFFIX_A), getArithmeticSeriesWithSuffix(1, i, SUFFIX_B));
+    }
+    reader = writer.getReader();
+    int lastDocId = writer.numDocs() - 1;
+    writer.close();
+    searcher = newSearcher(reader);
+    return lastDocId;
+  }
+
+  protected int addDocumentWithSingleField(RandomIndexWriter writer, String[] fieldValues) throws IOException {
+    Document doc = new Document();
+    for (String text : fieldValues) {
+      doc.add(newTextField(FIELD1, text, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+    return writer.numDocs() - 1;
+  }
+
+  protected int addDocument(RandomIndexWriter writer, String[] field1Values, String[] field2Values) throws IOException {
+    Document doc = new Document();
+    for (String value1 : field1Values) {
+      doc.add(newTextField(FIELD1, value1, Field.Store.YES));
+    }
+    for (String value2 : field2Values) {
+      doc.add(newTextField(FIELD2, value2, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+    return writer.numDocs() - 1;
+  }
+
+  /**
+   * Generates an arithmetic sequence of terms (common difference = 1).
+   * A suffix is added to each term.
+   * Each term will appear with a frequency of 1.
+   * e.g.
+   * 1a
+   * 2a
+   * ...
+   * na
+   *
+   * @param from
+   * @param size
+   * @return
+   */
+  protected String[] getArithmeticSeriesWithSuffix(int from, int size, String suffix) {
+    String[] generatedStrings = new String[size];
+    for (int i = 0; i < generatedStrings.length; i++) {
+      generatedStrings[i] = String.valueOf(from + i) + suffix;
+    }
+    return generatedStrings;
+  }
+
+  /**
+   * Generates the multiple values for a field.
+   * Each term N will appear with a frequency of N.
+   * e.g.
+   * 1a
+   * 2a 2a
+   * 3a 3a 3a
+   * 4a 4a 4a 4a
+   * ...
+ * + * @param from + * @param size + * @return + */ + protected String[] getTriangularArithmeticSeriesWithSuffix(int from, int size, String suffix) { + String[] generatedStrings = new String[size]; + for (int i = 0; i < generatedStrings.length; i++) { + StringBuilder singleFieldValue = new StringBuilder(); + for (int j = 0; j < from + i; j++) { + singleFieldValue.append(String.valueOf(from + i) + suffix + " "); + } + generatedStrings[i] = singleFieldValue.toString().trim(); + } + return generatedStrings; + } + + protected void assertScoredTermsPriorityOrder(PriorityQueue scoredTerms, Term[] expectedTerms) { + for (int i = 0; scoredTerms.top() != null; i++) { + ScoredTerm singleTerm = scoredTerms.pop(); + Term term = new Term(FIELD1, singleTerm.term); + assertThat(term, is(expectedTerms[i])); + } + } + + protected void assertScoredTermsPriorityOrder(PriorityQueue scoredTerms, Term[] expectedField1Terms, Term[] expectedField2Terms) { + int i1 = 0; + int i2 = 0; + while (i1 < expectedField1Terms.length || i2 < expectedField2Terms.length) { + ScoredTerm singleTerm = scoredTerms.pop(); + Term term = new Term(singleTerm.field, singleTerm.term); + if (term.field().equals(FIELD1)) { + assertThat(term, is(expectedField1Terms[i1])); + i1++; + } else { + assertThat(term, is(expectedField2Terms[i2])); + i2++; + } + } + } + +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 32a610bf8a93..7a9fc041b699 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -17,7 +17,6 @@ package org.apache.lucene.queries.mlt; import java.io.IOException; -import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -32,6 +31,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.query.MoreLikeThisQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; @@ -93,22 +93,27 @@ private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException public void testBoostFactor() throws Throwable { Map originalValues = getOriginalValues(); - MoreLikeThis mlt = new MoreLikeThis(reader); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); - mlt.setFieldNames(new String[] {"text"}); - mlt.setBoost(true); + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setFieldNames(new String[]{"text"}); + params.enableBoost(true); + + MoreLikeThis mlt = new MoreLikeThis(reader, params); // this mean that every term boost factor will be multiplied by this // number float boostFactor = 5; - mlt.setBoostFactor(boostFactor); - - BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader( - "lucene release")); + Map fieldToBoost = new HashMap<>(); + fieldToBoost.put("text", boostFactor); + params.setFieldToQueryTimeBoostFactor(fieldToBoost); + + BooleanQuery query = (BooleanQuery) mlt.like("text", + "lucene release"); Collection clauses = query.clauses(); 
assertEquals("Expected " + originalValues.size() + " clauses.", @@ -130,16 +135,18 @@ public void testBoostFactor() throws Throwable { private Map getOriginalValues() throws IOException { Map originalValues = new HashMap<>(); - MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); - mlt.setFieldNames(new String[] {"text"}); - mlt.setBoost(true); - BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader( - "lucene release")); + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setFieldNames(new String[]{"text"}); + params.enableBoost(true); + MoreLikeThis mlt = new MoreLikeThis(reader, params); + + BooleanQuery query = (BooleanQuery) mlt.like("text", + "lucene release"); Collection clauses = query.clauses(); for (BooleanClause clause : clauses) { @@ -153,30 +160,33 @@ private Map getOriginalValues() throws IOException { // LUCENE-3326 public void testMultiFields() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); - mlt.setFieldNames(new String[] {"text", "foobar"}); - mlt.like("foobar", new StringReader("this is a test")); + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setFieldNames(new String[]{"text", "foobar"}); + MoreLikeThis mlt = new MoreLikeThis(reader, params); + + mlt.like("foobar", "this is a test"); analyzer.close(); } // LUCENE-5725 public void testMultiValues() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); - mlt.setFieldNames(new String[] {"text"}); + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setFieldNames(new String[]{"text"}); + MoreLikeThis mlt = new MoreLikeThis(reader, params); BooleanQuery query = (BooleanQuery) mlt.like("text", - new StringReader("lucene"), new StringReader("lucene release"), - new StringReader("apache"), new StringReader("apache lucene")); + "lucene", "lucene release", + "apache", "apache lucene"); Collection clauses = query.clauses(); assertEquals("Expected 2 clauses only!", 2, clauses.size()); for (BooleanClause clause : clauses) { @@ -189,7 +199,7 @@ public void testMultiValues() throws Exception { // just basic equals/hashcode etc public void testMoreLikeThisQuery() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); - Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, analyzer, "text"); + Query query = new MoreLikeThisQuery("this is a test",new String[]{"text"}, analyzer); QueryUtils.check(random(), query, searcher); analyzer.close(); } @@ -208,21 +218,22 @@ public void testTopN() throws Exception { writer.close(); // setup MLT query - MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new 
MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMaxQueryTerms(topN); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); - mlt.setFieldNames(new String[]{"text"}); + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMaxQueryTerms(topN); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setFieldNames(new String[]{"text"}); + MoreLikeThis mlt = new MoreLikeThis(reader, params); // perform MLT query String likeText = ""; for (String text : generateStrSeq(0, numDocs)) { likeText += text + " "; } - BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText)); + BooleanQuery query = (BooleanQuery) mlt.like("text", likeText); // check best terms are topN of highest idf Collection clauses = query.clauses(); @@ -293,14 +304,14 @@ public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception { writer.close(); // setup MLT query - MoreLikeThis mlt = new MoreLikeThis(reader); - - mlt.setAnalyzer(analyzer); - mlt.setMaxQueryTerms(maxQueryTerms); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); - mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE}); + MoreLikeThisParameters params = new MoreLikeThisParameters(); + params.setAnalyzer(analyzer); + params.setMaxQueryTerms(maxQueryTerms); + params.setMinDocFreq(1); + params.setMinTermFreq(1); + params.setMinWordLen(1); + params.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE}); + MoreLikeThis mlt = new MoreLikeThis(reader, params); // perform MLT query BooleanQuery query = (BooleanQuery) mlt.like(inputDocId); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilderTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilderTest.java new file mode 100644 index 000000000000..c052026f75b4 --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryBuilderTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.queries.mlt.query; + +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; +import org.apache.lucene.queries.mlt.MoreLikeThisTestBase; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.PriorityQueue; +import org.junit.Test; + +import static org.hamcrest.core.Is.is; + +public class MoreLikeThisQueryBuilderTest extends MoreLikeThisTestBase { + private MoreLikeThisQueryBuilder builderToTest; + + @Override + public void setUp() throws Exception { + super.setUp(); + builderToTest = new MoreLikeThisQueryBuilder(getDefaultParams()); + } + + @Test + public void boostOff_shouldBuildQueryWithNoBoost() throws Exception { + MoreLikeThisParameters defaultParams = getDefaultParams(); + builderToTest = new MoreLikeThisQueryBuilder(defaultParams); + PriorityQueue interestingTerms = this.buildInterestingTermsQueue(); + + Query query = builderToTest.createQuery(interestingTerms); + + assertThat(query.toString(), is("field2:term5 field1:term3 field1:term2 field1:term1 field2:term4")); + } + + @Test + public void boostOn_shouldBuildQueryWithDefaultBoost() throws Exception { + MoreLikeThisParameters params = getDefaultParams(); + params.enableBoost(true); + builderToTest = new MoreLikeThisQueryBuilder(params); + PriorityQueue interestingTerms = this.buildInterestingTermsQueue(); + + Query query = builderToTest.createQuery(interestingTerms); + + assertThat(query.toString(), is("(field2:term5)^1.0 (field1:term3)^3.0 (field1:term2)^4.0 (field1:term1)^5.0 (field2:term4)^7.0")); + } + + private PriorityQueue buildInterestingTermsQueue() { + ScoredTerm term1 = new ScoredTerm("term1", "field1", 0.5f, null); + ScoredTerm term2 = new ScoredTerm("term2", "field1", 0.4f, null); + ScoredTerm term3 = new ScoredTerm("term3", "field1", 0.3f, null); + + ScoredTerm term4 = new ScoredTerm("term4", "field2", 0.7f, null); + ScoredTerm term5 = new ScoredTerm("term5", "field2", 0.1f, null); + + FreqQ queue = new FreqQ(5); + queue.add(term1); + queue.add(term2); + queue.add(term3); + queue.add(term4); + queue.add(term5); + + return queue; + } + + /** + * PriorityQueue that orders words by score. + */ + protected static class FreqQ extends PriorityQueue { + FreqQ(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(ScoredTerm a, ScoredTerm b) { + return a.score < b.score; + } + } +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryTest.java new file mode 100644 index 000000000000..388567866bef --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/query/MoreLikeThisQueryTest.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.queries.mlt.query; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.queries.mlt.MoreLikeThisTestBase; +import org.apache.lucene.search.Query; +import org.junit.Test; + +import static org.hamcrest.core.Is.is; + +public class MoreLikeThisQueryTest extends MoreLikeThisTestBase { + private MoreLikeThisQuery queryToTest; + + protected Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + + @Override + public void setUp() throws Exception { + super.setUp(); + initIndex(); + } + + @Test + public void seedFieldNameConfigured_shouldRewriteUsingSeedFieldName() throws IOException { + String seedText = "1a 2a 4a 3b"; + queryToTest = new MoreLikeThisQuery(seedText,new String[]{FIELD1}, analyzer); + + Query actualMltQuery = queryToTest.rewrite(reader); + + assertThat(actualMltQuery.toString(), is("field1:1a field1:2a field1:4a")); + } + + @Test + public void seedFieldNamesConfigured_shouldRewriteUsingAllFieldNames() throws IOException { + String seedText = "1a 2a 4a 3b"; + queryToTest = new MoreLikeThisQuery(seedText,new String[]{FIELD1,FIELD2}, analyzer); + + Query actualMltQuery = queryToTest.rewrite(reader); + + assertThat(actualMltQuery.toString(), is("(field1:1a field1:2a field2:3b field1:4a)~1")); + } +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/DocumentTermFrequenciesTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/DocumentTermFrequenciesTest.java new file mode 100644 index 000000000000..f70eaf31b345 --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/DocumentTermFrequenciesTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.queries.mlt.terms; + +import org.junit.Test; + +import static org.hamcrest.core.Is.is; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThat; + +public class DocumentTermFrequenciesTest { + + private DocumentTermFrequencies toTest = new DocumentTermFrequencies(); + + @Test + public void getUnknownField_shouldCreateEmptyFieldTermFrequencies() { + toTest = new DocumentTermFrequencies(); + assertNotNull(toTest.get("First")); + } + + @Test + public void incrementFieldFirstTime_shouldUpdateFieldTermFrequencies() { + toTest = new DocumentTermFrequencies(); + + String unknownField = "First"; + String unknownTerm = "term1"; + toTest.increment(unknownField, unknownTerm, 5); + + assertNotNull(toTest.get(unknownField)); + assertThat(toTest.get(unknownField).get(unknownTerm).frequency, is(5)); + } + + @Test + public void incrementFieldSecondTime_shouldUpdateFieldTermFrequencies() { + toTest = new DocumentTermFrequencies(); + String unknownField = "First"; + String unknownTerm = "term1"; + toTest.increment(unknownField, unknownTerm, 5); + + assertNotNull(toTest.get(unknownField)); + assertThat(toTest.get(unknownField).get(unknownTerm).frequency, is(5)); + + toTest.increment(unknownField, unknownTerm, 3); + + assertThat(toTest.get(unknownField).get(unknownTerm).frequency, is(8)); + } + +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/InterestingTermsRetrieverTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/InterestingTermsRetrieverTest.java new file mode 100644 index 000000000000..20ca7fa9b10d --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/InterestingTermsRetrieverTest.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.queries.mlt.terms; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; +import org.apache.lucene.queries.mlt.MoreLikeThisTestBase; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; +import org.apache.lucene.util.PriorityQueue; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; + + +public class InterestingTermsRetrieverTest extends MoreLikeThisTestBase { + + private InterestingTermsRetriever toTest; + + @Override + public void setUp() throws Exception { + super.setUp(); + } + + @Test + public void singleFieldDoc_KQueryTerms_shouldReturnTopKTerms() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + //Test preparation + initIndexWithSingleFieldDocuments(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(numDocs); + + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[topK]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - topK + 1, topK, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_KQueryTerms_shouldReturnTopKTerms() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + initIndex(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(numDocs); + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (topK / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - perFieldTermsSize + 1, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - perFieldTermsSize + 1, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_minTermFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minTermFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinTermFreq(minTermFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + int termsCountPerField = 10; + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = termsCountPerField - minTermFreq + 1; + DocumentTermFrequencies 
sampleTermFrequencies = initTermFrequencies(termsCountPerField); + + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_minTermFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minTermFreq = 6; + MoreLikeThisParameters params = getDefaultParams(); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + params.setMinTermFreq(minTermFreq); + //Test preparation + initIndex(); + int termsCountPerField = 10; + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + int perFieldExpectedScoredTermsSize = termsCountPerField - minTermFreq + 1; + int expectedScoredTermsSize = params.getFieldNames().length * perFieldExpectedScoredTermsSize; + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(termsCountPerField); + + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedField1Terms = new Term[perFieldExpectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, perFieldExpectedScoredTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldExpectedScoredTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, perFieldExpectedScoredTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_minDocFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minDocFreq = 91; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinDocFreq(minDocFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 10; + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(expectedScoredTermsSize + 1); + + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_minDocFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minDocFreq = 96; + MoreLikeThisParameters params = getDefaultParams(); + 
params.setMinDocFreq(minDocFreq); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + initIndex(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 10; + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(expectedScoredTermsSize); + + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (expectedScoredTermsSize / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_maxDocFreq_shouldIgnoreTermsTooFrequent() throws Exception { + //More Like This parameters definition + int maxDocFreq = 10; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxDocFreq(maxDocFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = maxDocFreq; + + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(numDocs); + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - expectedScoredTermsSize + 1, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_maxDocFreq_shouldIgnoreTermsTooFrequent() throws Exception { + //More Like This parameters definition + int maxDocFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxDocFreq(maxDocFreq); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + initIndex(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = params.getFieldNames().length * maxDocFreq; + + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(numDocs); + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (expectedScoredTermsSize / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(96, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : 
getArithmeticSeriesWithSuffix(96, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void multiFieldDoc_field1Boosted_shouldConsiderMoreTermsFromField1() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + Map testBoostFactor = new HashMap<>(); + testBoostFactor.put(FIELD2, 2.0f); + params.setFieldToQueryTimeBoostFactor(testBoostFactor); + //Test preparation + initIndex(); + toTest = new MockInterestingTermsRetriever(reader); + toTest.setParameters(params); + + DocumentTermFrequencies sampleTermFrequencies = initTermFrequencies(numDocs); + PriorityQueue scoredTerms = toTest.retrieveInterestingTerms(sampleTermFrequencies); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + int countField1 = 0; + int countField2 = 0; + for (ScoredTerm term : scoredTerms) { + if (term.field.equals(FIELD1)) { + countField1++; + } else if (term.field.equals(FIELD2)) { + countField2++; + } + } + + assertThat(countField1 < countField2, is(true)); + } + + /** + * Init a {@link DocumentTermFrequencies} structure with 2 fields. + * Each field will have terms of linearly increasing frequency. + *
+   * 1a - tf=1
+   * 2a - tf=2
+   * ...
+   * na - tf=n
+   *
+   * @param numTermsPerField the number of distinct terms to generate for each field
+   * @return the populated {@link DocumentTermFrequencies}
+   */
+  private DocumentTermFrequencies initTermFrequencies(int numTermsPerField) {
+    DocumentTermFrequencies frequencies = new DocumentTermFrequencies();
+
+    for (int i = 1; i <= numTermsPerField; i++) {
+      frequencies.increment(FIELD1, i + "a", i);
+    }
+    for (int i = 1; i <= numTermsPerField; i++) {
+      frequencies.increment(FIELD2, i + "b", i);
+    }
+
+    return frequencies;
+  }
+
+  private class MockInterestingTermsRetriever extends InterestingTermsRetriever {
+    public MockInterestingTermsRetriever(IndexReader ir) {
+      this.ir = ir;
+    }
+  }
+}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetrieverTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetrieverTest.java
new file mode 100644
index 000000000000..ed38dddc07b7
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LocalDocumentTermsRetrieverTest.java
@@ -0,0 +1,335 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.lucene.queries.mlt.terms; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; +import org.apache.lucene.queries.mlt.MoreLikeThisTestBase; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; +import org.apache.lucene.util.PriorityQueue; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; + +public class LocalDocumentTermsRetrieverTest extends MoreLikeThisTestBase { + private LocalDocumentTermsRetriever toTest; + + @Override + public void setUp() throws Exception { + super.setUp(); + } + + @Test + public void singleFieldDoc_KQueryTerms_shouldReturnTopKTerms() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + //Test preparation + int lastDocId = initIndexWithSingleFieldDocuments(); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[topK]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - topK + 1, topK, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_KQueryTerms_shouldReturnTopKTerms() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + int lastDocId = initIndex(); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (topK / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - perFieldTermsSize + 1, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - perFieldTermsSize + 1, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_minTermFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minTermFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinTermFreq(minTermFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + int termsCountPerField = 10; + int testDocId = indexDocumentWithLinearTermFrequencies(termsCountPerField); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 
termsCountPerField - minTermFreq + 1; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(testDocId); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_minTermFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minTermFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + params.setMinTermFreq(minTermFreq); + //Test preparation + initIndex(); + int termsCountPerField = 10; + int testDocId = indexDocumentWithLinearTermFrequencies(termsCountPerField); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + int perFieldExpectedScoredTermsSize = termsCountPerField - minTermFreq + 1; + int expectedScoredTermsSize = params.getFieldNames().length * perFieldExpectedScoredTermsSize; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(testDocId); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedField1Terms = new Term[perFieldExpectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, perFieldExpectedScoredTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldExpectedScoredTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, perFieldExpectedScoredTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_minDocFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minDocFreq = 91; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinDocFreq(minDocFreq); + //Test preparation + int lastDocId = initIndexWithSingleFieldDocuments(); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 10; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_minDocFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minDocFreq = 96; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinDocFreq(minDocFreq); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + int lastDocId = initIndex(); + toTest = new 
LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 10; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (expectedScoredTermsSize / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_maxDocFreq_shouldIgnoreTermsTooFrequent() throws Exception { + //More Like This parameters definition + int maxDocFreq = 10; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxDocFreq(maxDocFreq); + //Test preparation + int lastDocId = initIndexWithSingleFieldDocuments(); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = maxDocFreq; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - expectedScoredTermsSize + 1, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_maxDocFreq_shouldIgnoreTermsTooFrequent() throws Exception { + //More Like This parameters definition + int maxDocFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxDocFreq(maxDocFreq); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + int lastDocId = initIndex(); + toTest = new LocalDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = params.getFieldNames().length * maxDocFreq; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (expectedScoredTermsSize / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(96, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(96, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void multiFieldDoc_onlyField1Configured_shouldConsiderOnlyTermsFromField1() throws Exception { + //More Like This parameters definition + int 
topK = 26;
+    MoreLikeThisParameters params = getDefaultParams();
+    params.setMaxQueryTerms(topK);
+    params.setFieldNames(new String[]{FIELD1});
+    Map testBoostFactor = new HashMap<>();
+    testBoostFactor.put(FIELD2, 2.0f);
+    params.setFieldToQueryTimeBoostFactor(testBoostFactor);
+    //Test preparation
+    int lastDocId = initIndex();
+    toTest = new LocalDocumentTermsRetriever(reader);
+    toTest.setParameters(params);
+
+    PriorityQueue scoredTerms = toTest.retrieveTermsFromLocalDocument(lastDocId);
+
+    assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size());
+    int countField2 = 0;
+    for (ScoredTerm term : scoredTerms) {
+      if (term.field.equals(FIELD2)) {
+        countField2++;
+      }
+    }
+
+    assertThat(countField2, is(0));
+  }
+
+  /**
+   * Indexes a single seed document with two multi-valued fields.
+   * Each field contains terms with linearly increasing term frequency:
+   * 1a - tf=1
+   * 2a - tf=2
+   * 3a - tf=3
+   * ...
+   *
+   * @param numTermsPerField the number of distinct terms to generate per field
+   * @return the docId of the indexed seed document
+   * @throws IOException if the index cannot be written or reopened
+   */
+  private int indexDocumentWithLinearTermFrequencies(int numTermsPerField) throws IOException {
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+
+    Document doc = new Document();
+    for (String value1 : getTriangularArithmeticSeriesWithSuffix(1, numTermsPerField, "a")) {
+      doc.add(newTextField(FIELD1, value1, Field.Store.YES));
+    }
+    for (String value2 : getTriangularArithmeticSeriesWithSuffix(1, numTermsPerField, "b")) {
+      doc.add(newTextField(FIELD2, value2, Field.Store.YES));
+    }
+
+    writer.addDocument(doc);
+    int docId = writer.numDocs() - 1;
+    reader.close();
+    reader = writer.getReader();
+    writer.close();
+    searcher = newSearcher(reader);
+    return docId;
+  }
+}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetrieverTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetrieverTest.java
new file mode 100644
index 000000000000..ef28cbfa1429
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/LuceneDocumentTermsRetrieverTest.java
@@ -0,0 +1,354 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.lucene.queries.mlt.terms; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; +import org.apache.lucene.queries.mlt.MoreLikeThisTestBase; +import org.apache.lucene.queries.mlt.terms.scorer.ScoredTerm; +import org.apache.lucene.util.PriorityQueue; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; + +public class LuceneDocumentTermsRetrieverTest extends MoreLikeThisTestBase { + private LuceneDocumentTermsRetriever toTest; + + @Override + public void setUp() throws Exception { + super.setUp(); + } + + @Test + public void singleFieldDoc_KQueryTerms_shouldReturnTopKTerms() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + //Test preparation + initIndexWithSingleFieldDocuments(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[topK]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - topK + 1, topK, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_KQueryTerms_shouldReturnTopKTerms() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + initIndex(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (topK / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - perFieldTermsSize + 1, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - perFieldTermsSize + 1, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_minTermFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minTermFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinTermFreq(minTermFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + int termsCountPerField = 10; + Document testDocument = getDocumentWithLinearTermFrequencies(termsCountPerField); + toTest = new 
LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = termsCountPerField - minTermFreq; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_minTermFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minTermFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + params.setMinTermFreq(minTermFreq); + //Test preparation + initIndex(); + int termsCountPerField = 10; + Document testDocument = getDocumentWithLinearTermFrequencies(termsCountPerField); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + int perFieldExpectedScoredTermsSize = termsCountPerField - minTermFreq; + int expectedScoredTermsSize = params.getFieldNames().length * perFieldExpectedScoredTermsSize; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedField1Terms = new Term[perFieldExpectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, perFieldExpectedScoredTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldExpectedScoredTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(minTermFreq, perFieldExpectedScoredTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_minDocFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minDocFreq = 91; + MoreLikeThisParameters params = getDefaultParams(); + params.setMinDocFreq(minDocFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 10; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_minDocFreq_shouldIgnoreTermsLessFrequent() throws Exception { + //More Like This parameters definition + int minDocFreq = 96; + MoreLikeThisParameters params = getDefaultParams(); + 
params.setMinDocFreq(minDocFreq); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + initIndex(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = 10; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (expectedScoredTermsSize / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(1, perFieldTermsSize, "b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void singleFieldDoc_maxDocFreq_shouldIgnoreTermsTooFrequent() throws Exception { + //More Like This parameters definition + int maxDocFreq = 10; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxDocFreq(maxDocFreq); + //Test preparation + initIndexWithSingleFieldDocuments(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = maxDocFreq; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + Term[] expectedTerms = new Term[expectedScoredTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(numDocs - expectedScoredTermsSize + 1, expectedScoredTermsSize, "a")) { + expectedTerms[idx++] = new Term(FIELD1, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedTerms); + } + + @Test + public void multiFieldDoc_maxDocFreq_shouldIgnoreTermsTooFrequent() throws Exception { + //More Like This parameters definition + int maxDocFreq = 5; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxDocFreq(maxDocFreq); + params.setFieldNames(new String[]{FIELD1, FIELD2}); + //Test preparation + initIndex(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + int expectedScoredTermsSize = params.getFieldNames().length * maxDocFreq; + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + expectedScoredTermsSize + " terms only!", expectedScoredTermsSize, scoredTerms.size()); + //Expected terms preparation + int perFieldTermsSize = (expectedScoredTermsSize / params.getFieldNames().length); + Term[] expectedField1Terms = new Term[perFieldTermsSize]; + int idx = 0; + for (String text : getArithmeticSeriesWithSuffix(96, perFieldTermsSize, "a")) { + expectedField1Terms[idx++] = new Term(FIELD1, text); + } + Term[] expectedField2Terms = new Term[perFieldTermsSize]; + idx = 0; + for (String text : getArithmeticSeriesWithSuffix(96, perFieldTermsSize, 
"b")) { + expectedField2Terms[idx++] = new Term(FIELD2, text); + } + //Expected terms assertions + assertScoredTermsPriorityOrder(scoredTerms, expectedField1Terms, expectedField2Terms); + } + + @Test + public void multiFieldDoc_onlyField1Configured_shouldConsiderOnlyTermsFromField1() throws Exception { + //More Like This parameters definition + int topK = 26; + MoreLikeThisParameters params = getDefaultParams(); + params.setMaxQueryTerms(topK); + params.setFieldNames(new String[]{FIELD1}); + Map testBoostFactor = new HashMap<>(); + testBoostFactor.put(FIELD2, 2.0f); + params.setFieldToQueryTimeBoostFactor(testBoostFactor); + //Test preparation + initIndexWithSingleFieldDocuments(); + Document testDocument = getDocumentWithConstantUnitaryTermFrequencies(numDocs); + toTest = new LuceneDocumentTermsRetriever(reader); + toTest.setParameters(params); + + PriorityQueue scoredTerms = toTest.retrieveTermsFromDocument(testDocument); + + assertEquals("Expected " + topK + " terms only!", topK, scoredTerms.size()); + int countField2 = 0; + for (ScoredTerm term : scoredTerms) { + if (term.field.equals(FIELD2)) { + countField2++; + } + } + + assertThat(countField2, is(0)); + } + + /** + * Scope of this init is to index a single document where terms have a term freq >1 + * + * @throws IOException + */ + private Document getDocumentWithLinearTermFrequencies(int numTermsPerField) throws IOException { + Document document = new Document(); + + for (String value1 : getTriangularArithmeticSeriesWithSuffix(0, numTermsPerField, "a")) { + Field field1 = newTextField(FIELD1, value1, Field.Store.YES); + document.add(field1); + } + + for (String value2 : getTriangularArithmeticSeriesWithSuffix(0, numTermsPerField, "b")) { + Field field2 = newTextField(FIELD2, value2, Field.Store.YES); + document.add(field2); + } + + return document; + } + + /** + * Scope of this init is to index a single document where terms have a term freq >1 + * + * @throws IOException + */ + private Document getDocumentWithConstantUnitaryTermFrequencies(int valuesCountPerField) throws IOException { + Document document = new Document(); + + String[] valuesA = getArithmeticSeriesWithSuffix(1, valuesCountPerField, "a"); + for (String value1 : valuesA) { + Field field1 = newTextField(FIELD1, value1, Field.Store.YES); + document.add(field1); + } + + String[] valuesB = getArithmeticSeriesWithSuffix(1, valuesCountPerField, "b"); + for (String value2 : valuesB) { + Field field2 = newTextField(FIELD2, value2, Field.Store.YES); + document.add(field2); + } + + return document; + } + +} + + diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/bm25/BM25ScorerTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/bm25/BM25ScorerTest.java new file mode 100644 index 000000000000..4d3e3b04c14e --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/bm25/BM25ScorerTest.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms.scorer.bm25;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.queries.mlt.terms.scorer.BM25Scorer;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.util.BytesRef;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class BM25ScorerTest {
+  public static final String SAMPLE_FIELD = "field1";
+  public static final String SAMPLE_TERM = "term1";
+
+  private BM25Scorer scorerToTest = new BM25Scorer();
+
+  @Test
+  public void indexedNorms_shouldCalculateBM25SimilarityScore() throws IOException {
+    Map field2normsFromIndex = new HashMap<>();
+    NumericDocValues field1Norms = new TestNumericDocValues(10);
+    field2normsFromIndex.put(SAMPLE_FIELD, field1Norms);
+    scorerToTest.setField2normsFromIndex(field2normsFromIndex);
+
+    CollectionStatistics field1Stats = new CollectionStatistics(SAMPLE_FIELD, 100, 90, 1000, 1000);
+
+    BytesRef term1ByteRef = new BytesRef(SAMPLE_TERM);
+    TermStatistics term1Stat = new TermStatistics(term1ByteRef, 20, 60);
+
+    float score = scorerToTest.score(SAMPLE_FIELD, field1Stats, term1Stat, 20);
+    Assert.assertEquals(0.19, score, 0.01);
+  }
+
+  @Test
+  public void normsFromDocument_shouldCalculateBM25SimilarityScore() throws IOException {
+    Map fieldToNorm = new HashMap<>();
+    fieldToNorm.put(SAMPLE_FIELD, 332f);
+    scorerToTest.setField2norm(fieldToNorm);
+
+    CollectionStatistics field1Stats = new CollectionStatistics(SAMPLE_FIELD, 100, 90, 1000, 1000);
+
+    BytesRef term1ByteRef = new BytesRef(SAMPLE_TERM);
+    TermStatistics term1Stat = new TermStatistics(term1ByteRef, 20, 60);
+
+    float score = scorerToTest.score(SAMPLE_FIELD, field1Stats, term1Stat, 20);
+    Assert.assertEquals(0.19, score, 0.01);
+  }
+
+  @Test
+  public void normFromFreeText_shouldCalculateBM25SimilarityScore() throws IOException {
+    float norm = 332f;
+    scorerToTest.setTextNorm(norm);
+
+    CollectionStatistics field1Stats = new CollectionStatistics(SAMPLE_FIELD, 100, 90, 1000, 1000);
+
+    BytesRef term1ByteRef = new BytesRef(SAMPLE_TERM);
+    TermStatistics term1Stat = new TermStatistics(term1ByteRef, 20, 60);
+
+    float score = scorerToTest.score(SAMPLE_FIELD, field1Stats, term1Stat, 20);
+    Assert.assertEquals(0.19, score, 0.01);
+  }
+
+  // TODO: test the case where the norm is directly a constant
+
+  private class TestNumericDocValues extends NumericDocValues {
+    long normIndex = 100; // the precomputed value cached for this position is 332
+    final int maxDoc;
+    int doc = -1;
+
+    TestNumericDocValues(int maxDoc) {
+      this.maxDoc = maxDoc;
+    }
+
+    @Override
+    public int docID() {
+      return doc;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      return advance(doc + 1);
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      if (target >= maxDoc) {
+        return doc = NO_MORE_DOCS;
+      }
+      return doc = target;
+    }
+
+    @Override
+    public boolean advanceExact(int target) {
+      doc = target;
+      return true;
+    }
+
+    @Override
+    public long cost() {
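+      // cost() is only an estimate of how many documents this iterator can visit;
+      // maxDoc is a safe upper bound for this test stub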
+      return maxDoc;
+    }
+
+    @Override
+    public long longValue() throws IOException {
+      return normIndex;
+    }
+  }
+}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/tfidf/TFIDFScorerTest.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/tfidf/TFIDFScorerTest.java
new file mode 100644
index 000000000000..a759be115895
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/terms/scorer/tfidf/TFIDFScorerTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queries.mlt.terms.scorer.tfidf;
+
+import java.io.IOException;
+
+import org.apache.lucene.queries.mlt.terms.scorer.TFIDFScorer;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.util.BytesRef;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TFIDFScorerTest {
+  public static final String SAMPLE_FIELD = "field1";
+  public static final String SAMPLE_TERM = "term1";
+
+  private TFIDFScorer scorerToTest = new TFIDFScorer();
+
+  @Test
+  public void sampleStats_shouldCalculateTFIDFSimilarityScore() throws IOException {
+    CollectionStatistics field1Stats = new CollectionStatistics(SAMPLE_FIELD, 100, 90, 1000, 1000);
+
+    BytesRef term1ByteRef = new BytesRef(SAMPLE_TERM);
+    TermStatistics term1Stat = new TermStatistics(term1ByteRef, 20, 60);
+
+    float score = scorerToTest.score(SAMPLE_FIELD, field1Stats, term1Stat, 20);
+    Assert.assertEquals(49.30, score, 0.03);
+  }
+
+}
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/LikeThisQueryBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/LikeThisQueryBuilder.java
index 2812043c50ae..df6d4973cbfc 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/LikeThisQueryBuilder.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/LikeThisQueryBuilder.java
@@ -23,7 +23,7 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.queries.mlt.MoreLikeThisQuery;
+import org.apache.lucene.queries.mlt.query.MoreLikeThisQuery;
 import org.apache.lucene.queryparser.xml.QueryBuilder;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.Query;
@@ -86,7 +86,7 @@ public Query getQuery(Element e) throws ParserException {
     }
 
-    MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.getText(e), fields, analyzer, fields[0]);
+
MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.getText(e), fields, analyzer); mlt.setMaxQueryTerms(DOMUtils.getAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS)); mlt.setMinTermFrequency(DOMUtils.getAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY)); mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100); diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java index 9c86350d82a4..d5175fb613d9 100644 --- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java @@ -28,11 +28,14 @@ import java.util.Map; import java.util.regex.Pattern; +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.ExitableDirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queries.mlt.MoreLikeThis; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; @@ -136,12 +139,12 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw SolrIndexSearcher searcher = req.getSearcher(); - MoreLikeThisHelper mlt = new MoreLikeThisHelper(params, searcher); + MoreLikeThisHelper mltHelper = new MoreLikeThisHelper(params, searcher); // Hold on to the interesting terms if relevant TermStyle termStyle = TermStyle.get(params.get(MoreLikeThisParams.INTERESTING_TERMS)); List<InterestingTerm> interesting = (termStyle == TermStyle.NONE) - ? null : new ArrayList<>(mlt.mlt.getMaxQueryTerms()); + ? null : new ArrayList<>(mltHelper.mlt.getParameters().getMaxQueryTerms()); DocListAndSet mltDocs = null; @@ -169,7 +172,7 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw // Find documents MoreLikeThis - either with a reader or a query // -------------------------------------------------------------------------------- if (reader != null) { - mltDocs = mlt.getMoreLikeThis(reader, start, rows, filters, + mltDocs = mltHelper.getMoreLikeThis(reader, start, rows, filters, interesting, flags); } else if (q != null) { // Matching options @@ -188,7 +191,7 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw if (iterator.hasNext()) { // do a MoreLikeThis query for each document in results int id = iterator.nextDoc(); - mltDocs = mlt.getMoreLikeThis(id, start, rows, filters, interesting, + mltDocs = mltHelper.getMoreLikeThis(id, start, rows, filters, interesting, flags); } } else { @@ -254,7 +257,7 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw // Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug? 
if (dbg == true) { try { - NamedList<Object> dbgInfo = SolrPluginUtils.doStandardDebug(req, q, mlt.getRawMLTQuery(), mltDocs.docList, dbgQuery, dbgResults); + NamedList<Object> dbgInfo = SolrPluginUtils.doStandardDebug(req, q, mltHelper.getBoostedMLTQuery(), mltDocs.docList, dbgQuery, dbgResults); if (null != dbgInfo) { if (null != filters) { dbgInfo.add("filter_queries", req.getParams().getParams(CommonParams.FQ)); @@ -324,30 +327,28 @@ public MoreLikeThisHelper( SolrParams params, SolrIndexSearcher searcher ) } this.mlt = new MoreLikeThis( reader ); // TODO -- after LUCENE-896, we can use , searcher.getSimilarity() ); - mlt.setFieldNames(fields); - mlt.setAnalyzer( searcher.getSchema().getIndexAnalyzer() ); + MoreLikeThisParameters luceneMltParams = new MoreLikeThisParameters(); + mlt.setParameters(luceneMltParams); + luceneMltParams.setFieldNames(fields); + luceneMltParams.setAnalyzer( searcher.getSchema().getIndexAnalyzer() ); // configurable params - - mlt.setMinTermFreq( params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); - mlt.setMinDocFreq( params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThis.DEFAULT_MIN_DOC_FREQ)); - mlt.setMaxDocFreq( params.getInt(MoreLikeThisParams.MAX_DOC_FREQ, MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); - mlt.setMinWordLen( params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - mlt.setMaxWordLen( params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - mlt.setMaxQueryTerms( params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - mlt.setBoost( params.getBool(MoreLikeThisParams.BOOST, false ) ); - boostFields = SolrPluginUtils.parseFieldBoosts(params.getParams(MoreLikeThisParams.QF)); + + luceneMltParams.setMinTermFreq( params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThisParameters.DEFAULT_MIN_TERM_FREQ)); + luceneMltParams.setMinDocFreq( params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThisParameters.DEFAULT_MIN_DOC_FREQ)); + luceneMltParams.setMaxDocFreq( params.getInt(MoreLikeThisParams.MAX_DOC_FREQ, MoreLikeThisParameters.DEFAULT_MAX_DOC_FREQ)); + luceneMltParams.setMinWordLen( params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThisParameters.DEFAULT_MIN_WORD_LENGTH)); + luceneMltParams.setMaxWordLen( params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThisParameters.DEFAULT_MAX_WORD_LENGTH)); + luceneMltParams.setMaxQueryTerms( params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThisParameters.DEFAULT_MAX_QUERY_TERMS)); + luceneMltParams.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThisParameters.DEFAULT_MAX_NUM_TOKENS_PARSED)); + luceneMltParams.enableBoost( params.getBool(MoreLikeThisParams.BOOST, false ) ); + boostFields = SolrPluginUtils.parseFieldBoosts(params.getParams(MoreLikeThisParams.QF)); + luceneMltParams.setFieldToQueryTimeBoostFactor(boostFields); } - - private Query rawMLTQuery; + private Query boostedMLTQuery; private BooleanQuery realMLTQuery; - public Query getRawMLTQuery(){ - return rawMLTQuery; - } - public Query getBoostedMLTQuery(){ return boostedMLTQuery; } @@ -356,35 +356,12 @@ public Query getRealMLTQuery(){ return realMLTQuery; } - private Query getBoostedQuery(Query mltquery) { - BooleanQuery boostedQuery = (BooleanQuery)mltquery; - if (boostFields.size() > 0) { - BooleanQuery.Builder newQ = new BooleanQuery.Builder(); - newQ.setMinimumNumberShouldMatch(boostedQuery.getMinimumNumberShouldMatch()); - for 
(BooleanClause clause : boostedQuery) { - Query q = clause.getQuery(); - float originalBoost = 1f; - if (q instanceof BoostQuery) { - BoostQuery bq = (BoostQuery) q; - q = bq.getQuery(); - originalBoost = bq.getBoost(); - } - Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); - q = ((fieldBoost != null) ? new BoostQuery(q, fieldBoost * originalBoost) : clause.getQuery()); - newQ.add(q, clause.getOccur()); - } - boostedQuery = newQ.build(); - } - return boostedQuery; - } - public DocListAndSet getMoreLikeThis( int id, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException { Document doc = reader.document(id); - rawMLTQuery = mlt.like(id); - boostedMLTQuery = getBoostedQuery( rawMLTQuery ); + boostedMLTQuery = mlt.like(id); if( terms != null ) { - fillInterestingTermsFromMLTQuery( rawMLTQuery, terms ); + fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms ); } // exclude current document from results @@ -406,13 +383,20 @@ public DocListAndSet getMoreLikeThis( int id, int start, int rows, List<Query> f public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException { - // analyzing with the first field: previous (stupid) behavior - rawMLTQuery = mlt.like(mlt.getFieldNames()[0], reader); - boostedMLTQuery = getBoostedQuery( rawMLTQuery ); - if( terms != null ) { - fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms ); + BooleanQuery.Builder boostedMLTQueryBuilder = new BooleanQuery.Builder(); + String content = IOUtils.toString(reader); + for (String fieldName : mlt.getParameters().getFieldNames()) { + Analyzer fieldQueryAnalyzer = searcher.getSchema().getField(fieldName).getType().getQueryAnalyzer(); + mlt.getParameters().setAnalyzer(fieldQueryAnalyzer); + Query partialMltQuery = mlt.like(fieldName, content); + if( terms != null ) { + fillInterestingTermsFromMLTQuery( partialMltQuery, terms ); + } + boostedMLTQueryBuilder.add(partialMltQuery, BooleanClause.Occur.SHOULD); } + DocListAndSet results = new DocListAndSet(); + boostedMLTQuery = boostedMLTQueryBuilder.build(); if (this.needDocSet) { results = searcher.getDocListAndSet( boostedMLTQuery, filters, null, start, rows, flags); } else { @@ -434,7 +418,6 @@ public NamedList getMoreLikeTheseQuery(DocList docs) if (mltquery.clauses().size() == 0) { return result; } - mltquery = (BooleanQuery) getBoostedQuery(mltquery); // exclude current document from results BooleanQuery.Builder mltQuery = new BooleanQuery.Builder(); diff --git a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java index 55edc631114c..91b01020e37e 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java @@ -332,7 +332,7 @@ ShardRequest buildShardQuery(ResponseBuilder rb, String q, String key) { s.params.remove(CommonParams.FL); // Should probably add something like this: - // String fl = s.params.get(MoreLikeThisParams.RETURN_FL, "*"); + // String fl = s.params.get(MoreLikeThisParameters.RETURN_FL, "*"); // if(fl != null){ // s.params.set(CommonParams.FL, fl + ",score"); // } @@ -387,7 +387,6 @@ NamedList getMoreLikeThese(ResponseBuilder rb, if (dbg != null) { SimpleOrderedMap<Object> docDbg = new SimpleOrderedMap<>(); - docDbg.add("rawMLTQuery", mltHelper.getRawMLTQuery().toString()); docDbg .add("boostedMLTQuery", 
mltHelper.getBoostedMLTQuery().toString()); docDbg.add("realMLTQuery", mltHelper.getRealMLTQuery().toString()); diff --git a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java index 945047b097ad..ffd9e5e1966d 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java @@ -22,10 +22,14 @@ import java.util.Map; import java.util.regex.Pattern; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.legacy.LegacyNumericUtils; import org.apache.lucene.queries.mlt.MoreLikeThis; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; @@ -69,97 +73,69 @@ public Query parse() { String[] qf = localParams.getParams("qf"); Map<String, Float> boostFields = new HashMap<>(); MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); + MoreLikeThisParameters mltParams = new MoreLikeThisParameters(); + mlt.setParameters(mltParams); + mltParams.setMinTermFreq(localParams.getInt("mintf", MoreLikeThisParameters.DEFAULT_MIN_TERM_FREQ)); + mltParams.setMinDocFreq(localParams.getInt("mindf", 0)); + mltParams.setMinWordLen(localParams.getInt("minwl", MoreLikeThisParameters.DEFAULT_MIN_WORD_LENGTH)); + mltParams.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThisParameters.DEFAULT_MAX_WORD_LENGTH)); + mltParams.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThisParameters.DEFAULT_MAX_QUERY_TERMS)); + mltParams.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThisParameters.DEFAULT_MAX_NUM_TOKENS_PARSED)); + mltParams.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThisParameters.DEFAULT_MAX_DOC_FREQ)); + + if (localParams.get("boost") != null) { + mltParams.enableBoost(localParams.getBool("boost")); + } - mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); - mlt.setMinDocFreq(localParams.getInt("mindf", 0)); - mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); - - Boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST); - mlt.setBoost(boost); - - mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); + mltParams.setAnalyzer(req.getSchema().getIndexAnalyzer()); - Map<String, Collection<Object>> filteredDocument = new HashMap<>(); - String[] fieldNames; + Document filteredDocument = new Document(); + ArrayList<String> fieldNames = new ArrayList<>(); if (qf != null) { - ArrayList<String> fields = new ArrayList<>(); for (String fieldName : qf) { if (!StringUtils.isEmpty(fieldName)) { String[] strings = splitList.split(fieldName); for (String string : strings) { if (!StringUtils.isEmpty(string)) { - fields.add(string); + fieldNames.add(string); } } } } // Parse field names and boosts from the fields - boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0])); - fieldNames = 
boostFields.keySet().toArray(new String[0]); + boostFields = SolrPluginUtils.parseFieldBoosts(fieldNames.toArray(new String[0])); + mltParams.setFieldToQueryTimeBoostFactor(boostFields); } else { - ArrayList<String> fields = new ArrayList<>(); for (String field : doc.getFieldNames()) { // Only use fields that are stored and have an explicit analyzer. // This makes sense as the query uses tf/idf/.. for query construction. // We might want to relook and change this in the future though. SchemaField f = req.getSchema().getFieldOrNull(field); if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) { - fields.add(field); + fieldNames.add(field); } } - fieldNames = fields.toArray(new String[0]); } - if (fieldNames.length < 1) { + if (fieldNames.size() < 1) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "MoreLikeThis requires at least one similarity field: qf" ); } - mlt.setFieldNames(fieldNames); + mltParams.setFieldNames(fieldNames.toArray(new String[fieldNames.size()])); + for (String field : fieldNames) { Collection<Object> fieldValues = doc.getFieldValues(field); if (fieldValues != null) { - Collection<Object> values = new ArrayList<>(); - for (Object val : fieldValues) { - if (val instanceof IndexableField) { - values.add(((IndexableField)val).stringValue()); - } - else { - values.add(val); + for (Object singleFieldValue : fieldValues) { + filteredDocument.add(new TextField(field, String.valueOf(singleFieldValue), Field.Store.YES)); } } - filteredDocument.put(field, values); - } } try { - Query rawMLTQuery = mlt.like(filteredDocument); - BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery; - - if (boost && boostFields.size() > 0) { - BooleanQuery.Builder newQ = new BooleanQuery.Builder(); - newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch()); - - for (BooleanClause clause : boostedMLTQuery) { - Query q = clause.getQuery(); - float originalBoost = 1f; - if (q instanceof BoostQuery) { - BoostQuery bq = (BoostQuery) q; - q = bq.getQuery(); - originalBoost = bq.getBoost(); - } - Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); - q = ((fieldBoost != null) ? 
new BoostQuery(q, fieldBoost * originalBoost) : clause.getQuery()); - newQ.add(q, clause.getOccur()); - } - - boostedMLTQuery = newQ.build(); - } + BooleanQuery boostedMLTQuery = (BooleanQuery) mlt.like(filteredDocument); // exclude current document from results BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder(); diff --git a/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java index de6eb58286b8..8a836d6c828e 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java @@ -18,9 +18,9 @@ import org.apache.lucene.index.Term; import org.apache.lucene.legacy.LegacyNumericUtils; import org.apache.lucene.queries.mlt.MoreLikeThis; +import org.apache.lucene.queries.mlt.MoreLikeThisParameters; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; @@ -68,16 +68,21 @@ public Query parse() { "document with id [" + uniqueValue + "]"); ScoreDoc[] scoreDocs = td.scoreDocs; MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); - - mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); - mlt.setMinDocFreq(localParams.getInt("mindf", MoreLikeThis.DEFAULT_MIN_DOC_FREQ)); - mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); - Boolean boost = localParams.getBool("boost", false); - mlt.setBoost(boost); + MoreLikeThisParameters mltParams = new MoreLikeThisParameters(); + mlt.setParameters(mltParams); + + mltParams.setMinTermFreq(localParams.getInt("mintf", MoreLikeThisParameters.DEFAULT_MIN_TERM_FREQ)); + mltParams.setMinDocFreq(localParams.getInt("mindf", MoreLikeThisParameters.DEFAULT_MIN_DOC_FREQ)); + mltParams.setMinWordLen(localParams.getInt("minwl", MoreLikeThisParameters.DEFAULT_MIN_WORD_LENGTH)); + mltParams.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThisParameters.DEFAULT_MAX_WORD_LENGTH)); + mltParams.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThisParameters.DEFAULT_MAX_QUERY_TERMS)); + mltParams.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThisParameters.DEFAULT_MAX_NUM_TOKENS_PARSED)); + mltParams.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThisParameters.DEFAULT_MAX_DOC_FREQ)); + // what happens if value is explicitly set to false? 
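+ // (an explicit boost=false still enters the branch below: enableBoost(false) is applied and the qf boosts are parsed all the same)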
+ if (localParams.get("boost") != null) { + mltParams.enableBoost(localParams.getBool("boost", false)); + boostFields = SolrPluginUtils.parseFieldBoosts(qf); + } String[] fieldNames; @@ -111,31 +116,11 @@ public Query parse() { "MoreLikeThis requires at least one similarity field: qf" ); } - mlt.setFieldNames(fieldNames); - mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); - - Query rawMLTQuery = mlt.like(scoreDocs[0].doc); - BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery; - - if (boost && boostFields.size() > 0) { - BooleanQuery.Builder newQ = new BooleanQuery.Builder(); - newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch()); + mltParams.setFieldNames(fieldNames); + mltParams.setAnalyzer(req.getSchema().getIndexAnalyzer()); + mltParams.setFieldToQueryTimeBoostFactor(boostFields); - for (BooleanClause clause : boostedMLTQuery) { - Query q = clause.getQuery(); - float originalBoost = 1f; - if (q instanceof BoostQuery) { - BoostQuery bq = (BoostQuery) q; - q = bq.getQuery(); - originalBoost = bq.getBoost(); - } - Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field()); - q = ((fieldBoost != null) ? new BoostQuery(q, fieldBoost * originalBoost) : clause.getQuery()); - newQ.add(q, clause.getOccur()); - } - - boostedMLTQuery = newQ.build(); - } + BooleanQuery boostedMLTQuery = (BooleanQuery) mlt.like(scoreDocs[0].doc); // exclude current document from results BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder(); From 562fb48acfe3cbf5df62c3818b89ab7904aa52a9 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 6 Feb 2017 23:09:57 +0000 Subject: [PATCH 2/3] [LUCENE-7498] minor fix in field names with boost analysis --- .../java/org/apache/solr/search/mlt/CloudMLTQParser.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java index ffd9e5e1966d..23923f75c735 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java @@ -93,19 +93,21 @@ public Query parse() { ArrayList<String> fieldNames = new ArrayList<>(); if (qf != null) { + ArrayList<String> fieldNamesWithBoost = new ArrayList<>(); for (String fieldName : qf) { if (!StringUtils.isEmpty(fieldName)) { String[] strings = splitList.split(fieldName); for (String string : strings) { if (!StringUtils.isEmpty(string)) { - fieldNames.add(string); + fieldNamesWithBoost.add(string); } } } } // Parse field names and boosts from the fields - boostFields = SolrPluginUtils.parseFieldBoosts(fieldNames.toArray(new String[0])); + boostFields = SolrPluginUtils.parseFieldBoosts(fieldNamesWithBoost.toArray(new String[0])); mltParams.setFieldToQueryTimeBoostFactor(boostFields); + fieldNames.addAll(boostFields.keySet()); } else { for (String field : doc.getFieldNames()) { // Only use fields that are stored and have an explicit analyzer. 
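Reviewer note on usage: the sketch below shows how the refactored API introduced in PATCH 1/3 is intended to be driven by a caller. It is a minimal illustration assembled from the setters that appear in the hunks above, not code taken from the patch; reader, analyzer, docId and the field names are hypothetical placeholders.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queries.mlt.MoreLikeThisParameters;
import org.apache.lucene.search.Query;

public class MltUsageSketch {
  // Builds a boosted MLT query for an indexed document; every knob now lives
  // on MoreLikeThisParameters instead of on MoreLikeThis itself.
  public static Query likeDocument(IndexReader reader, Analyzer analyzer, int docId) throws IOException {
    MoreLikeThisParameters params = new MoreLikeThisParameters();
    params.setFieldNames(new String[]{"title", "body"}); // hypothetical fields
    params.setAnalyzer(analyzer);
    params.setMinTermFreq(MoreLikeThisParameters.DEFAULT_MIN_TERM_FREQ);
    params.setMinDocFreq(MoreLikeThisParameters.DEFAULT_MIN_DOC_FREQ);
    params.setMaxQueryTerms(MoreLikeThisParameters.DEFAULT_MAX_QUERY_TERMS);
    params.enableBoost(true); // boosting now happens inside the library ...
    Map<String, Float> qfBoosts = new HashMap<>();
    qfBoosts.put("title", 10f);
    params.setFieldToQueryTimeBoostFactor(qfBoosts); // ... using these per-field factors
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setParameters(params);
    return mlt.like(docId); // already boosted: callers no longer post-process the query
  }
}

Compared with the pre-patch flow, the caller no longer receives a raw query and rewrites its clauses; like() returns the boosted query directly.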
From 061ca863a9f2fadd0ba996c9041cc720128a127b Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 6 Feb 2017 23:32:56 +0000 Subject: [PATCH 3/3] [LUCENE-7498] original test was not correct, fixed --- .../test/org/apache/solr/search/mlt/CloudMLTQParserTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java index e3a8d7b2d64a..c5e32e2df05f 100644 --- a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java +++ b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java @@ -134,7 +134,7 @@ public void testBoost() throws Exception { queryResponse = cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=true mintf=0 mindf=0}30")); solrDocuments = queryResponse.getResults(); - expectedIds = new int[]{29, 31, 32, 18, 23, 13, 14, 20, 22, 19}; + expectedIds = new int[]{31, 29, 32, 13, 14, 20, 22, 18, 23, 19}; actualIds = new int[solrDocuments.size()]; i = 0; for (SolrDocument solrDocument : solrDocuments) {
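Context for the expectation change in PATCH 3/3: with boost=true the ranking is now produced by the library-side boosting rather than by per-caller clause rewriting, which is the likely source of the shifted expected ordering. For comparison, this is a reconstruction (from the hunks deleted in PATCH 1/3, not new API) of the rewriting that MoreLikeThisHandler, SimpleMLTQParser and CloudMLTQParser previously duplicated:

import java.util.Map;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class LegacyMltBoosting {
  // Reconstruction of the deleted getBoostedQuery() logic: each TermQuery clause is
  // re-wrapped so its boost becomes fieldBoost * originalBoost when qf names its field.
  static BooleanQuery applyFieldBoosts(BooleanQuery mltQuery, Map<String, Float> boostFields) {
    if (boostFields.isEmpty()) {
      return mltQuery;
    }
    BooleanQuery.Builder newQ = new BooleanQuery.Builder();
    newQ.setMinimumNumberShouldMatch(mltQuery.getMinimumNumberShouldMatch());
    for (BooleanClause clause : mltQuery) {
      Query q = clause.getQuery();
      float originalBoost = 1f;
      if (q instanceof BoostQuery) { // unwrap a boost the MLT query already attached
        BoostQuery bq = (BoostQuery) q;
        q = bq.getQuery();
        originalBoost = bq.getBoost();
      }
      Float fieldBoost = boostFields.get(((TermQuery) q).getTerm().field());
      q = (fieldBoost != null) ? new BoostQuery(q, fieldBoost * originalBoost) : clause.getQuery();
      newQ.add(q, clause.getOccur());
    }
    return newQ.build();
  }
}

With that duplication removed, boost application is uniform across the three Solr entry points.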