Skip to content

Commit

Permalink
build javadoc
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcelloPerathoner committed Sep 3, 2024
1 parent 0964871 commit 9157385
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gradle-jar.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
- name: Build
run: |
cd repo
./gradlew jar
./gradlew jar sourcesJar javadocJar
- name: Release
uses: softprops/action-gh-release@v2
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
.gradle
.vscode
*.code-workspace
lib/bin

# Ignore Gradle build output directory
build
55 changes: 52 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,52 @@ This project contains:
- a filter that converts roman numerals into arabic ones, and
- a value source that correctly sorts strings with numbers.

The Latin stemmer uses an algorithm by Schinke et al.
Latin Stem Filter
-----------------

.. seealso::
Usage example in :code:`schema.xml`:

.. code-block:: xml
<fieldType name="text_la_stem" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="de.uni_koeln.capitularia.lucene_tools.LatinStemFilterFactory"
preserveOriginal="true" minNounSize="3" minVerbSize="3"/>
</analyzer>
</fieldType>
The stemmer uses an algorithm by Schinke et al. See:

Schinke R, Greengrass M, Robertson AM and Willett P (1996)
:title:`A stemming algorithm for Latin text databases.`
Journal of Documentation, 52: 172-187.

https://snowballstem.org/otherapps/schinke/

The filter will convert roman "XLII" to arabic "42".

Roman Numerals Filter
---------------------

The filter will convert roman :code:`XLII` to arabic :code:`42`.

Usage example in :code:`schema.xml`:

.. code-block:: xml
<fieldType name="text_la_stem" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="de.uni_koeln.capitularia.lucene_tools.RomanNumeralsFilterFactory"
preserveOriginal="true"/>
</analyzer>
</fieldType>
Sorting Value Source
--------------------

The value source generates a string that can be used as a key to sort strings correctly
like this:
Expand All @@ -29,3 +64,17 @@ instead of alphabetically, like this:

#. paris-bn-lat-10528
#. paris-bn-lat-4638

Usage example in :code:`solrconfig.xml`:

.. code-block:: xml
<config>
<valueSourceParser
name="strnumsort"
class="de.uni_koeln.capitularia.lucene_tools.StringNumberSortValueSourceParser"
/>
...
</config>
In the query set the :code:`sort` parameter to: :code:`strnumsort(my_alphanum_id) asc`
17 changes: 14 additions & 3 deletions lib/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,26 @@ dependencies {
testImplementation("org.apache.solr:solr-test-framework:9.6.1")
}

// Apply a specific Java toolchain to ease working on different environments.
// Sets the base name used for the archives this project produces.
base {
archivesName = "capitularia-lucene-tools"
}

// Java plugin settings: extra artifacts to build and the toolchain to compile with.
java {
// Also package the generated Javadoc as a jar (built by the `javadocJar` task).
withJavadocJar()
// Also package the sources as a jar (built by the `sourcesJar` task).
withSourcesJar()
// Pin the Java language version to 17.
toolchain {
languageVersion = JavaLanguageVersion.of(17)
}
}

tasks.jar {
archiveBaseName.set("capitularia-lucene-tools")
tasks.withType<Javadoc> {
options {
this as StandardJavadocDocletOptions
addStringOption(
"tag",
"lucene.spi:t:SPI Name:"
)
}
}

tasks.named<Test>("test") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
* See: {@link LatinStemmer}
*/
public final class LatinStemFilter extends TokenFilter {
/** The default for the parameter minNounSize */
public static final int DEFAULT_MIN_NOUN_SIZE = 2;
/** The default for the parameter minVerbSize */
public static final int DEFAULT_MIN_VERB_SIZE = 2;
/** The default for the parameter preserveOriginal */
public static final boolean DEFAULT_PRESERVE_ORIGINAL = true;

private static final Locale LOCALE =
Expand All @@ -36,6 +39,10 @@ public final class LatinStemFilter extends TokenFilter {
private final Deque<String> stack = new ArrayDeque<>();
private State state;

/**
* Constructor
* @param input The input token stream
*/
protected LatinStemFilter(TokenStream input) {
this(
input,
Expand All @@ -45,6 +52,13 @@ protected LatinStemFilter(TokenStream input) {
);
}

/**
* Constructor
* @param input The input token stream
* @param minNounSize The minimum noun size to stem
* @param minVerbSize The minimum verb size to stem
* @param preserveOriginal Preserves the original word if true
*/
protected LatinStemFilter(
TokenStream input,
int minNounSize,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* This filter replaces roman numerals with arabic ones, e.g. "XLII" => "42"
*/
public final class RomanNumeralsFilter extends TokenFilter {
/** The default for the parameter preserveOriginal */
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;

private final CharTermAttribute termAttr;
Expand All @@ -23,13 +24,22 @@ public final class RomanNumeralsFilter extends TokenFilter {
private String original;
private State state;

/**
 * Creates a filter that uses {@link #DEFAULT_PRESERVE_ORIGINAL} as the
 * {@code preserveOriginal} setting.
 *
 * @param input The input token stream
 */
protected RomanNumeralsFilter(TokenStream input) {
    this(input, DEFAULT_PRESERVE_ORIGINAL);
}

/**
* Constructor
* @param input The input token stream
* @param preserveOriginal Preserves the original word if true
*/
protected RomanNumeralsFilter(TokenStream input, boolean preserveOriginal) {
super(input);
this.preserveOriginal = preserveOriginal;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
* </pre>
*/
public final class StringNumberSortValueSource extends SingleFunction {
/** The name of the value source */
public static final String NAME = "strnumsort";
/** The regex pattern used to find the numbers to convert */
public static final Pattern REGEX = Pattern.compile("\\d+");

/**
Expand All @@ -46,6 +48,13 @@ protected String name() {
return NAME;
}

/**
* Replaces the numbers with keys that sort correctly.
* @param doc The document
* @param vals The function values
* @return The string with numbers replaced
* @throws IOException If {@code vals.strVal(doc)} fails to read the string value of the document
*/
protected String func(int doc, FunctionValues vals) throws IOException {
String s = vals.strVal(doc);
return REGEX.matcher(s).replaceAll(mr -> mr.group().length() + mr.group());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* <pre>
* {@code <valueSourceParser name="strnumsort"
* class="de.uni_koeln.capitularia.lucene_tools.StringNumberSortValueSourceParser" />}
* </pre>
*/
public final class StringNumberSortValueSourceParser extends ValueSourceParser {
/**
Expand Down

0 comments on commit 9157385

Please sign in to comment.