Skip to content

Commit

Permalink
Allow stopwords to be configured at search time (castorini#1473)
Browse files Browse the repository at this point in the history
Stopwords can currently be configured at index-time. This PR adds support for
specifying stopwords at search-time as well.

This is helpful because generally the same stopwords are used at both index and
search time. If the stopword 'where' is dropped from the index but still
included in searches, then a search for 'where' only matches words with the same
stem like 'wheres'. This is tricky behavior and usually undesirable.
  • Loading branch information
jtibshirani committed Jan 28, 2021
1 parent d7ad388 commit 8d7c517
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 16 deletions.
28 changes: 28 additions & 0 deletions src/main/java/io/anserini/analysis/DefaultEnglishAnalyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

package io.anserini.analysis;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
Expand All @@ -29,6 +31,10 @@
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.File;
import java.io.IOException;
import java.util.List;

public class DefaultEnglishAnalyzer extends StopwordAnalyzerBase {
private final boolean stem;
private final String stemmer;
Expand Down Expand Up @@ -127,4 +133,26 @@ public static final DefaultEnglishAnalyzer newNonStemmingInstance() {
public static final DefaultEnglishAnalyzer newNonStemmingInstance(CharArraySet stopwords) {
return new DefaultEnglishAnalyzer(stopwords);
}

/**
 * Creates a stemming analyzer given common command line arguments.
 *
 * <p>The stopword set is chosen as follows:
 * <ul>
 *   <li>{@code keepStopwords} is true: an empty stopword set is used, so no terms are dropped;</li>
 *   <li>{@code stopwordsFile} is non-null: the stopwords are read from that file;</li>
 *   <li>otherwise: the analyzer is built without an explicit stopword set (the default set is used).</li>
 * </ul>
 *
 * @param stemmer either "porter" or "krovetz"
 * @param keepStopwords flag that allows keeping all stopwords. If true, then stopwordsFile must be null.
 * @param stopwordsFile a UTF-8 text file with one stopword per line. If null, the default stopwords set is used.
 *
 * @return analyzer as configured
 * @throws IllegalArgumentException if {@code keepStopwords} is true and {@code stopwordsFile} is not null
 * @throws IOException if there's an error reading the stopwords file
 */
public static DefaultEnglishAnalyzer fromArguments(String stemmer, boolean keepStopwords, String stopwordsFile) throws IOException {
  if (keepStopwords) {
    // Validate explicitly instead of with `assert`: assertions are disabled by default at runtime,
    // in which case a conflicting stopwords file would be silently ignored.
    if (stopwordsFile != null) {
      throw new IllegalArgumentException(
          "keepStopwords cannot be combined with a stopwords file: " + stopwordsFile);
    }
    return DefaultEnglishAnalyzer.newStemmingInstance(stemmer, CharArraySet.EMPTY_SET);
  }

  if (stopwordsFile == null) {
    return DefaultEnglishAnalyzer.newStemmingInstance(stemmer);
  }

  // Each line of the file becomes one stopword; `false` keeps matching case-sensitive.
  List<String> stopWords = FileUtils.readLines(new File(stopwordsFile), "utf-8");
  CharArraySet stopWordsSet = new CharArraySet(stopWords, false);
  return DefaultEnglishAnalyzer.newStemmingInstance(stemmer, CharArraySet.unmodifiableSet(stopWordsSet));
}
}
21 changes: 9 additions & 12 deletions src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.config.Configurator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
Expand Down Expand Up @@ -89,7 +88,12 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
Expand Down Expand Up @@ -751,16 +755,9 @@ public Counters run() throws IOException {
final GermanAnalyzer germanAnalyzer = new GermanAnalyzer();
final SpanishAnalyzer spanishAnalyzer = new SpanishAnalyzer();
final WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();
final DefaultEnglishAnalyzer analyzer;
if (args.keepStopwords) {
analyzer = DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer, CharArraySet.EMPTY_SET);
} else if (args.stopwords != null) {
final List<String> stopWords = FileUtils.readLines(new File(args.stopwords), "utf-8");
final CharArraySet stopWordsSet = new CharArraySet(stopWords, false);
analyzer = DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer, CharArraySet.unmodifiableSet(stopWordsSet));
} else {
analyzer = DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer);
}

final DefaultEnglishAnalyzer analyzer = DefaultEnglishAnalyzer.fromArguments(
args.stemmer, args.keepStopwords, args.stopwords);
final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);

final IndexWriterConfig config;
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ public class SearchArgs {
@Option(name = "-keepstopwords", usage = "Boolean switch to keep stopwords in the query topics")
public boolean keepstop = false;

// Custom stopwords file for query analysis; mutually exclusive with "-keepstopwords".
// NOTE: args4j resolves "forbids" entries by exact (case-sensitive) option name, so the value must be
// spelled exactly like the "-keepstopwords" option declared above or the constraint is never enforced.
@Option(name = "-stopwords", metaVar = "[file]", forbids = "-keepstopwords",
    usage = "Path to file with stopwords.")
public String stopwords = null;

@Option(name = "-arbitraryScoreTieBreak", usage = "Break score ties arbitrarily (not recommended)")
public boolean arbitraryScoreTieBreak = false;

Expand Down
6 changes: 3 additions & 3 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import io.anserini.rerank.lib.NewsBackgroundLinkingReranker;
import io.anserini.rerank.lib.Rm3Reranker;
import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.search.query.QueryGenerator;
import io.anserini.search.query.SdmQueryGenerator;
import io.anserini.search.similarity.AccurateBM25Similarity;
Expand Down Expand Up @@ -306,12 +307,11 @@ public SearchCollection(SearchArgs args) throws IOException {
LOG.info("Language: en_ws");
} else {
// Default to English
analyzer = args.keepstop ?
DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer, CharArraySet.EMPTY_SET) :
DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer);
analyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepstop, args.stopwords);
LOG.info("Language: en");
LOG.info("Stemmer: " + args.stemmer);
LOG.info("Keep stopwords? " + args.keepstop);
LOG.info("Stopwords file " + args.stopwords);
}

isRerank = args.rm3 || args.axiom || args.bm25prf;
Expand Down
21 changes: 20 additions & 1 deletion src/main/java/io/anserini/search/SearchMsmarco.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@

package io.anserini.search;

import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.search.query.DisjunctionMaxQueryGenerator;
import io.anserini.search.query.QueryGenerator;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
Expand Down Expand Up @@ -87,6 +90,17 @@ public static class Args {

@Option(name = "-dismax.tiebreaker", metaVar = "[value]", usage = "The tiebreaker weight to use in disjunction max queries.")
public float dismax_tiebreaker = 0.0f;

@Option(name = "-keepstopwords", usage = "Boolean switch to keep stopwords in the query topics")
public boolean keepstop = false;

@Option(name = "-stemmer", usage = "Stemmer: one of the following porter,krovetz,none. Default porter")
public String stemmer = "porter";

// Custom stopwords file for query analysis; mutually exclusive with "-keepstopwords".
// NOTE: args4j resolves "forbids" entries by exact (case-sensitive) option name, so the value must be
// spelled exactly like the "-keepstopwords" option declared above or the constraint is never enforced.
@Option(name = "-stopwords", metaVar = "[file]", forbids = "-keepstopwords",
    usage = "Path to file with stopwords.")
public String stopwords = null;

}

public static void main(String[] args) throws Exception {
Expand All @@ -104,7 +118,12 @@ public static void main(String[] args) throws Exception {

long totalStartTime = System.nanoTime();

SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index);
Analyzer analyzer = DefaultEnglishAnalyzer.fromArguments(
retrieveArgs.stemmer, retrieveArgs.keepstop, retrieveArgs.stopwords);
System.out.println("Initializing analyzer with stemmer=" + retrieveArgs.stemmer + ", keepstop=" +
retrieveArgs.keepstop + ", stopwords=" + retrieveArgs.stopwords);

SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index, analyzer);
searcher.setBM25(retrieveArgs.k1, retrieveArgs.b);
System.out.println("Initializing BM25, setting k1=" + retrieveArgs.k1 + " and b=" + retrieveArgs.b + "");

Expand Down
45 changes: 45 additions & 0 deletions src/test/java/io/anserini/analysis/DefaultEnglishAnalyzerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.junit.Test;

import java.util.Arrays;

import static io.anserini.analysis.DefaultEnglishAnalyzer.fromArguments;
import static org.junit.Assert.assertEquals;

/**
 * Checks that {@link DefaultEnglishAnalyzer#fromArguments} selects the expected stopword set for each
 * combination of the keep-stopwords flag and the stopwords file argument.
 */
public class DefaultEnglishAnalyzerTest {

  /** Without a stopwords file, the flag switches between the default English set and the empty set. */
  @Test
  public void testKeepStopwords() throws Exception {
    // Flag off, no file: the stopword set equals Lucene's default English stop set.
    DefaultEnglishAnalyzer dropsStopwords = fromArguments("porter", false, null);
    assertEquals(EnglishAnalyzer.getDefaultStopSet(), dropsStopwords.getStopwordSet());

    // Flag on: nothing is treated as a stopword.
    DefaultEnglishAnalyzer keepsStopwords = fromArguments("porter", true, null);
    assertEquals(CharArraySet.EMPTY_SET, keepsStopwords.getStopwordSet());
  }

  /** A stopwords file is loaded with one stopword per line. */
  @Test
  public void testStopwordsLoading() throws Exception {
    CharArraySet expected = new CharArraySet(Arrays.asList("some", "very", "common", "words"), false);

    DefaultEnglishAnalyzer fromFile = fromArguments("porter", false, "src/test/resources/test-stopwords.txt");

    assertEquals(expected, fromFile.getStopwordSet());
  }
}
4 changes: 4 additions & 0 deletions src/test/resources/test-stopwords.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
some
very
common
words

0 comments on commit 8d7c517

Please sign in to comment.