Allow stopwords to be configured at search time (castorini#1473)

Stopwords can currently be configured at index time. This PR adds support for specifying stopwords at search time as well. This is helpful because the same stopwords are generally used at both index time and search time. For example, if the stopword 'where' is dropped from the index but still included in searches, then a search for 'where' only matches indexed words that share its stem, such as 'wheres'. This behavior is surprising and usually undesirable.
yuki617 · Jan 28, 2021 · 8d7c517 · 8d7c517
1 parent d7ad388
commit 8d7c517
Show file tree

Hide file tree

Showing 7 changed files with 113 additions and 16 deletions.
diff --git a/src/main/java/io/anserini/analysis/DefaultEnglishAnalyzer.java b/src/main/java/io/anserini/analysis/DefaultEnglishAnalyzer.java
@@ -16,6 +16,8 @@
 
 package io.anserini.analysis;
 
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
@@ -29,6 +31,10 @@
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
 public class DefaultEnglishAnalyzer extends StopwordAnalyzerBase {
   private final boolean stem;
   private final String stemmer;
@@ -127,4 +133,26 @@ public static final DefaultEnglishAnalyzer newNonStemmingInstance() {
   public static final DefaultEnglishAnalyzer newNonStemmingInstance(CharArraySet stopwords) {
     return new DefaultEnglishAnalyzer(stopwords);
   }
+
+  /**
+   * Creates a stemming analyzer from the stopword-related command line arguments.
+   *
+   * <p>Exactly one stopword configuration is selected:
+   * <ul>
+   *   <li>{@code keepStopwords} is true: an empty stopword set is used, so no stopwords are removed;</li>
+   *   <li>{@code stopwordsFile} is non-null: the stopword set is read from that file;</li>
+   *   <li>otherwise: the analyzer's default stopword set is used.</li>
+   * </ul>
+   *
+   * @param stemmer stemmer name passed through to {@code newStemmingInstance}, e.g. "porter" or "krovetz"
+   * @param keepStopwords flag that allows keeping all stopwords. If true, then stopwordsFile must be null.
+   * @param stopwordsFile path to a UTF-8 text file with one stopword per line. Surrounding whitespace is
+   *                      trimmed and blank lines are ignored; entries are stored without case folding.
+   *                      If null, the default stopwords set is used.
+   *
+   * @return analyzer as configured
+   * @throws IllegalArgumentException if keepStopwords is true and stopwordsFile is not null
+   * @throws IOException if there's an error reading the stopwords file
+   */
+  public static DefaultEnglishAnalyzer fromArguments(String stemmer, boolean keepStopwords, String stopwordsFile) throws IOException {
+    if (keepStopwords) {
+      // Validate explicitly rather than with `assert`: assertions are disabled unless the JVM runs
+      // with -ea, in which case a conflicting stopwords file would be silently ignored.
+      if (stopwordsFile != null) {
+        throw new IllegalArgumentException(
+            "keepStopwords and stopwordsFile are mutually exclusive, but stopwordsFile was: " + stopwordsFile);
+      }
+      return DefaultEnglishAnalyzer.newStemmingInstance(stemmer, CharArraySet.EMPTY_SET);
+    }
+
+    if (stopwordsFile == null) {
+      return DefaultEnglishAnalyzer.newStemmingInstance(stemmer);
+    }
+
+    List<String> lines = FileUtils.readLines(new File(stopwordsFile), "utf-8");
+    CharArraySet stopWordsSet = new CharArraySet(lines.size(), false);
+    for (String line : lines) {
+      // An entry with stray whitespace could never equal a token, and a blank line is not a word.
+      String word = line.trim();
+      if (!word.isEmpty()) {
+        stopWordsSet.add(word);
+      }
+    }
+    return DefaultEnglishAnalyzer.newStemmingInstance(stemmer, CharArraySet.unmodifiableSet(stopWordsSet));
+  }
 }
diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java
@@ -49,7 +49,6 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.core.config.Configurator;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
@@ -89,7 +88,12 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
@@ -751,16 +755,9 @@ public Counters run() throws IOException {
       final GermanAnalyzer germanAnalyzer = new GermanAnalyzer();
       final SpanishAnalyzer spanishAnalyzer = new SpanishAnalyzer();
       final WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();
-      final DefaultEnglishAnalyzer analyzer;
-      if (args.keepStopwords) {
-        analyzer = DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer, CharArraySet.EMPTY_SET);
-      } else if (args.stopwords != null) {
-        final List<String> stopWords = FileUtils.readLines(new File(args.stopwords), "utf-8");
-        final CharArraySet stopWordsSet = new CharArraySet(stopWords, false);
-        analyzer = DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer, CharArraySet.unmodifiableSet(stopWordsSet));
-      } else {
-        analyzer = DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer);
-      }
+
+      final DefaultEnglishAnalyzer analyzer = DefaultEnglishAnalyzer.fromArguments(
+              args.stemmer, args.keepStopwords, args.stopwords);
       final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);
 
       final IndexWriterConfig config;

diff --git a/src/main/java/io/anserini/search/SearchArgs.java b/src/main/java/io/anserini/search/SearchArgs.java
@@ -80,6 +80,10 @@ public class SearchArgs {
   @Option(name = "-keepstopwords", usage = "Boolean switch to keep stopwords in the query topics")
   public boolean keepstop = false;
 
+  // Path to a custom stopwords file used when analyzing queries. It is mutually exclusive with the
+  // "-keepstopwords" switch declared just above; the name in `forbids` is spelled exactly as that
+  // option is declared (all lowercase) so the constraint refers to an option that exists in this class.
+  @Option(name = "-stopwords", metaVar = "[file]", forbids = "-keepstopwords",
+          usage = "Path to file with stopwords.")
+  public String stopwords = null;
+
   @Option(name = "-arbitraryScoreTieBreak", usage = "Break score ties arbitrarily (not recommended)")
   public boolean arbitraryScoreTieBreak = false;
 

diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
@@ -31,6 +31,7 @@
 import io.anserini.rerank.lib.NewsBackgroundLinkingReranker;
 import io.anserini.rerank.lib.Rm3Reranker;
 import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
+import io.anserini.search.query.BagOfWordsQueryGenerator;
 import io.anserini.search.query.QueryGenerator;
 import io.anserini.search.query.SdmQueryGenerator;
 import io.anserini.search.similarity.AccurateBM25Similarity;
@@ -306,12 +307,11 @@ public SearchCollection(SearchArgs args) throws IOException {
       LOG.info("Language: en_ws");
     } else {
       // Default to English
-      analyzer = args.keepstop ?
-          DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer, CharArraySet.EMPTY_SET) :
-          DefaultEnglishAnalyzer.newStemmingInstance(args.stemmer);
+      analyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepstop, args.stopwords);
       LOG.info("Language: en");
       LOG.info("Stemmer: " + args.stemmer);
       LOG.info("Keep stopwords? " + args.keepstop);
+      LOG.info("Stopwords file " + args.stopwords);
     }
 
     isRerank = args.rm3 || args.axiom || args.bm25prf;

diff --git a/src/main/java/io/anserini/search/SearchMsmarco.java b/src/main/java/io/anserini/search/SearchMsmarco.java
@@ -16,10 +16,13 @@
 
 package io.anserini.search;
 
+import io.anserini.analysis.DefaultEnglishAnalyzer;
 import io.anserini.search.query.BagOfWordsQueryGenerator;
 import io.anserini.search.query.DisjunctionMaxQueryGenerator;
 import io.anserini.search.query.QueryGenerator;
 import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.kohsuke.args4j.CmdLineException;
 import org.kohsuke.args4j.CmdLineParser;
 import org.kohsuke.args4j.Option;
@@ -87,6 +90,17 @@ public static class Args {
 
     @Option(name = "-dismax.tiebreaker", metaVar = "[value]", usage = "The tiebreaker weight to use in disjunction max queries.")
     public float dismax_tiebreaker = 0.0f;
+
+    // When set, query analysis keeps all stopwords (no stopword removal).
+    @Option(name = "-keepstopwords", usage = "Boolean switch to keep stopwords in the query topics")
+    public boolean keepstop = false;
+
+    // Name of the stemmer handed to the query analyzer.
+    @Option(name = "-stemmer", usage = "Stemmer: one of the following porter,krovetz,none. Default porter")
+    public String stemmer = "porter";
+
+    // Path to a custom stopwords file. It is mutually exclusive with "-keepstopwords" above; the name
+    // in `forbids` is spelled exactly as that option is declared (all lowercase) so the constraint
+    // refers to an option that exists in this class.
+    @Option(name = "-stopwords", metaVar = "[file]", forbids = "-keepstopwords",
+            usage = "Path to file with stopwords.")
+    public String stopwords = null;
+
   }
 
   public static void main(String[] args) throws Exception {
@@ -104,7 +118,12 @@ public static void main(String[] args) throws Exception {
 
     long totalStartTime = System.nanoTime();
 
-    SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index);
+    Analyzer analyzer = DefaultEnglishAnalyzer.fromArguments(
+            retrieveArgs.stemmer, retrieveArgs.keepstop, retrieveArgs.stopwords);
+    System.out.println("Initializing analyzer with stemmer=" + retrieveArgs.stemmer + ", keepstop=" +
+            retrieveArgs.keepstop + ", stopwords=" + retrieveArgs.stopwords);
+
+    SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index, analyzer);
     searcher.setBM25(retrieveArgs.k1, retrieveArgs.b);
     System.out.println("Initializing BM25, setting k1=" + retrieveArgs.k1 + " and b=" + retrieveArgs.b + "");
 

diff --git a/src/test/java/io/anserini/analysis/DefaultEnglishAnalyzerTest.java b/src/test/java/io/anserini/analysis/DefaultEnglishAnalyzerTest.java
@@ -0,0 +1,45 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.analysis;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+import static io.anserini.analysis.DefaultEnglishAnalyzer.fromArguments;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for the stopword handling of {@code DefaultEnglishAnalyzer.fromArguments}.
+ */
+public class DefaultEnglishAnalyzerTest {
+  private static final String STEMMER = "porter";
+  private static final String STOPWORDS_FILE = "src/test/resources/test-stopwords.txt";
+
+  /**
+   * With no stopwords file, the keepStopwords flag switches between the default English stop set
+   * and the empty set.
+   */
+  @Test
+  public void testKeepStopwords() throws Exception {
+    CharArraySet defaultStopwords = fromArguments(STEMMER, false, null).getStopwordSet();
+    assertEquals(EnglishAnalyzer.getDefaultStopSet(), defaultStopwords);
+
+    CharArraySet keptStopwords = fromArguments(STEMMER, true, null).getStopwordSet();
+    assertEquals(CharArraySet.EMPTY_SET, keptStopwords);
+  }
+
+  /**
+   * A stopwords file is loaded into the analyzer's stop set, one entry per line.
+   */
+  @Test
+  public void testStopwordsLoading() throws Exception {
+    CharArraySet expected = new CharArraySet(Arrays.asList("some", "very", "common", "words"), false);
+    CharArraySet loaded = fromArguments(STEMMER, false, STOPWORDS_FILE).getStopwordSet();
+    assertEquals(expected, loaded);
+  }
+}
diff --git a/src/test/resources/test-stopwords.txt b/src/test/resources/test-stopwords.txt
@@ -0,0 +1,4 @@
+some
+very
+common
+words