From 5f3e7ff81207aab009991af6f6f4e000341f1cc2 Mon Sep 17 00:00:00 2001
From: Adityakolluru7
Date: Thu, 3 Nov 2022 14:30:52 -0400
Subject: [PATCH 01/21] added topic interpretability to eval metrics

---
 .../interpretability_metrics.py              | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 octis/evaluation_metrics/interpretability_metrics.py

diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py
new file mode 100644
index 00000000..ad91f1d9
--- /dev/null
+++ b/octis/evaluation_metrics/interpretability_metrics.py
@@ -0,0 +1,45 @@
+from octis.evaluation_metrics.metrics import AbstractMetric
+from gensim.corpora.dictionary import Dictionary
+from octis.evaluation_metrics.diversity_metrics import TopicDiversity
+from octis.evaluation_metrics.coherence_metrics import Coherence
+
+
+class TopicInterpretability(AbstractMetric):
+    def __init__(
+        self, # noqa
+        texts: str = None,
+        topk: int = 10,
+        coherence_measure: str = "c_npmi",
+    ) -> None:
+        """
+        Initialize metric
+        Parameters
+        ----------
+        texts : list of documents (list of lists of strings)
+        topk : how many most likely words to consider in
+        the evaluation
+        measure : (default 'c_npmi') measure to use.
+        """
+        super().__init__()
+        if texts is None:
+            raise Exception("There are no texts in the document")
+        else:
+            self._texts = texts
+        self._dictionary = Dictionary(self._texts)
+        self.topk = topk
+        self.coherence_measure = coherence_measure
+        c_npmi = Coherence(texts, topk=topk, measure=coherence_measure)
+        topic_diversity = TopicDiversity(topk=topk)
+        self.c_npmi = c_npmi
+        self.topic_diversity = topic_diversity
+
+    def score(self, model_output: dict) -> float: # noqa
+
+        if self.c_npmi.score(model_output) != 0:
+            # 1 is added to convert npmi output into a positive scale
+            return (1+self.c_npmi.score(
+                model_output
+            )) * self.topic_diversity.score(model_output)
+
+        elif self.topic_diversity.score(model_output) == 0:
+            return 0

From 59b816e1e5b674c127a2a6eefa6812a95595eb81 Mon Sep 17 00:00:00 2001
From: Adityakolluru7
Date: Mon, 7 Nov 2022 11:20:00 -0500
Subject: [PATCH 02/21] minor refactoring

---
 .../interpretability_metrics.py              | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py
index ad91f1d9..10be8994 100644
--- a/octis/evaluation_metrics/interpretability_metrics.py
+++ b/octis/evaluation_metrics/interpretability_metrics.py
@@ -1,15 +1,14 @@
 from octis.evaluation_metrics.metrics import AbstractMetric
 from gensim.corpora.dictionary import Dictionary
 from octis.evaluation_metrics.diversity_metrics import TopicDiversity
-from octis.evaluation_metrics.coherence_metrics import Coherence
+from octis.evaluation_metrics.coherence_metrics import Coherence,_load_default_texts


 class TopicInterpretability(AbstractMetric):
     def __init__(
         self, # noqa
         texts: str = None,
-        topk: int = 10,
-        coherence_measure: str = "c_npmi",
+        topk: int = 10
     ) -> None:
         """
         Initialize metric
         Parameters
         ----------
         texts : list of documents (list of lists of strings)
         topk : how many most likely words to consider in
         the evaluation
         measure : (default 'c_npmi') measure to use.
         """
         super().__init__()
         if texts is None:
-            raise Exception("There are no texts in the document")
+            self._texts = _load_default_texts()
         else:
             self._texts = texts
         self._dictionary = Dictionary(self._texts)
         self.topk = topk
-        self.coherence_measure = coherence_measure
-        c_npmi = Coherence(texts, topk=topk, measure=coherence_measure)
-        topic_diversity = TopicDiversity(topk=topk)
-        self.c_npmi = c_npmi
-        self.topic_diversity
= topic_diversity + self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi') + self.topic_diversity = TopicDiversity(topk=topk) def score(self, model_output: dict) -> float: # noqa - - if self.c_npmi.score(model_output) != 0: # 1 is added to convert npmi output into a positive scale return (1+self.c_npmi.score( model_output )) * self.topic_diversity.score(model_output) - - elif self.topic_diversity.score(model_output) == 0: - return 0 From c7af4ce59503787f863a38791e405dc05d88fcda Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Mon, 7 Nov 2022 14:02:49 -0500 Subject: [PATCH 03/21] some more minor refactoring --- octis/evaluation_metrics/interpretability_metrics.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py index 10be8994..ecb2068b 100644 --- a/octis/evaluation_metrics/interpretability_metrics.py +++ b/octis/evaluation_metrics/interpretability_metrics.py @@ -17,15 +17,12 @@ def __init__( texts : list of documents (list of lists of strings) topk : how many most likely words to consider in the evaluation - measure : (default 'c_npmi') measure to use. """ super().__init__() if texts is None: self._texts = _load_default_texts() else: self._texts = texts - self._dictionary = Dictionary(self._texts) - self.topk = topk self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi') self.topic_diversity = TopicDiversity(topk=topk) From 3c06213fbe7856b266eb496c0e4d05efb0b6d8e9 Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Mon, 7 Nov 2022 15:22:08 -0500 Subject: [PATCH 04/21] minor refactoring --- octis/evaluation_metrics/interpretability_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py index ecb2068b..16d84b09 100644 --- a/octis/evaluation_metrics/interpretability_metrics.py +++ b/octis/evaluation_metrics/interpretability_metrics.py @@ -1,5 +1,4 @@ from octis.evaluation_metrics.metrics import AbstractMetric -from gensim.corpora.dictionary import Dictionary from octis.evaluation_metrics.diversity_metrics import TopicDiversity from octis.evaluation_metrics.coherence_metrics import Coherence,_load_default_texts From 66d46a0789c85caceec7b57d5c1fbc0229b2c125 Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 10:28:25 -0500 Subject: [PATCH 05/21] logging topk in topic_interpretability --- octis/evaluation_metrics/interpretability_metrics.py | 1 + octis/optimization/optimizer_evaluation.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py index 16d84b09..dfa30c26 100644 --- a/octis/evaluation_metrics/interpretability_metrics.py +++ b/octis/evaluation_metrics/interpretability_metrics.py @@ -22,6 +22,7 @@ def __init__( self._texts = _load_default_texts() else: self._texts = texts + self.topk=topk self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi') self.topic_diversity = TopicDiversity(topk=topk) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index a7b2130a..acbba943 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -2,7 +2,8 @@ import numpy as np import pandas as pd from octis.optimization.optimizer_tool import check_instance, save_search_space, convert_type - +from 
octis.evaluation_metrics.diversity_metrics import TopicDiversity +from octis.evaluation_metrics.coherence_metrics import Coherence class OptimizerEvaluation: @@ -21,8 +22,11 @@ def __init__(self, optimizer, BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - if check_instance(getattr(optimizer.metric, mp)): - dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) + for mp in (metric_parameters): + if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): + pass + elif check_instance(getattr(optimizer.metric, mp)): + dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) # Creation of model hyper-parameters saved in the json file model_parameters = optimizer.model.hyperparameters From 1bc49da6858d5f4d4603cd24212e205a8e06321f Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 10:48:15 -0500 Subject: [PATCH 06/21] deleting 2nd loop --- octis/optimization/optimizer_evaluation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index acbba943..2c2f6a0e 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -22,7 +22,6 @@ def __init__(self, optimizer, BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - for mp in (metric_parameters): if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): pass elif check_instance(getattr(optimizer.metric, mp)): From 464bbe352b9dcc43c7a01fbda24d479171fcd054 Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 10:53:27 -0500 Subject: [PATCH 07/21] correcting intendation --- octis/optimization/optimizer_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index 2c2f6a0e..674398ef 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -22,10 +22,10 @@ def __init__(self, optimizer, BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): - pass - elif check_instance(getattr(optimizer.metric, mp)): - dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) + if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): + pass + elif check_instance(getattr(optimizer.metric, mp)): + dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) # Creation of model hyper-parameters saved in the json file model_parameters = optimizer.model.hyperparameters From f178eeedf890b7ad9e5b7a630628cece8d30242f Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 11:19:41 -0500 Subject: [PATCH 08/21] minor refactoring --- octis/optimization/optimizer_evaluation.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index 674398ef..4b234031 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -2,8 +2,6 @@ import numpy as np import pandas as pd from octis.optimization.optimizer_tool import check_instance, save_search_space, convert_type -from octis.evaluation_metrics.diversity_metrics import TopicDiversity -from octis.evaluation_metrics.coherence_metrics import Coherence class OptimizerEvaluation: @@ -22,9 +20,7 @@ def __init__(self, optimizer, 
BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): - pass - elif check_instance(getattr(optimizer.metric, mp)): + if check_instance(getattr(optimizer.metric, mp)): dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) # Creation of model hyper-parameters saved in the json file From a84efd475d1ed6572fefae43d58ab0d982a1f226 Mon Sep 17 00:00:00 2001 From: Ge Li Date: Mon, 14 Nov 2022 10:08:31 -0500 Subject: [PATCH 09/21] copy and paste preproc code from topic-modeling --- octis/preprocessing/preprocessing.py | 401 +++++++++++++++++++-------- 1 file changed, 283 insertions(+), 118 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index d06149c9..0c995132 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,35 +1,66 @@ -import string -from typing import List, Union +# mypy: ignore-errors +# flake8: noqa +import string +import multiprocessing as mp import spacy from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split -from tqdm.contrib.concurrent import process_map # or thread_map from pathlib import Path from octis.dataset.dataset import Dataset from collections import Counter +import numpy as np +from tqdm import tqdm + """ Maps the language to its corresponding spacy model """ -spacy_model_mapping = {'chinese': 'zh_core_web_sm', 'danish': 'nl_core_news_sm', 'dutch': 'nl_core_news_sm', - 'english': 'en_core_web_sm', 'french': 'fr_core_news_sm', 'german': 'de_core_news_sm', - 'greek': 'el_core_news_sm', 'italian': 'it_core_news_sm', 'japanese': 'ja_core_news_sm', - 'lithuanian': 'lt_core_news_sm', 'norwegian': 'nb_core_news_sm', 'polish': 'pl_core_news_sm', - 'portuguese': 'pt_core_news_sm', 'romanian': 'ro_core_news_sm', 'russian': 'ru_core_news_sm', - 'spanish': 'es_core_news_sm'} +spacy_model_mapping = { + "chinese": "zh_core_web_sm", + "danish": "nl_core_news_sm", + "dutch": "nl_core_news_sm", + "english": "en_core_web_lg", + "french": "fr_core_news_sm", + "german": "de_core_news_sm", + "greek": "el_core_news_sm", + "italian": "it_core_news_sm", + "japanese": "ja_core_news_sm", + "lithuanian": "lt_core_news_sm", + "norwegian": "nb_core_news_sm", + "polish": "pl_core_news_sm", + "portuguese": "pt_core_news_sm", + "romanian": "ro_core_news_sm", + "russian": "ru_core_news_sm", + "spanish": "es_core_news_sm", +} class Preprocessing: - def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_features: int = None, - min_df: float = 0.0, max_df: float = 1.0, remove_punctuation: bool = True, - punctuation: str = string.punctuation, remove_numbers: bool = True, lemmatize: bool = True, - stopword_list: Union[str, List[str]] = None, min_chars: int = 1, min_words_docs: int = 0, - language: str = 'english', split: bool = True, verbose: bool = False, num_processes: int = None, - save_original_indexes=True, remove_stopwords_spacy: bool = True): + def __init__( + self, + lowercase: bool = True, + vocabulary: list[str] = None, + max_features: int = None, + min_df: float = 0.0, + max_df: float = 1.0, + remove_punctuation: bool = True, + punctuation: str = string.punctuation, + remove_numbers: bool = True, + lemmatize: bool = True, + stopword_list: str | list[str] = None, + min_chars: int = 1, + min_words_docs: int = 0, + language: str = "english", + split: bool = True, + verbose: bool = False, + num_processes: int = None, + 
save_original_indexes=True, + remove_stopwords_spacy: bool = True, + entities: list[str] = None, + ): """ init Preprocessing - :param lowercase: if true, words in documents are reduced to lowercase (default: true) :type lowercase: boolean :param vocabulary: the vocabulary of the corpus to preprocess (default: None) @@ -70,6 +101,10 @@ def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_fea :param num_processes: number of processes to run the preprocessing :type num_processes: int :param save_original_indexes: if true, it keeps track of the original indexes of the documents + :param entities: labels of entity types to be remove; accepted entity types are: CARDINAL, DATE, + EVENT, FAC, "GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, + TIME, WORK_OF_ART (currently only implemented for english) + :type entities: list[str] """ self.vocabulary = vocabulary self.lowercase = lowercase @@ -83,20 +118,25 @@ def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_fea self.num_processes = num_processes self.remove_numbers = remove_numbers self.save_original_indexes = save_original_indexes + self.entities = entities if self.lemmatize: lang = spacy_model_mapping[self.language] try: self.spacy_model = spacy.load(lang) except IOError: - raise IOError("Can't find model " + lang + ". Check the data directory or download it using the " - "following command:\npython -m spacy download " + lang) + raise IOError( + "Can't find model " + + lang + + ". Check the data directory or download it using the " + "following command:\npython -m spacy download " + lang + ) self.split = split self.verbose = verbose self.remove_stopwords_spacy = remove_stopwords_spacy - stopwords = [] + stopwords = set() # if stopwords is None then stopwords are not removed if stopword_list is None: self.remove_stopwords_spacy = False @@ -106,24 +146,33 @@ def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_fea stopwords = set(stopword_list) self.remove_stopwords_spacy = False elif self.remove_stopwords_spacy: - assert stopword_list == language + assert stopword_list == language # nosec else: # if remove_stopwords_spacy is false, then use MALLET English stopwords - if 'english' in stopword_list: - stop_word_path = Path(__file__).parent.joinpath('stopwords', 'english.txt') + if "english" in stopword_list: + stop_word_path = Path(__file__).parent.joinpath( + "stopwords", "english.txt" + ) with open(stop_word_path) as fr: - stopwords = [line.strip() for line in fr.readlines()] - assert stopword_list == language + stopwords = set( + [line.strip() for line in fr.readlines()] + ) + assert stopword_list == language # nosec self.stopwords = stopwords self.min_chars = min_chars self.min_doc_words = min_words_docs self.preprocessing_steps = [] - def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False): + def preprocess_dataset( + self, + documents_path=None, + labels_path=None, + multilabel=False, + do_simple_preprocessing=True, + ): """ preprocess the input dataset - :param documents_path: path to the documents file. Each row of the file represents a document :type documents_path: str :param labels_path: path to the documents file. Each row of the file represents a label. 
Its index corresponds @@ -131,22 +180,30 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) :type labels_path: str :param multilabel: if true, a document is supposed to have more than one label (labels are split by whitespace) :type multilabel: bool - + :param do_simple_preprocessing: if true, perform simple_preprocessing_steps (including lemmatization and + :stopwords removal etc). if false, skip simple_preprocessing_steps, which is needed during the merge step. :return octis.dataset.dataset.Dataset """ - docs = [line.strip() for line in open(documents_path, 'r').readlines()] - if self.num_processes is not None: - # with Pool(self.num_processes) as p: - # docs = p.map(self.simple_preprocessing_steps, docs) - docs = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=1) + docs = [line.strip() for line in open(documents_path, "r").readlines()] + if do_simple_preprocessing: + if self.num_processes is not None: + + docs_splits = np.array_split(docs, self.num_processes) + with mp.Pool(self.num_processes) as p: + docs = np.hstack( + p.map(self.simple_preprocessing_steps, docs_splits) + ) + + else: + docs = self.simple_preprocessing_steps(docs) + if self.lowercase: + self.preprocessing_steps.append("lowercase") + if self.remove_punctuation: + self.preprocessing_steps.append("remove_punctuation") + if self.lemmatize: + self.preprocessing_steps.append("lemmatize") else: - docs = self.simple_preprocessing_steps(docs) - if self.lowercase: - self.preprocessing_steps.append("lowercase") - if self.remove_punctuation: - self.preprocessing_steps.append('remove_punctuation') - if self.lemmatize: - self.preprocessing_steps.append('lemmatize') + print("Skip simple processing!") vocabulary = self.filter_words(docs) print("created vocab") @@ -156,9 +213,14 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) final_docs, final_labels, document_indexes = [], [], [] if labels_path is not None: if multilabel: - labels = [line.strip().split() for line in open(labels_path, 'r').readlines()] + labels = [ + line.strip().split() + for line in open(labels_path, "r").readlines() + ] else: - labels = [line.strip() for line in open(labels_path, 'r').readlines()] + labels = [ + line.strip() for line in open(labels_path, "r").readlines() + ] for i, doc, label in zip(range(len(docs)), docs, labels): vocab = set(vocabulary) @@ -168,7 +230,9 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) final_labels.append(label) document_indexes.append(i) - labels_to_remove = set([k for k, v in dict(Counter(final_labels)).items() if v <= 3]) + labels_to_remove = set( + [k for k, v in dict(Counter(final_labels)).items() if v <= 3] + ) if len(labels_to_remove) > 0: docs = final_docs labels = final_labels @@ -186,128 +250,229 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) final_docs.append(new_doc) document_indexes.append(i) - self.preprocessing_steps.append('filter documents with less than ' + str(self.min_doc_words) + " words") + self.preprocessing_steps.append( + "filter documents with less than " + + str(self.min_doc_words) + + " words" + ) if self.verbose: print("words filtering done") - metadata = {"total_documents": len(docs), "vocabulary_length": len(vocabulary), - "preprocessing-info": self.preprocessing_steps - # ,"labels": list(set(final_labels)), "total_labels": len(set(final_labels)) - } + metadata = { + "total_documents": len(docs), + "vocabulary_length": 
len(vocabulary), + "preprocessing-info": self.preprocessing_steps + # ,"labels": list(set(final_labels)), "total_labels": len(set(final_labels)) + } if self.split: if len(final_labels) > 0: train, test, y_train, y_test = train_test_split( - range(len(final_docs)), final_labels, test_size=0.15, random_state=1, shuffle=True)#stratify=final_labels) + range(len(final_docs)), + final_labels, + test_size=0.15, + random_state=1, + shuffle=True, + ) # stratify=final_labels) - train, validation = train_test_split(train, test_size=3 / 17, random_state=1, shuffle=True)# stratify=y_train) + train, validation = train_test_split( + train, test_size=3 / 17, random_state=1, shuffle=True + ) # stratify=y_train) - partitioned_labels = [final_labels[doc] for doc in train + validation + test] - partitioned_corpus = [final_docs[doc] for doc in train + validation + test] - document_indexes = [document_indexes[doc] for doc in train + validation + test] + partitioned_labels = [ + final_labels[doc] for doc in train + validation + test + ] + partitioned_corpus = [ + final_docs[doc] for doc in train + validation + test + ] + document_indexes = [ + document_indexes[doc] for doc in train + validation + test + ] metadata["last-training-doc"] = len(train) metadata["last-validation-doc"] = len(validation) + len(train) if self.save_original_indexes: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, - labels=partitioned_labels, document_indexes=document_indexes) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=partitioned_labels, + document_indexes=document_indexes, + ) else: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, - labels=partitioned_labels) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=partitioned_labels, + ) else: - train, test = train_test_split(range(len(final_docs)), test_size=0.15, random_state=1) - train, validation = train_test_split(train, test_size=3 / 17, random_state=1) + train, test = train_test_split( + range(len(final_docs)), test_size=0.15, random_state=1 + ) + train, validation = train_test_split( + train, test_size=3 / 17, random_state=1 + ) metadata["last-training-doc"] = len(train) metadata["last-validation-doc"] = len(validation) + len(train) - partitioned_corpus = [final_docs[doc] for doc in train + validation + test] - document_indexes = [document_indexes[doc] for doc in train + validation + test] + partitioned_corpus = [ + final_docs[doc] for doc in train + validation + test + ] + document_indexes = [ + document_indexes[doc] for doc in train + validation + test + ] if self.save_original_indexes: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=final_labels, - document_indexes=document_indexes) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + document_indexes=document_indexes, + ) else: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=final_labels, - document_indexes=document_indexes) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + document_indexes=document_indexes, + ) else: if self.save_original_indexes: - return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata, labels=final_labels, - document_indexes=document_indexes) + return Dataset( + final_docs, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + 
document_indexes=document_indexes, + ) else: - return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata, labels=final_labels) + return Dataset( + final_docs, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + ) def filter_words(self, docs): if self.vocabulary is not None: - self.preprocessing_steps.append('filter words by vocabulary') - self.preprocessing_steps.append('filter words with document frequency lower than ' + str(self.min_df) + - ' and higher than ' + str(self.max_df)) - self.preprocessing_steps.append('filter words with less than ' + str(self.min_chars) + " character") - vectorizer = TfidfVectorizer(df_max_freq=self.max_df, df_min_freq=self.min_df, vocabulary=self.vocabulary, - token_pattern=r"(?u)\b\w{" + str(self.min_chars) + ",}\b", - lowercase=self.lowercase, stop_words=self.stopwords) + self.preprocessing_steps.append("filter words by vocabulary") + self.preprocessing_steps.append( + "filter words with document frequency lower than " + + str(self.min_df) + + " and higher than " + + str(self.max_df) + ) + self.preprocessing_steps.append( + "filter words with less than " + + str(self.min_chars) + + " character" + ) + vectorizer = TfidfVectorizer( + df_max_freq=self.max_df, + df_min_freq=self.min_df, + vocabulary=self.vocabulary, + token_pattern=r"(?u)\b\w{" + str(self.min_chars) + ",}\b", + lowercase=self.lowercase, + stop_words=self.stopwords, + ) elif self.max_features is not None: - self.preprocessing_steps.append('filter vocabulary to ' + str(self.max_features) + ' terms') - self.preprocessing_steps.append('filter words with document frequency lower than ' + str(self.min_df) + - ' and higher than ' + str(self.max_df)) - self.preprocessing_steps.append('filter words with less than ' + str(self.min_chars) + " character") + self.preprocessing_steps.append( + "filter vocabulary to " + str(self.max_features) + " terms" + ) + self.preprocessing_steps.append( + "filter words with document frequency lower than " + + str(self.min_df) + + " and higher than " + + str(self.max_df) + ) + self.preprocessing_steps.append( + "filter words with less than " + + str(self.min_chars) + + " character" + ) # we ignore df_max_freq e df_min_freq because self.max_features is not None - vectorizer = TfidfVectorizer(lowercase=self.lowercase, max_features=self.max_features, - stop_words=self.stopwords, - token_pattern=r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b") + vectorizer = TfidfVectorizer( + lowercase=self.lowercase, + max_features=self.max_features, + stop_words=self.stopwords, + token_pattern=r"(?u)\b[\w|\-]{" + + str(self.min_chars) + + r",}\b", + ) else: - #string.punctuation - - self.preprocessing_steps.append('filter words with document frequency lower than ' + str(self.min_df) + - ' and higher than ' + str(self.max_df)) - self.preprocessing_steps.append('filter words with less than ' + str(self.min_chars) + " character") - vectorizer = TfidfVectorizer(max_df=self.max_df, min_df=self.min_df, lowercase=self.lowercase, - token_pattern=r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", - stop_words=self.stopwords) + # string.punctuation + self.preprocessing_steps.append( + "filter words with document frequency lower than " + + str(self.min_df) + + " and higher than " + + str(self.max_df) + ) + self.preprocessing_steps.append( + "filter words with less than " + + str(self.min_chars) + + " character" + ) + vectorizer = TfidfVectorizer( + max_df=self.max_df, + min_df=self.min_df, + lowercase=self.lowercase, + token_pattern=r"(?u)\b[\w|\-]{" + + 
str(self.min_chars) + + r",}\b", + stop_words=self.stopwords, + ) vectorizer.fit_transform(docs) vocabulary = vectorizer.get_feature_names() - return vocabulary - ''' - def _foo(self, docs, vocabulary, labels_path): - final_docs, final_labels = [], [] - if labels_path is not None: - labels = [line.strip() for line in open(labels_path, 'r').readlines()] - for doc, label in zip(docs, labels): - new_doc = [w for w in doc.split() if w in set(vocabulary)] - if len(new_doc) > self.min_doc_words: - final_docs.append(new_doc) - final_labels.append(label) - return final_docs, final_labels - else: - for doc in docs: - new_doc = [w for w in doc.split() if w in set(vocabulary)] - if len(new_doc) > self.min_doc_words: - final_docs.append(new_doc) - return final_docs, [] - ''' + return vocabulary def simple_preprocessing_steps(self, docs): tmp_docs = [] - for d in docs: - new_d = d - new_d = new_d.replace('\n', '') - new_d = new_d.replace('\t', '') + for d in tqdm(docs): + + new_d = " ".join(d.split()) if self.lowercase: new_d = new_d.lower() + + new_d = [token for token in self.spacy_model(new_d)] + if self.entities: + new_d = [ + token + for token in new_d + if token.ent_type_ not in self.entities + ] if self.lemmatize: if self.remove_stopwords_spacy: - new_d = ' '.join([token.lemma_ for token in self.spacy_model(new_d) if not token.is_stop]) + new_d = [ + token.lemma_ for token in new_d if not token.is_stop + ] elif self.stopwords: - new_d = ' '.join( - [token.lemma_ for token in self.spacy_model(new_d) if token.lemma_ not in set(self.stopwords)]) + new_d = [ + token.lemma_ + for token in new_d + if token.lemma_ not in set(self.stopwords) + ] else: - new_d = ' '.join([token.lemma_ for token in self.spacy_model(new_d)]) + new_d = [token.lemma_ for token in self.spacy_model(new_d)] + new_d = " ".join([token_text for token_text in new_d]) if self.remove_punctuation: - new_d = new_d.translate(str.maketrans(self.punctuation, ' ' * len(self.punctuation))) + new_d = new_d.translate( + str.maketrans( + self.punctuation, " " * len(self.punctuation) + ) + ) if self.remove_numbers: - new_d = new_d.translate(str.maketrans("0123456789", ' ' * len("0123456789"))) + new_d = new_d.translate( + str.maketrans("0123456789", " " * len("0123456789")) + ) + + # TODO: figure out which of the preproc steps above + # introduces white spaces between tokens new_d = " ".join(new_d.split()) tmp_docs.append(new_d) return tmp_docs From 3541e1d2704ea27c26a3b5ab1ca697ae425a15a2 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Wed, 16 Nov 2022 10:07:48 -0500 Subject: [PATCH 10/21] add email filter --- octis/preprocessing/preprocessing.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 0c995132..75fca011 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,6 +1,7 @@ # mypy: ignore-errors # flake8: noqa +import re import string import multiprocessing as mp import spacy @@ -16,7 +17,7 @@ """ Maps the language to its corresponding spacy model """ -spacy_model_mapping = { +SPACY_MODEL_MAPPING = { "chinese": "zh_core_web_sm", "danish": "nl_core_news_sm", "dutch": "nl_core_news_sm", @@ -36,6 +37,9 @@ } +EMAIL_PATTERN = "\S*@\S*\s?" 
+ + class Preprocessing: def __init__( self, @@ -47,6 +51,7 @@ def __init__( remove_punctuation: bool = True, punctuation: str = string.punctuation, remove_numbers: bool = True, + remove_emails: bool = True, lemmatize: bool = True, stopword_list: str | list[str] = None, min_chars: int = 1, @@ -57,7 +62,7 @@ def __init__( num_processes: int = None, save_original_indexes=True, remove_stopwords_spacy: bool = True, - entities: list[str] = None, + entities: list[str] = None, ): """ init Preprocessing @@ -79,6 +84,8 @@ def __init__( :type punctuation: str :param remove_numbers: if true, numbers will be removed :type remove_numbers: bool + :param remove_emails: if true, email addresses will be removed + :type remove_emails: bool :param remove_stopwords_spacy: bool , if true use spacy to remove stopwords (default: true) :param lemmatize: if true, words will be lemmatized using a spacy model according to the language that has been set (default: true) @@ -117,11 +124,12 @@ def __init__( self.language = language self.num_processes = num_processes self.remove_numbers = remove_numbers + self.remove_emails = remove_emails self.save_original_indexes = save_original_indexes self.entities = entities if self.lemmatize: - lang = spacy_model_mapping[self.language] + lang = SPACY_MODEL_MAPPING[self.language] try: self.spacy_model = spacy.load(lang) except IOError: @@ -403,8 +411,6 @@ def filter_words(self, docs): else: - # string.punctuation - self.preprocessing_steps.append( "filter words with document frequency lower than " + str(self.min_df) @@ -435,6 +441,10 @@ def simple_preprocessing_steps(self, docs): for d in tqdm(docs): new_d = " ".join(d.split()) + + if self.remove_emails: + new_d = re.sub(EMAIL_PATTERN, "", new_d) + if self.lowercase: new_d = new_d.lower() From 18c8d3f9baff8496ed707f480d5b416f916ffc30 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Wed, 16 Nov 2022 10:16:54 -0500 Subject: [PATCH 11/21] add URL filter --- octis/preprocessing/preprocessing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 75fca011..62ad85eb 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -38,6 +38,7 @@ EMAIL_PATTERN = "\S*@\S*\s?" 
+URL_PATTERN = "http\S+" class Preprocessing: @@ -86,6 +87,8 @@ def __init__( :type remove_numbers: bool :param remove_emails: if true, email addresses will be removed :type remove_emails: bool + :param remove_urls: if true, URLs will be removed + :type remove_urls: bool :param remove_stopwords_spacy: bool , if true use spacy to remove stopwords (default: true) :param lemmatize: if true, words will be lemmatized using a spacy model according to the language that has been set (default: true) @@ -125,6 +128,7 @@ def __init__( self.num_processes = num_processes self.remove_numbers = remove_numbers self.remove_emails = remove_emails + self.remove_urls = remove_urls self.save_original_indexes = save_original_indexes self.entities = entities @@ -444,6 +448,9 @@ def simple_preprocessing_steps(self, docs): if self.remove_emails: new_d = re.sub(EMAIL_PATTERN, "", new_d) + + if self.remove_urls: + new_d = re.sub(URL_PATTERN, "", new_d) if self.lowercase: new_d = new_d.lower() From 14c99a98065d25f4b421bd25364076bdf22d99dd Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Wed, 16 Nov 2022 10:48:12 -0500 Subject: [PATCH 12/21] add missing constructor arg for remove_urls --- octis/preprocessing/preprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 62ad85eb..3f40fcb4 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -53,6 +53,7 @@ def __init__( punctuation: str = string.punctuation, remove_numbers: bool = True, remove_emails: bool = True, + remove_urls: bool = True, lemmatize: bool = True, stopword_list: str | list[str] = None, min_chars: int = 1, From f97d4d2b6aca4993eee56c70e8add6c0a2f75571 Mon Sep 17 00:00:00 2001 From: Ge Li Date: Thu, 17 Nov 2022 13:09:12 -0500 Subject: [PATCH 13/21] fix bug with spacy --- octis/preprocessing/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 3f40fcb4..e40920b4 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -475,9 +475,9 @@ def simple_preprocessing_steps(self, docs): if token.lemma_ not in set(self.stopwords) ] else: - new_d = [token.lemma_ for token in self.spacy_model(new_d)] + new_d = [token.lemma_ for token in new_d] - new_d = " ".join([token_text for token_text in new_d]) + new_d = " ".join([str(token_text) for token_text in new_d]) if self.remove_punctuation: new_d = new_d.translate( str.maketrans( From a99d445e53745587d1d6608deb2d95ccab97d9ea Mon Sep 17 00:00:00 2001 From: Ge Li Date: Mon, 21 Nov 2022 16:06:00 -0500 Subject: [PATCH 14/21] Update requirements.txt --- requirements.txt | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 694fba13..85ca73fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ -gensim>=4.0.0 -nltk -pandas -spacy +gensim==4.2.0 +nltk==3.7 +pandas==1.5.0 +spacy==3.4.2 scikit-learn==0.24.2 -scikit-optimize>=0.8.1 -matplotlib -torch -numpy>=1.19.1 -libsvm -flask +scikit-optimize==0.9.0 +matplotlib==3.6.1 +torch==1.12.1 +numpy==1.23.4 +libsvm==3.23.0.4 +flask==2.2.2 sentence_transformers -requests -tomotopy +requests==2.28.1 +tomotopy==0.12.3 From 39652ecc59fd06cfbb47521c9a6370bdd0cf136f Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Tue, 22 Nov 2022 08:53:25 -0500 Subject: [PATCH 15/21] add "sentence_transformers" version to 
requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 85ca73fa..4458ba81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,6 @@ torch==1.12.1 numpy==1.23.4 libsvm==3.23.0.4 flask==2.2.2 -sentence_transformers +sentence_transformers==2.2.2 requests==2.28.1 tomotopy==0.12.3 From e941956328e4afcaa33481e6367f7f2df8f6de94 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Tue, 22 Nov 2022 09:22:28 -0500 Subject: [PATCH 16/21] freeze requirements versions --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4458ba81..cc418a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ gensim==4.2.0 nltk==3.7 -pandas==1.5.0 -spacy==3.4.2 +pandas==1.5.1 +spacy==3.4.3 scikit-learn==0.24.2 scikit-optimize==0.9.0 -matplotlib==3.6.1 -torch==1.12.1 -numpy==1.23.4 +matplotlib==3.6.2 +torch==1.13.0 +numpy==1.23.5 libsvm==3.23.0.4 flask==2.2.2 sentence_transformers==2.2.2 From 2d02bef045d2e964b2f7a811669714f5a43d9873 Mon Sep 17 00:00:00 2001 From: Ge Li Date: Tue, 22 Nov 2022 09:58:52 -0500 Subject: [PATCH 17/21] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index cc418a4a..4e56a7c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ gensim==4.2.0 nltk==3.7 pandas==1.5.1 -spacy==3.4.3 +spacy==3.4.2 scikit-learn==0.24.2 scikit-optimize==0.9.0 -matplotlib==3.6.2 -torch==1.13.0 -numpy==1.23.5 +matplotlib==3.6.1 +torch==1.12.1 +numpy==1.23.4 libsvm==3.23.0.4 flask==2.2.2 sentence_transformers==2.2.2 From 9c119c44959999a0635fd62c8010f863999c7609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gazaille?= Date: Tue, 22 Nov 2022 10:10:36 -0500 Subject: [PATCH 18/21] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4e56a7c3..cc418a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ gensim==4.2.0 nltk==3.7 pandas==1.5.1 -spacy==3.4.2 +spacy==3.4.3 scikit-learn==0.24.2 scikit-optimize==0.9.0 -matplotlib==3.6.1 -torch==1.12.1 -numpy==1.23.4 +matplotlib==3.6.2 +torch==1.13.0 +numpy==1.23.5 libsvm==3.23.0.4 flask==2.2.2 sentence_transformers==2.2.2 From b002a15f13e827fb9d185a0d2b011ef4acc4c621 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Tue, 22 Nov 2022 11:58:27 -0500 Subject: [PATCH 19/21] fix typing --- octis/preprocessing/preprocessing.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index e40920b4..6639edca 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,17 +1,18 @@ # mypy: ignore-errors # flake8: noqa +from collections import Counter +import multiprocessing as mp +from pathlib import Path import re import string -import multiprocessing as mp -import spacy +from typing import Union from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split -from pathlib import Path -from octis.dataset.dataset import Dataset -from collections import Counter import numpy as np +import spacy from tqdm import tqdm +from octis.dataset.dataset import Dataset """ @@ -55,7 +56,7 @@ def __init__( remove_emails: bool = True, remove_urls: bool = True, lemmatize: 
bool = True,
-        stopword_list: str | list[str] = None,
+        stopword_list: Union[str, list[str]] = None,
         min_chars: int = 1,
         min_words_docs: int = 0,
         language: str = "english",

From d05af05ac3517c4ef23844b2168e58b1cf87f5d9 Mon Sep 17 00:00:00 2001
From: stepgazaille
Date: Fri, 2 Dec 2022 13:37:10 -0500
Subject: [PATCH 20/21] bugfix vocab with terms not in docs

---
 octis/preprocessing/preprocessing.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py
index 6639edca..2c3ef3b4 100644
--- a/octis/preprocessing/preprocessing.py
+++ b/octis/preprocessing/preprocessing.py
@@ -4,6 +4,7 @@
 from collections import Counter
 import multiprocessing as mp
 from pathlib import Path
+from gensim.corpora.dictionary import Dictionary
 import re
 import string
 from typing import Union
@@ -221,9 +222,7 @@ def preprocess_dataset(
         vocabulary = self.filter_words(docs)
         print("created vocab")
-        # with Pool(self.num_processes) as p:
-        #     final_docs, final_labels = p.starmap(self._foo, product(docs, vocabulary, labels_path, repeat=2))
-        print(len(vocabulary))
+
         final_docs, final_labels, document_indexes = [], [], []
         if labels_path is not None:
             if multilabel:
@@ -271,11 +270,15 @@ def preprocess_dataset(
             )
         if self.verbose:
             print("words filtering done")
+
+        # Make sure vocabulary is still consistent with the content of docs:
+        dictionary = Dictionary(final_docs, prune_at=None)
+        vocabulary = sorted(dictionary.token2id.keys())
+
         metadata = {
             "total_documents": len(docs),
             "vocabulary_length": len(vocabulary),
             "preprocessing-info": self.preprocessing_steps
-            # ,"labels": list(set(final_labels)), "total_labels": len(set(final_labels))
         }
         if self.split:
             if len(final_labels) > 0:

From 78805d646732eb8c2494b338bf727c1552566ffa Mon Sep 17 00:00:00 2001
From: Ge Li
Date: Fri, 18 Aug 2023 10:36:19 -0400
Subject: [PATCH 21/21] fix TI implementation

---
 octis/evaluation_metrics/interpretability_metrics.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py
index dfa30c26..ecf0631f 100644
--- a/octis/evaluation_metrics/interpretability_metrics.py
+++ b/octis/evaluation_metrics/interpretability_metrics.py
@@ -1,3 +1,4 @@
+import numpy as np
 from octis.evaluation_metrics.metrics import AbstractMetric
 from octis.evaluation_metrics.diversity_metrics import TopicDiversity
 from octis.evaluation_metrics.coherence_metrics import Coherence,_load_default_texts
@@ -27,7 +28,7 @@ def __init__(
         self.topk = topk
         self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi')
         self.topic_diversity = TopicDiversity(topk=topk)

     def score(self, model_output: dict) -> float: # noqa
-        # 1 is added to convert npmi output into a positive scale
-        return (1+self.c_npmi.score(
-            model_output
-        )) * self.topic_diversity.score(model_output)
+        # exp(tc*td)
+        return np.exp(
+            self.c_npmi.score(model_output) * self.topic_diversity.score(model_output)
+        )
\ No newline at end of file
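
For reference, a minimal usage sketch of TopicInterpretability as it stands after PATCH 21, where the score is exp(NPMI coherence * topic diversity). The toy corpus and topic lists below are invented purely for illustration; the only other assumption is the usual OCTIS convention that model_output["topics"] holds one list of top words per topic.

    from octis.evaluation_metrics.interpretability_metrics import TopicInterpretability

    # Tokenized corpus: one list of tokens per document (made-up example data).
    texts = [
        ["topic", "model", "coherence", "score"],
        ["corpus", "vocabulary", "tokens", "preprocessing"],
        ["coherence", "diversity", "score", "metric"],
    ]

    # Top words per topic, e.g. the "topics" entry of a trained OCTIS model's output.
    model_output = {
        "topics": [
            ["topic", "model", "coherence", "score"],
            ["corpus", "vocabulary", "tokens", "preprocessing"],
        ]
    }

    metric = TopicInterpretability(texts=texts, topk=4)
    print(metric.score(model_output))  # exp(npmi * diversity); higher is better

Multiplying coherence and diversity inside the exponential keeps the score positive and favours topic sets that are both coherent and non-redundant, which is why PATCH 21 moves away from the earlier (1 + NPMI) * diversity form.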