From 5f3e7ff81207aab009991af6f6f4e000341f1cc2 Mon Sep 17 00:00:00 2001
From: Adityakolluru7
Date: Thu, 3 Nov 2022 14:30:52 -0400
Subject: [PATCH 01/21] added topic interpretability to eval metrics

---
 .../interpretability_metrics.py              | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 octis/evaluation_metrics/interpretability_metrics.py

diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py
new file mode 100644
index 00000000..ad91f1d9
--- /dev/null
+++ b/octis/evaluation_metrics/interpretability_metrics.py
@@ -0,0 +1,45 @@
+from octis.evaluation_metrics.metrics import AbstractMetric
+from gensim.corpora.dictionary import Dictionary
+from octis.evaluation_metrics.diversity_metrics import TopicDiversity
+from octis.evaluation_metrics.coherence_metrics import Coherence
+
+
+class TopicInterpretability(AbstractMetric):
+    def __init__(
+        self, # noqa
+        texts: str = None,
+        topk: int = 10,
+        coherence_measure: str = "c_npmi",
+    ) -> None:
+        """
+        Initialize metric
+        Parameters
+        ----------
+        texts : list of documents (list of lists of strings)
+        topk : how many most likely words to consider in
+        the evaluation
+        measure : (default 'c_npmi') measure to use.
+        """
+        super().__init__()
+        if texts is None:
+            raise Exception("There are no texts in the document")
+        else:
+            self._texts = texts
+        self._dictionary = Dictionary(self._texts)
+        self.topk = topk
+        self.coherence_measure = coherence_measure
+        c_npmi = Coherence(texts, topk=topk, measure=coherence_measure)
+        topic_diversity = TopicDiversity(topk=topk)
+        self.c_npmi = c_npmi
+        self.topic_diversity = topic_diversity
+
+    def score(self, model_output: dict) -> float: # noqa
+
+        if self.c_npmi.score(model_output) != 0:
+            # 1 is added to convert npmi output into a positive scale
+            return (1+self.c_npmi.score(
+                model_output
+            )) * self.topic_diversity.score(model_output)
+
+        elif self.topic_diversity.score(model_output) == 0:
+            return 0

From 59b816e1e5b674c127a2a6eefa6812a95595eb81 Mon Sep 17 00:00:00 2001
From: Adityakolluru7
Date: Mon, 7 Nov 2022 11:20:00 -0500
Subject: [PATCH 02/21] minor refactoring

---
 .../interpretability_metrics.py              | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py
index ad91f1d9..10be8994 100644
--- a/octis/evaluation_metrics/interpretability_metrics.py
+++ b/octis/evaluation_metrics/interpretability_metrics.py
@@ -1,15 +1,14 @@
 from octis.evaluation_metrics.metrics import AbstractMetric
 from gensim.corpora.dictionary import Dictionary
 from octis.evaluation_metrics.diversity_metrics import TopicDiversity
-from octis.evaluation_metrics.coherence_metrics import Coherence
+from octis.evaluation_metrics.coherence_metrics import Coherence,_load_default_texts


 class TopicInterpretability(AbstractMetric):
     def __init__(
         self, # noqa
         texts: str = None,
-        topk: int = 10,
-        coherence_measure: str = "c_npmi",
+        topk: int = 10
     ) -> None:
         """
         Initialize metric
         Parameters
         ----------
         texts : list of documents (list of lists of strings)
         topk : how many most likely words to consider in
         the evaluation
         measure : (default 'c_npmi') measure to use.
         """
         super().__init__()
         if texts is None:
-            raise Exception("There are no texts in the document")
+            self._texts = _load_default_texts()
         else:
             self._texts = texts
         self._dictionary = Dictionary(self._texts)
         self.topk = topk
-        self.coherence_measure = coherence_measure
-        c_npmi = Coherence(texts, topk=topk, measure=coherence_measure)
-        topic_diversity = TopicDiversity(topk=topk)
-        self.c_npmi = c_npmi
-        self.topic_diversity
= topic_diversity + self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi') + self.topic_diversity = TopicDiversity(topk=topk) def score(self, model_output: dict) -> float: # noqa - - if self.c_npmi.score(model_output) != 0: # 1 is added to convert npmi output into a positive scale return (1+self.c_npmi.score( model_output )) * self.topic_diversity.score(model_output) - - elif self.topic_diversity.score(model_output) == 0: - return 0 From c7af4ce59503787f863a38791e405dc05d88fcda Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Mon, 7 Nov 2022 14:02:49 -0500 Subject: [PATCH 03/21] some more minor refactoring --- octis/evaluation_metrics/interpretability_metrics.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py index 10be8994..ecb2068b 100644 --- a/octis/evaluation_metrics/interpretability_metrics.py +++ b/octis/evaluation_metrics/interpretability_metrics.py @@ -17,15 +17,12 @@ def __init__( texts : list of documents (list of lists of strings) topk : how many most likely words to consider in the evaluation - measure : (default 'c_npmi') measure to use. """ super().__init__() if texts is None: self._texts = _load_default_texts() else: self._texts = texts - self._dictionary = Dictionary(self._texts) - self.topk = topk self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi') self.topic_diversity = TopicDiversity(topk=topk) From 3c06213fbe7856b266eb496c0e4d05efb0b6d8e9 Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Mon, 7 Nov 2022 15:22:08 -0500 Subject: [PATCH 04/21] minor refactoring --- octis/evaluation_metrics/interpretability_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py index ecb2068b..16d84b09 100644 --- a/octis/evaluation_metrics/interpretability_metrics.py +++ b/octis/evaluation_metrics/interpretability_metrics.py @@ -1,5 +1,4 @@ from octis.evaluation_metrics.metrics import AbstractMetric -from gensim.corpora.dictionary import Dictionary from octis.evaluation_metrics.diversity_metrics import TopicDiversity from octis.evaluation_metrics.coherence_metrics import Coherence,_load_default_texts From 66d46a0789c85caceec7b57d5c1fbc0229b2c125 Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 10:28:25 -0500 Subject: [PATCH 05/21] logging topk in topic_interpretability --- octis/evaluation_metrics/interpretability_metrics.py | 1 + octis/optimization/optimizer_evaluation.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py index 16d84b09..dfa30c26 100644 --- a/octis/evaluation_metrics/interpretability_metrics.py +++ b/octis/evaluation_metrics/interpretability_metrics.py @@ -22,6 +22,7 @@ def __init__( self._texts = _load_default_texts() else: self._texts = texts + self.topk=topk self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi') self.topic_diversity = TopicDiversity(topk=topk) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index a7b2130a..acbba943 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -2,7 +2,8 @@ import numpy as np import pandas as pd from octis.optimization.optimizer_tool import check_instance, save_search_space, convert_type - +from 
octis.evaluation_metrics.diversity_metrics import TopicDiversity +from octis.evaluation_metrics.coherence_metrics import Coherence class OptimizerEvaluation: @@ -21,8 +22,11 @@ def __init__(self, optimizer, BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - if check_instance(getattr(optimizer.metric, mp)): - dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) + for mp in (metric_parameters): + if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): + pass + elif check_instance(getattr(optimizer.metric, mp)): + dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) # Creation of model hyper-parameters saved in the json file model_parameters = optimizer.model.hyperparameters From 1bc49da6858d5f4d4603cd24212e205a8e06321f Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 10:48:15 -0500 Subject: [PATCH 06/21] deleting 2nd loop --- octis/optimization/optimizer_evaluation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index acbba943..2c2f6a0e 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -22,7 +22,6 @@ def __init__(self, optimizer, BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - for mp in (metric_parameters): if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): pass elif check_instance(getattr(optimizer.metric, mp)): From 464bbe352b9dcc43c7a01fbda24d479171fcd054 Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 10:53:27 -0500 Subject: [PATCH 07/21] correcting intendation --- octis/optimization/optimizer_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index 2c2f6a0e..674398ef 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -22,10 +22,10 @@ def __init__(self, optimizer, BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): - pass - elif check_instance(getattr(optimizer.metric, mp)): - dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) + if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): + pass + elif check_instance(getattr(optimizer.metric, mp)): + dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) # Creation of model hyper-parameters saved in the json file model_parameters = optimizer.model.hyperparameters From f178eeedf890b7ad9e5b7a630628cece8d30242f Mon Sep 17 00:00:00 2001 From: Adityakolluru7 Date: Fri, 11 Nov 2022 11:19:41 -0500 Subject: [PATCH 08/21] minor refactoring --- octis/optimization/optimizer_evaluation.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/octis/optimization/optimizer_evaluation.py b/octis/optimization/optimizer_evaluation.py index 674398ef..4b234031 100644 --- a/octis/optimization/optimizer_evaluation.py +++ b/octis/optimization/optimizer_evaluation.py @@ -2,8 +2,6 @@ import numpy as np import pandas as pd from octis.optimization.optimizer_tool import check_instance, save_search_space, convert_type -from octis.evaluation_metrics.diversity_metrics import TopicDiversity -from octis.evaluation_metrics.coherence_metrics import Coherence class OptimizerEvaluation: @@ -22,9 +20,7 @@ def __init__(self, optimizer, 
BO_results): dict_metric_parameters = dict() for mp in metric_parameters: - if isinstance(getattr(optimizer.metric,mp),(Coherence,TopicDiversity)): - pass - elif check_instance(getattr(optimizer.metric, mp)): + if check_instance(getattr(optimizer.metric, mp)): dict_metric_parameters.update({mp: getattr(optimizer.metric, mp)}) # Creation of model hyper-parameters saved in the json file From a84efd475d1ed6572fefae43d58ab0d982a1f226 Mon Sep 17 00:00:00 2001 From: Ge Li Date: Mon, 14 Nov 2022 10:08:31 -0500 Subject: [PATCH 09/21] copy and paste preproc code from topic-modeling --- octis/preprocessing/preprocessing.py | 401 +++++++++++++++++++-------- 1 file changed, 283 insertions(+), 118 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index d06149c9..0c995132 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,35 +1,66 @@ -import string -from typing import List, Union +# mypy: ignore-errors +# flake8: noqa +import string +import multiprocessing as mp import spacy from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split -from tqdm.contrib.concurrent import process_map # or thread_map from pathlib import Path from octis.dataset.dataset import Dataset from collections import Counter +import numpy as np +from tqdm import tqdm + """ Maps the language to its corresponding spacy model """ -spacy_model_mapping = {'chinese': 'zh_core_web_sm', 'danish': 'nl_core_news_sm', 'dutch': 'nl_core_news_sm', - 'english': 'en_core_web_sm', 'french': 'fr_core_news_sm', 'german': 'de_core_news_sm', - 'greek': 'el_core_news_sm', 'italian': 'it_core_news_sm', 'japanese': 'ja_core_news_sm', - 'lithuanian': 'lt_core_news_sm', 'norwegian': 'nb_core_news_sm', 'polish': 'pl_core_news_sm', - 'portuguese': 'pt_core_news_sm', 'romanian': 'ro_core_news_sm', 'russian': 'ru_core_news_sm', - 'spanish': 'es_core_news_sm'} +spacy_model_mapping = { + "chinese": "zh_core_web_sm", + "danish": "nl_core_news_sm", + "dutch": "nl_core_news_sm", + "english": "en_core_web_lg", + "french": "fr_core_news_sm", + "german": "de_core_news_sm", + "greek": "el_core_news_sm", + "italian": "it_core_news_sm", + "japanese": "ja_core_news_sm", + "lithuanian": "lt_core_news_sm", + "norwegian": "nb_core_news_sm", + "polish": "pl_core_news_sm", + "portuguese": "pt_core_news_sm", + "romanian": "ro_core_news_sm", + "russian": "ru_core_news_sm", + "spanish": "es_core_news_sm", +} class Preprocessing: - def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_features: int = None, - min_df: float = 0.0, max_df: float = 1.0, remove_punctuation: bool = True, - punctuation: str = string.punctuation, remove_numbers: bool = True, lemmatize: bool = True, - stopword_list: Union[str, List[str]] = None, min_chars: int = 1, min_words_docs: int = 0, - language: str = 'english', split: bool = True, verbose: bool = False, num_processes: int = None, - save_original_indexes=True, remove_stopwords_spacy: bool = True): + def __init__( + self, + lowercase: bool = True, + vocabulary: list[str] = None, + max_features: int = None, + min_df: float = 0.0, + max_df: float = 1.0, + remove_punctuation: bool = True, + punctuation: str = string.punctuation, + remove_numbers: bool = True, + lemmatize: bool = True, + stopword_list: str | list[str] = None, + min_chars: int = 1, + min_words_docs: int = 0, + language: str = "english", + split: bool = True, + verbose: bool = False, + num_processes: int = None, + 
save_original_indexes=True, + remove_stopwords_spacy: bool = True, + entities: list[str] = None, + ): """ init Preprocessing - :param lowercase: if true, words in documents are reduced to lowercase (default: true) :type lowercase: boolean :param vocabulary: the vocabulary of the corpus to preprocess (default: None) @@ -70,6 +101,10 @@ def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_fea :param num_processes: number of processes to run the preprocessing :type num_processes: int :param save_original_indexes: if true, it keeps track of the original indexes of the documents + :param entities: labels of entity types to be remove; accepted entity types are: CARDINAL, DATE, + EVENT, FAC, "GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, + TIME, WORK_OF_ART (currently only implemented for english) + :type entities: list[str] """ self.vocabulary = vocabulary self.lowercase = lowercase @@ -83,20 +118,25 @@ def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_fea self.num_processes = num_processes self.remove_numbers = remove_numbers self.save_original_indexes = save_original_indexes + self.entities = entities if self.lemmatize: lang = spacy_model_mapping[self.language] try: self.spacy_model = spacy.load(lang) except IOError: - raise IOError("Can't find model " + lang + ". Check the data directory or download it using the " - "following command:\npython -m spacy download " + lang) + raise IOError( + "Can't find model " + + lang + + ". Check the data directory or download it using the " + "following command:\npython -m spacy download " + lang + ) self.split = split self.verbose = verbose self.remove_stopwords_spacy = remove_stopwords_spacy - stopwords = [] + stopwords = set() # if stopwords is None then stopwords are not removed if stopword_list is None: self.remove_stopwords_spacy = False @@ -106,24 +146,33 @@ def __init__(self, lowercase: bool = True, vocabulary: List[str] = None, max_fea stopwords = set(stopword_list) self.remove_stopwords_spacy = False elif self.remove_stopwords_spacy: - assert stopword_list == language + assert stopword_list == language # nosec else: # if remove_stopwords_spacy is false, then use MALLET English stopwords - if 'english' in stopword_list: - stop_word_path = Path(__file__).parent.joinpath('stopwords', 'english.txt') + if "english" in stopword_list: + stop_word_path = Path(__file__).parent.joinpath( + "stopwords", "english.txt" + ) with open(stop_word_path) as fr: - stopwords = [line.strip() for line in fr.readlines()] - assert stopword_list == language + stopwords = set( + [line.strip() for line in fr.readlines()] + ) + assert stopword_list == language # nosec self.stopwords = stopwords self.min_chars = min_chars self.min_doc_words = min_words_docs self.preprocessing_steps = [] - def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False): + def preprocess_dataset( + self, + documents_path=None, + labels_path=None, + multilabel=False, + do_simple_preprocessing=True, + ): """ preprocess the input dataset - :param documents_path: path to the documents file. Each row of the file represents a document :type documents_path: str :param labels_path: path to the documents file. Each row of the file represents a label. 
Its index corresponds @@ -131,22 +180,30 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) :type labels_path: str :param multilabel: if true, a document is supposed to have more than one label (labels are split by whitespace) :type multilabel: bool - + :param do_simple_preprocessing: if true, perform simple_preprocessing_steps (including lemmatization and + :stopwords removal etc). if false, skip simple_preprocessing_steps, which is needed during the merge step. :return octis.dataset.dataset.Dataset """ - docs = [line.strip() for line in open(documents_path, 'r').readlines()] - if self.num_processes is not None: - # with Pool(self.num_processes) as p: - # docs = p.map(self.simple_preprocessing_steps, docs) - docs = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=1) + docs = [line.strip() for line in open(documents_path, "r").readlines()] + if do_simple_preprocessing: + if self.num_processes is not None: + + docs_splits = np.array_split(docs, self.num_processes) + with mp.Pool(self.num_processes) as p: + docs = np.hstack( + p.map(self.simple_preprocessing_steps, docs_splits) + ) + + else: + docs = self.simple_preprocessing_steps(docs) + if self.lowercase: + self.preprocessing_steps.append("lowercase") + if self.remove_punctuation: + self.preprocessing_steps.append("remove_punctuation") + if self.lemmatize: + self.preprocessing_steps.append("lemmatize") else: - docs = self.simple_preprocessing_steps(docs) - if self.lowercase: - self.preprocessing_steps.append("lowercase") - if self.remove_punctuation: - self.preprocessing_steps.append('remove_punctuation') - if self.lemmatize: - self.preprocessing_steps.append('lemmatize') + print("Skip simple processing!") vocabulary = self.filter_words(docs) print("created vocab") @@ -156,9 +213,14 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) final_docs, final_labels, document_indexes = [], [], [] if labels_path is not None: if multilabel: - labels = [line.strip().split() for line in open(labels_path, 'r').readlines()] + labels = [ + line.strip().split() + for line in open(labels_path, "r").readlines() + ] else: - labels = [line.strip() for line in open(labels_path, 'r').readlines()] + labels = [ + line.strip() for line in open(labels_path, "r").readlines() + ] for i, doc, label in zip(range(len(docs)), docs, labels): vocab = set(vocabulary) @@ -168,7 +230,9 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) final_labels.append(label) document_indexes.append(i) - labels_to_remove = set([k for k, v in dict(Counter(final_labels)).items() if v <= 3]) + labels_to_remove = set( + [k for k, v in dict(Counter(final_labels)).items() if v <= 3] + ) if len(labels_to_remove) > 0: docs = final_docs labels = final_labels @@ -186,128 +250,229 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) final_docs.append(new_doc) document_indexes.append(i) - self.preprocessing_steps.append('filter documents with less than ' + str(self.min_doc_words) + " words") + self.preprocessing_steps.append( + "filter documents with less than " + + str(self.min_doc_words) + + " words" + ) if self.verbose: print("words filtering done") - metadata = {"total_documents": len(docs), "vocabulary_length": len(vocabulary), - "preprocessing-info": self.preprocessing_steps - # ,"labels": list(set(final_labels)), "total_labels": len(set(final_labels)) - } + metadata = { + "total_documents": len(docs), + "vocabulary_length": 
len(vocabulary), + "preprocessing-info": self.preprocessing_steps + # ,"labels": list(set(final_labels)), "total_labels": len(set(final_labels)) + } if self.split: if len(final_labels) > 0: train, test, y_train, y_test = train_test_split( - range(len(final_docs)), final_labels, test_size=0.15, random_state=1, shuffle=True)#stratify=final_labels) + range(len(final_docs)), + final_labels, + test_size=0.15, + random_state=1, + shuffle=True, + ) # stratify=final_labels) - train, validation = train_test_split(train, test_size=3 / 17, random_state=1, shuffle=True)# stratify=y_train) + train, validation = train_test_split( + train, test_size=3 / 17, random_state=1, shuffle=True + ) # stratify=y_train) - partitioned_labels = [final_labels[doc] for doc in train + validation + test] - partitioned_corpus = [final_docs[doc] for doc in train + validation + test] - document_indexes = [document_indexes[doc] for doc in train + validation + test] + partitioned_labels = [ + final_labels[doc] for doc in train + validation + test + ] + partitioned_corpus = [ + final_docs[doc] for doc in train + validation + test + ] + document_indexes = [ + document_indexes[doc] for doc in train + validation + test + ] metadata["last-training-doc"] = len(train) metadata["last-validation-doc"] = len(validation) + len(train) if self.save_original_indexes: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, - labels=partitioned_labels, document_indexes=document_indexes) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=partitioned_labels, + document_indexes=document_indexes, + ) else: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, - labels=partitioned_labels) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=partitioned_labels, + ) else: - train, test = train_test_split(range(len(final_docs)), test_size=0.15, random_state=1) - train, validation = train_test_split(train, test_size=3 / 17, random_state=1) + train, test = train_test_split( + range(len(final_docs)), test_size=0.15, random_state=1 + ) + train, validation = train_test_split( + train, test_size=3 / 17, random_state=1 + ) metadata["last-training-doc"] = len(train) metadata["last-validation-doc"] = len(validation) + len(train) - partitioned_corpus = [final_docs[doc] for doc in train + validation + test] - document_indexes = [document_indexes[doc] for doc in train + validation + test] + partitioned_corpus = [ + final_docs[doc] for doc in train + validation + test + ] + document_indexes = [ + document_indexes[doc] for doc in train + validation + test + ] if self.save_original_indexes: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=final_labels, - document_indexes=document_indexes) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + document_indexes=document_indexes, + ) else: - return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=final_labels, - document_indexes=document_indexes) + return Dataset( + partitioned_corpus, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + document_indexes=document_indexes, + ) else: if self.save_original_indexes: - return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata, labels=final_labels, - document_indexes=document_indexes) + return Dataset( + final_docs, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + 
document_indexes=document_indexes, + ) else: - return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata, labels=final_labels) + return Dataset( + final_docs, + vocabulary=vocabulary, + metadata=metadata, + labels=final_labels, + ) def filter_words(self, docs): if self.vocabulary is not None: - self.preprocessing_steps.append('filter words by vocabulary') - self.preprocessing_steps.append('filter words with document frequency lower than ' + str(self.min_df) + - ' and higher than ' + str(self.max_df)) - self.preprocessing_steps.append('filter words with less than ' + str(self.min_chars) + " character") - vectorizer = TfidfVectorizer(df_max_freq=self.max_df, df_min_freq=self.min_df, vocabulary=self.vocabulary, - token_pattern=r"(?u)\b\w{" + str(self.min_chars) + ",}\b", - lowercase=self.lowercase, stop_words=self.stopwords) + self.preprocessing_steps.append("filter words by vocabulary") + self.preprocessing_steps.append( + "filter words with document frequency lower than " + + str(self.min_df) + + " and higher than " + + str(self.max_df) + ) + self.preprocessing_steps.append( + "filter words with less than " + + str(self.min_chars) + + " character" + ) + vectorizer = TfidfVectorizer( + df_max_freq=self.max_df, + df_min_freq=self.min_df, + vocabulary=self.vocabulary, + token_pattern=r"(?u)\b\w{" + str(self.min_chars) + ",}\b", + lowercase=self.lowercase, + stop_words=self.stopwords, + ) elif self.max_features is not None: - self.preprocessing_steps.append('filter vocabulary to ' + str(self.max_features) + ' terms') - self.preprocessing_steps.append('filter words with document frequency lower than ' + str(self.min_df) + - ' and higher than ' + str(self.max_df)) - self.preprocessing_steps.append('filter words with less than ' + str(self.min_chars) + " character") + self.preprocessing_steps.append( + "filter vocabulary to " + str(self.max_features) + " terms" + ) + self.preprocessing_steps.append( + "filter words with document frequency lower than " + + str(self.min_df) + + " and higher than " + + str(self.max_df) + ) + self.preprocessing_steps.append( + "filter words with less than " + + str(self.min_chars) + + " character" + ) # we ignore df_max_freq e df_min_freq because self.max_features is not None - vectorizer = TfidfVectorizer(lowercase=self.lowercase, max_features=self.max_features, - stop_words=self.stopwords, - token_pattern=r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b") + vectorizer = TfidfVectorizer( + lowercase=self.lowercase, + max_features=self.max_features, + stop_words=self.stopwords, + token_pattern=r"(?u)\b[\w|\-]{" + + str(self.min_chars) + + r",}\b", + ) else: - #string.punctuation - - self.preprocessing_steps.append('filter words with document frequency lower than ' + str(self.min_df) + - ' and higher than ' + str(self.max_df)) - self.preprocessing_steps.append('filter words with less than ' + str(self.min_chars) + " character") - vectorizer = TfidfVectorizer(max_df=self.max_df, min_df=self.min_df, lowercase=self.lowercase, - token_pattern=r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", - stop_words=self.stopwords) + # string.punctuation + self.preprocessing_steps.append( + "filter words with document frequency lower than " + + str(self.min_df) + + " and higher than " + + str(self.max_df) + ) + self.preprocessing_steps.append( + "filter words with less than " + + str(self.min_chars) + + " character" + ) + vectorizer = TfidfVectorizer( + max_df=self.max_df, + min_df=self.min_df, + lowercase=self.lowercase, + token_pattern=r"(?u)\b[\w|\-]{" + + 
str(self.min_chars) + + r",}\b", + stop_words=self.stopwords, + ) vectorizer.fit_transform(docs) vocabulary = vectorizer.get_feature_names() - return vocabulary - ''' - def _foo(self, docs, vocabulary, labels_path): - final_docs, final_labels = [], [] - if labels_path is not None: - labels = [line.strip() for line in open(labels_path, 'r').readlines()] - for doc, label in zip(docs, labels): - new_doc = [w for w in doc.split() if w in set(vocabulary)] - if len(new_doc) > self.min_doc_words: - final_docs.append(new_doc) - final_labels.append(label) - return final_docs, final_labels - else: - for doc in docs: - new_doc = [w for w in doc.split() if w in set(vocabulary)] - if len(new_doc) > self.min_doc_words: - final_docs.append(new_doc) - return final_docs, [] - ''' + return vocabulary def simple_preprocessing_steps(self, docs): tmp_docs = [] - for d in docs: - new_d = d - new_d = new_d.replace('\n', '') - new_d = new_d.replace('\t', '') + for d in tqdm(docs): + + new_d = " ".join(d.split()) if self.lowercase: new_d = new_d.lower() + + new_d = [token for token in self.spacy_model(new_d)] + if self.entities: + new_d = [ + token + for token in new_d + if token.ent_type_ not in self.entities + ] if self.lemmatize: if self.remove_stopwords_spacy: - new_d = ' '.join([token.lemma_ for token in self.spacy_model(new_d) if not token.is_stop]) + new_d = [ + token.lemma_ for token in new_d if not token.is_stop + ] elif self.stopwords: - new_d = ' '.join( - [token.lemma_ for token in self.spacy_model(new_d) if token.lemma_ not in set(self.stopwords)]) + new_d = [ + token.lemma_ + for token in new_d + if token.lemma_ not in set(self.stopwords) + ] else: - new_d = ' '.join([token.lemma_ for token in self.spacy_model(new_d)]) + new_d = [token.lemma_ for token in self.spacy_model(new_d)] + new_d = " ".join([token_text for token_text in new_d]) if self.remove_punctuation: - new_d = new_d.translate(str.maketrans(self.punctuation, ' ' * len(self.punctuation))) + new_d = new_d.translate( + str.maketrans( + self.punctuation, " " * len(self.punctuation) + ) + ) if self.remove_numbers: - new_d = new_d.translate(str.maketrans("0123456789", ' ' * len("0123456789"))) + new_d = new_d.translate( + str.maketrans("0123456789", " " * len("0123456789")) + ) + + # TODO: figure out which of the preproc steps above + # introduces white spaces between tokens new_d = " ".join(new_d.split()) tmp_docs.append(new_d) return tmp_docs From 3541e1d2704ea27c26a3b5ab1ca697ae425a15a2 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Wed, 16 Nov 2022 10:07:48 -0500 Subject: [PATCH 10/21] add email filter --- octis/preprocessing/preprocessing.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 0c995132..75fca011 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,6 +1,7 @@ # mypy: ignore-errors # flake8: noqa +import re import string import multiprocessing as mp import spacy @@ -16,7 +17,7 @@ """ Maps the language to its corresponding spacy model """ -spacy_model_mapping = { +SPACY_MODEL_MAPPING = { "chinese": "zh_core_web_sm", "danish": "nl_core_news_sm", "dutch": "nl_core_news_sm", @@ -36,6 +37,9 @@ } +EMAIL_PATTERN = "\S*@\S*\s?" 
+ + class Preprocessing: def __init__( self, @@ -47,6 +51,7 @@ def __init__( remove_punctuation: bool = True, punctuation: str = string.punctuation, remove_numbers: bool = True, + remove_emails: bool = True, lemmatize: bool = True, stopword_list: str | list[str] = None, min_chars: int = 1, @@ -57,7 +62,7 @@ def __init__( num_processes: int = None, save_original_indexes=True, remove_stopwords_spacy: bool = True, - entities: list[str] = None, + entities: list[str] = None, ): """ init Preprocessing @@ -79,6 +84,8 @@ def __init__( :type punctuation: str :param remove_numbers: if true, numbers will be removed :type remove_numbers: bool + :param remove_emails: if true, email addresses will be removed + :type remove_emails: bool :param remove_stopwords_spacy: bool , if true use spacy to remove stopwords (default: true) :param lemmatize: if true, words will be lemmatized using a spacy model according to the language that has been set (default: true) @@ -117,11 +124,12 @@ def __init__( self.language = language self.num_processes = num_processes self.remove_numbers = remove_numbers + self.remove_emails = remove_emails self.save_original_indexes = save_original_indexes self.entities = entities if self.lemmatize: - lang = spacy_model_mapping[self.language] + lang = SPACY_MODEL_MAPPING[self.language] try: self.spacy_model = spacy.load(lang) except IOError: @@ -403,8 +411,6 @@ def filter_words(self, docs): else: - # string.punctuation - self.preprocessing_steps.append( "filter words with document frequency lower than " + str(self.min_df) @@ -435,6 +441,10 @@ def simple_preprocessing_steps(self, docs): for d in tqdm(docs): new_d = " ".join(d.split()) + + if self.remove_emails: + new_d = re.sub(EMAIL_PATTERN, "", new_d) + if self.lowercase: new_d = new_d.lower() From 18c8d3f9baff8496ed707f480d5b416f916ffc30 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Wed, 16 Nov 2022 10:16:54 -0500 Subject: [PATCH 11/21] add URL filter --- octis/preprocessing/preprocessing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 75fca011..62ad85eb 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -38,6 +38,7 @@ EMAIL_PATTERN = "\S*@\S*\s?" 
+URL_PATTERN = "http\S+" class Preprocessing: @@ -86,6 +87,8 @@ def __init__( :type remove_numbers: bool :param remove_emails: if true, email addresses will be removed :type remove_emails: bool + :param remove_urls: if true, URLs will be removed + :type remove_urls: bool :param remove_stopwords_spacy: bool , if true use spacy to remove stopwords (default: true) :param lemmatize: if true, words will be lemmatized using a spacy model according to the language that has been set (default: true) @@ -125,6 +128,7 @@ def __init__( self.num_processes = num_processes self.remove_numbers = remove_numbers self.remove_emails = remove_emails + self.remove_urls = remove_urls self.save_original_indexes = save_original_indexes self.entities = entities @@ -444,6 +448,9 @@ def simple_preprocessing_steps(self, docs): if self.remove_emails: new_d = re.sub(EMAIL_PATTERN, "", new_d) + + if self.remove_urls: + new_d = re.sub(URL_PATTERN, "", new_d) if self.lowercase: new_d = new_d.lower() From 14c99a98065d25f4b421bd25364076bdf22d99dd Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Wed, 16 Nov 2022 10:48:12 -0500 Subject: [PATCH 12/21] add missing constructor arg for remove_urls --- octis/preprocessing/preprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 62ad85eb..3f40fcb4 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -53,6 +53,7 @@ def __init__( punctuation: str = string.punctuation, remove_numbers: bool = True, remove_emails: bool = True, + remove_urls: bool = True, lemmatize: bool = True, stopword_list: str | list[str] = None, min_chars: int = 1, From f97d4d2b6aca4993eee56c70e8add6c0a2f75571 Mon Sep 17 00:00:00 2001 From: Ge Li Date: Thu, 17 Nov 2022 13:09:12 -0500 Subject: [PATCH 13/21] fix bug with spacy --- octis/preprocessing/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 3f40fcb4..e40920b4 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -475,9 +475,9 @@ def simple_preprocessing_steps(self, docs): if token.lemma_ not in set(self.stopwords) ] else: - new_d = [token.lemma_ for token in self.spacy_model(new_d)] + new_d = [token.lemma_ for token in new_d] - new_d = " ".join([token_text for token_text in new_d]) + new_d = " ".join([str(token_text) for token_text in new_d]) if self.remove_punctuation: new_d = new_d.translate( str.maketrans( From a99d445e53745587d1d6608deb2d95ccab97d9ea Mon Sep 17 00:00:00 2001 From: Ge Li Date: Mon, 21 Nov 2022 16:06:00 -0500 Subject: [PATCH 14/21] Update requirements.txt --- requirements.txt | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 694fba13..85ca73fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ -gensim>=4.0.0 -nltk -pandas -spacy +gensim==4.2.0 +nltk==3.7 +pandas==1.5.0 +spacy==3.4.2 scikit-learn==0.24.2 -scikit-optimize>=0.8.1 -matplotlib -torch -numpy>=1.19.1 -libsvm -flask +scikit-optimize==0.9.0 +matplotlib==3.6.1 +torch==1.12.1 +numpy==1.23.4 +libsvm==3.23.0.4 +flask==2.2.2 sentence_transformers -requests -tomotopy +requests==2.28.1 +tomotopy==0.12.3 From 39652ecc59fd06cfbb47521c9a6370bdd0cf136f Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Tue, 22 Nov 2022 08:53:25 -0500 Subject: [PATCH 15/21] add "sentence_transformers" version to 
requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 85ca73fa..4458ba81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,6 @@ torch==1.12.1 numpy==1.23.4 libsvm==3.23.0.4 flask==2.2.2 -sentence_transformers +sentence_transformers==2.2.2 requests==2.28.1 tomotopy==0.12.3 From e941956328e4afcaa33481e6367f7f2df8f6de94 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Tue, 22 Nov 2022 09:22:28 -0500 Subject: [PATCH 16/21] freeze requirements versions --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4458ba81..cc418a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ gensim==4.2.0 nltk==3.7 -pandas==1.5.0 -spacy==3.4.2 +pandas==1.5.1 +spacy==3.4.3 scikit-learn==0.24.2 scikit-optimize==0.9.0 -matplotlib==3.6.1 -torch==1.12.1 -numpy==1.23.4 +matplotlib==3.6.2 +torch==1.13.0 +numpy==1.23.5 libsvm==3.23.0.4 flask==2.2.2 sentence_transformers==2.2.2 From 2d02bef045d2e964b2f7a811669714f5a43d9873 Mon Sep 17 00:00:00 2001 From: Ge Li Date: Tue, 22 Nov 2022 09:58:52 -0500 Subject: [PATCH 17/21] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index cc418a4a..4e56a7c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ gensim==4.2.0 nltk==3.7 pandas==1.5.1 -spacy==3.4.3 +spacy==3.4.2 scikit-learn==0.24.2 scikit-optimize==0.9.0 -matplotlib==3.6.2 -torch==1.13.0 -numpy==1.23.5 +matplotlib==3.6.1 +torch==1.12.1 +numpy==1.23.4 libsvm==3.23.0.4 flask==2.2.2 sentence_transformers==2.2.2 From 9c119c44959999a0635fd62c8010f863999c7609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gazaille?= Date: Tue, 22 Nov 2022 10:10:36 -0500 Subject: [PATCH 18/21] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4e56a7c3..cc418a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ gensim==4.2.0 nltk==3.7 pandas==1.5.1 -spacy==3.4.2 +spacy==3.4.3 scikit-learn==0.24.2 scikit-optimize==0.9.0 -matplotlib==3.6.1 -torch==1.12.1 -numpy==1.23.4 +matplotlib==3.6.2 +torch==1.13.0 +numpy==1.23.5 libsvm==3.23.0.4 flask==2.2.2 sentence_transformers==2.2.2 From b002a15f13e827fb9d185a0d2b011ef4acc4c621 Mon Sep 17 00:00:00 2001 From: stepgazaille Date: Tue, 22 Nov 2022 11:58:27 -0500 Subject: [PATCH 19/21] fix typing --- octis/preprocessing/preprocessing.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index e40920b4..6639edca 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,17 +1,18 @@ # mypy: ignore-errors # flake8: noqa +from collections import Counter +import multiprocessing as mp +from pathlib import Path import re import string -import multiprocessing as mp -import spacy +from typing import Union from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split -from pathlib import Path -from octis.dataset.dataset import Dataset -from collections import Counter import numpy as np +import spacy from tqdm import tqdm +from octis.dataset.dataset import Dataset """ @@ -55,7 +56,7 @@ def __init__( remove_emails: bool = True, remove_urls: bool = True, lemmatize: 
bool = True,
-        stopword_list: str | list[str] = None,
+        stopword_list: Union[str, list[str]] = None,
         min_chars: int = 1,
         min_words_docs: int = 0,
         language: str = "english",

From d05af05ac3517c4ef23844b2168e58b1cf87f5d9 Mon Sep 17 00:00:00 2001
From: stepgazaille
Date: Fri, 2 Dec 2022 13:37:10 -0500
Subject: [PATCH 20/21] bugfix vocab with terms not in docs

---
 octis/preprocessing/preprocessing.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py
index 6639edca..2c3ef3b4 100644
--- a/octis/preprocessing/preprocessing.py
+++ b/octis/preprocessing/preprocessing.py
@@ -4,6 +4,7 @@
 from collections import Counter
 import multiprocessing as mp
 from pathlib import Path
+from gensim.corpora.dictionary import Dictionary
 import re
 import string
 from typing import Union
@@ -221,9 +222,7 @@ def preprocess_dataset(
         vocabulary = self.filter_words(docs)
         print("created vocab")
-        # with Pool(self.num_processes) as p:
-        #     final_docs, final_labels = p.starmap(self._foo, product(docs, vocabulary, labels_path, repeat=2))
-        print(len(vocabulary))
+
         final_docs, final_labels, document_indexes = [], [], []
         if labels_path is not None:
             if multilabel:
@@ -271,11 +270,15 @@ def preprocess_dataset(
             )
         if self.verbose:
             print("words filtering done")
+
+        # Make sure vocabulary is still consistent with the content of docs:
+        dictionary = Dictionary(final_docs, prune_at=None)
+        vocabulary = sorted(dictionary.token2id.keys())
+
         metadata = {
             "total_documents": len(docs),
             "vocabulary_length": len(vocabulary),
             "preprocessing-info": self.preprocessing_steps
-            # ,"labels": list(set(final_labels)), "total_labels": len(set(final_labels))
         }
         if self.split:
             if len(final_labels) > 0:

From 78805d646732eb8c2494b338bf727c1552566ffa Mon Sep 17 00:00:00 2001
From: Ge Li
Date: Fri, 18 Aug 2023 10:36:19 -0400
Subject: [PATCH 21/21] fix TI implementation

---
 octis/evaluation_metrics/interpretability_metrics.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/octis/evaluation_metrics/interpretability_metrics.py b/octis/evaluation_metrics/interpretability_metrics.py
index dfa30c26..ecf0631f 100644
--- a/octis/evaluation_metrics/interpretability_metrics.py
+++ b/octis/evaluation_metrics/interpretability_metrics.py
@@ -1,3 +1,4 @@
+import numpy as np
 from octis.evaluation_metrics.metrics import AbstractMetric
 from octis.evaluation_metrics.diversity_metrics import TopicDiversity
 from octis.evaluation_metrics.coherence_metrics import Coherence,_load_default_texts
@@ -27,7 +28,7 @@ def __init__(
         self.topk = topk
         self.c_npmi = Coherence(texts, topk=topk, measure='c_npmi')
         self.topic_diversity = TopicDiversity(topk=topk)

     def score(self, model_output: dict) -> float: # noqa
-        # 1 is added to convert npmi output into a positive scale
-        return (1+self.c_npmi.score(
-            model_output
-        )) * self.topic_diversity.score(model_output)
+        # exp(tc*td)
+        return np.exp(
+            self.c_npmi.score(model_output) * self.topic_diversity.score(model_output)
+        )
\ No newline at end of file
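
For reference, a minimal usage sketch of TopicInterpretability as it stands after PATCH 21, where the score is exp(NPMI coherence * topic diversity). The toy corpus and topic lists below are invented purely for illustration; the only other assumption is the usual OCTIS convention that model_output["topics"] holds one list of top words per topic.

    from octis.evaluation_metrics.interpretability_metrics import TopicInterpretability

    # Tokenized corpus: one list of tokens per document (made-up example data).
    texts = [
        ["topic", "model", "coherence", "score"],
        ["corpus", "vocabulary", "tokens", "preprocessing"],
        ["coherence", "diversity", "score", "metric"],
    ]

    # Top words per topic, e.g. the "topics" entry of a trained OCTIS model's output.
    model_output = {
        "topics": [
            ["topic", "model", "coherence", "score"],
            ["corpus", "vocabulary", "tokens", "preprocessing"],
        ]
    }

    metric = TopicInterpretability(texts=texts, topk=4)
    print(metric.score(model_output))  # exp(npmi * diversity); higher is better

Multiplying coherence and diversity inside the exponential keeps the score positive and favours topic sets that are both coherent and non-redundant, which is why PATCH 21 moves away from the earlier (1 + NPMI) * diversity form.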