diff --git a/tests/test_representation.py b/tests/test_representation.py index 4eab08fa..10f855ac 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -120,13 +120,13 @@ def _tfidf(term, corpus, document_index): vector_s = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) -document_term_df = pd.DataFrame( - [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], -).astype("Sparse[float64, nan]") +df = pd.DataFrame([[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"],).astype( + "Sparse[float64, nan]" +) test_cases_dim_reduction_and_clustering = [ - # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] + # format: [function_name, function, correct output for s_vector_series and df input above] ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], [ "nmf", @@ -232,7 +232,7 @@ def test_dim_reduction_and_clustering_with_vector_series_input( ) @parameterized.expand(test_cases_dim_reduction_and_clustering) - def test_dim_reduction_and_clustering_with_documenttermDF_input( + def test_dim_reduction_and_clustering_with_dataframe_input( self, name, test_function, correct_output ): s_true = correct_output @@ -242,11 +242,11 @@ def test_dim_reduction_and_clustering_with_documenttermDF_input( return if name == "kmeans": - result_s = test_function(document_term_df, random_state=42, n_clusters=2) - elif name == "dbscan" or name == "meanshift": - result_s = test_function(document_term_df) + result_s = test_function(df, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(df) else: - result_s = test_function(document_term_df, random_state=42) + result_s = test_function(df, random_state=42) pd.testing.assert_series_equal( s_true, @@ -257,10 +257,10 @@ def test_dim_reduction_and_clustering_with_documenttermDF_input( check_category_order=False, ) - def 
test_normalize_document_term_df_also_as_output(self): - # normalize should also return DocumentTermDF output for DocumentTermDF + def test_normalize_DataFrame_also_as_output(self): + # normalize should also return DataFrame output for DataFrame # input so we test it separately - result = representation.normalize(document_term_df) + result = representation.normalize(df) correct_output = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], ) diff --git a/tests/test_types.py b/tests/test_types.py index 670acb61..f054a695 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -72,20 +72,13 @@ def f(s): except TypeError: self.fail("Failed although input type is correct.") - def test_inputseries_correct_type_documentrepresentationseries(self): - @_types.InputSeries(_types.RepresentationSeries) + def test_inputseries_correct_type_DataFrame(self): + @_types.InputSeries(_types.DataFrame) def f(s): pass try: - f( - pd.Series( - [1, 2, 3], - index=pd.MultiIndex.from_tuples( - [("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")] - ), - ) - ) + f(pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"], dtype="Sparse",)) except TypeError: self.fail("Failed although input type is correct.") @@ -118,3 +111,23 @@ def f(s): f(pd.Series([np.nan, pd.NA, [0, 1, 2]])) except TypeError: self.fail("Failed although input type is correct.") + + def test_several_possible_types_correct_type(self): + @_types.InputSeries([_types.DataFrame, _types.VectorSeries]) + def f(x): + pass + + try: + f(pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"], dtype="Sparse",)) + + f(pd.Series([[1.0, 2.0]])) + + except TypeError: + self.fail("Failed although input type is correct.") + + def test_several_possible_types_wrong_type(self): + @_types.InputSeries([_types.DataFrame, _types.VectorSeries]) + def f(x): + pass + + self.assertRaises(TypeError, f, pd.Series([["token", "ized"]])) diff --git a/texthero/_types.py b/texthero/_types.py index 3bb5d8c7..16125109 100644 --- 
a/texthero/_types.py +++ b/texthero/_types.py @@ -11,9 +11,8 @@ The goal is to be able to do something like this: -@OutputSeries(RepresentationSeries) @InputSeries(TokenSeries) -def tfidf(s: TokenSeries) -> RepresentationSeries: +def tfidf(s: TokenSeries) -> DataFrame: ... The decorator (@...) makes python check whether the input is @@ -26,7 +25,7 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: The typing helps the users understand the code more easily as they'll be able to see immediately from the documentation on what types of Series a function operates. This is much more -verbose and clearer than e.g. "tfidf(s: pd.Series) -> pd.Series". +verbose and clearer than e.g. "tfidf(s: pd.Series) -> pd.DataFrame". Note that users can and should of course still simply use ordinary pd.Series objects. The custom types are just subclasses of pd.Series so @@ -43,18 +42,26 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: - TextSeries: cells are text (i.e. strings), e.g. "Test" - TokenSeries: cells are lists of tokens (i.e. lists of strings), e.g. ["word1", "word2"] - VectorSeries: cells are vector representations of text, e.g. [0.25, 0.75] -- RepresentationSeries: Series is multiindexed with level one -being the document, level two being the individual features and their values -The classes are lightweight subclasses of pd.Series and serve 2 purposes: +The implemented types are lightweight subclasses of pd.Series and serve 2 purposes: 1. Good documentation for users through docstring. 2. Function to check if a pd.Series has the required properties. +Additionally, some Texthero functions (most that accept a +VectorSeries as input) also accept a Pandas DataFrame +as input that is representing a matrix. Every cell value +is then one entry in the matrix. We only have a subclass +DataFrame(HeroTypes) to easily support the type check +with the InputSeries decorator below and +give a good error message / documentation to users. 
+ """ import functools import pandas as pd +from typing import Tuple + """ The Hero Series classes. @@ -63,13 +70,13 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: # This class is mainly for documentation in the docstring. -class HeroSeries(pd.Series): +class HeroTypes(pd.Series, pd.DataFrame): """ Hero Series Types ================= In texthero, most functions operate on a Pandas Series as input and give a Pandas Series as output. There are currently four - main types of Series' in use, which are supported as classes + main types of Series / DataFrames in use, which are supported as classes by the library: 1. TextSeries: Every cell is a text, i.e. a string. For example, @@ -81,68 +88,72 @@ class HeroSeries(pd.Series): 3. VectorSeries: Every cell is a vector representing text, i.e. a list of floats. For example, `pd.Series([[1.0, 2.0], [3.0]])` is a valid VectorSeries. - 4. RepresentationSeries: Series is multiindexed with level one - being the document, level two being the individual features and their values. - For example, - `pd.Series([1, 2, 3], index=pd.MultiIndex.from_tuples([("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")]))` - is a valid RepresentationSeries. + Additionally, some Texthero functions (most that accept + VectorSeries input) accept a Pandas DataFrame as input that is + representing a matrix. + Every cell value is one entry in the matrix. + An example is + `pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["word1", "word2", "word3"])`. These types of Series are supposed to make using the library easier and more intuitive. For example, if you see a function head ``` - def tfidf(s: TokenSeries) -> RepresentationSeries + def tfidf(s: TokenSeries) -> DataFrame ``` then you know that the function takes a Pandas Series whose cells are lists of strings (tokens) and will - return a Pandas Series whose cells are vectors of floats. + return a sparse Pandas DataFrame where every column is one feature + (in this case one word). 
""" @staticmethod - def check_series(): + def check_type(): raise NotImplementedError() # Every Hero Series type has to have this. -class TextSeries(HeroSeries): +class TextSeries(HeroTypes): """ In a TextSeries, every cell has to be a text, i.e. a string. For example, `pd.Series(["test", "test"])` is a valid TextSeries. """ @staticmethod - def check_series(s: pd.Series) -> bool: + def check_type(s: pd.Series) -> Tuple[bool, str]: """ Check if a given Pandas Series has the properties of a TextSeries. """ error_string = ( - "The input Series should consist only of strings in every cell." - " See help(hero.HeroSeries) for more information." + "should be TextSeries: the input Series should consist only of strings in every cell." + " See help(hero.HeroTypes) for more information." ) try: first_non_nan_value = s.loc[s.first_valid_index()] - if not isinstance(first_non_nan_value, str) or s.index.nlevels != 1: - raise TypeError(error_string) + if not isinstance(first_non_nan_value, str): + return False, error_string except KeyError: # Only NaNs in Series -> same warning applies - raise TypeError(error_string) + return False, error_string + + return True, "" -class TokenSeries(HeroSeries): +class TokenSeries(HeroTypes): """ In a TokenSeries, every cell has to be a list of words/tokens, i.e. a list of strings. For example, `pd.Series([["test"], ["token2", "token3"]])` is a valid TokenSeries. """ @staticmethod - def check_series(s: pd.Series) -> bool: + def check_type(s: pd.Series) -> Tuple[bool, str]: """ Check if a given Pandas Series has the properties of a TokenSeries. """ error_string = ( - "There are non-token cells (every cell should be a list of words/tokens) in the given Series." - " See help(hero.HeroSeries) for more information." + "should be TokenSeries: there are non-token cells (every cell should be a list of words/tokens) in the given Series." + " See help(hero.HeroTypes) for more information." 
) def is_list_of_strings(cell): @@ -152,13 +163,15 @@ def is_list_of_strings(cell): try: first_non_nan_value = s.loc[s.first_valid_index()] - if not is_list_of_strings(first_non_nan_value) or s.index.nlevels != 1: - raise TypeError(error_string) + if not is_list_of_strings(first_non_nan_value): + return False, error_string except KeyError: # Only NaNs in Series -> same warning applies - raise TypeError(error_string) + return False, error_string + return True, "" -class VectorSeries(HeroSeries): + +class VectorSeries(HeroTypes): """ In a VectorSeries, every cell is a vector representing text, i.e. a list of numbers. @@ -166,14 +179,14 @@ class VectorSeries(HeroSeries): """ @staticmethod - def check_series(s: pd.Series, input_output="") -> bool: + def check_type(s: pd.Series, input_output="") -> Tuple[bool, str]: """ - Check if a given Pandas Series has the properties of a RepresentationSeries. + Check if a given Pandas Series has the properties of a VectorSeries. """ error_string = ( - "There are non-representation cells (every cell should be a list of floats) in the given Series." - " See help(hero.HeroSeries) for more information." + "should be VectorSeries: there are non-representation cells (every cell should be a list of floats) in the given Series." + " See help(hero.HeroTypes) for more information." 
) def is_numeric(x): @@ -185,45 +198,50 @@ def is_numeric(x): return True def is_list_of_numbers(cell): - return all(is_numeric(x) for x in cell) and isinstance(cell, (list, tuple)) + return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell) try: first_non_nan_value = s.loc[s.first_valid_index()] - if not is_list_of_numbers(first_non_nan_value) or s.index.nlevels != 1: - raise TypeError(error_string) + if not is_list_of_numbers(first_non_nan_value): + return False, error_string except KeyError: # Only NaNs in Series -> same warning applies - raise TypeError(error_string) + return False, error_string + + return True, "" -class RepresentationSeries(HeroSeries): +class DataFrame(HeroTypes): """ - A RepresentationSeries is multiindexed with level one - being the document, and level two being the individual features and their values. + A Pandas DataFrame + representing a matrix (e.g. a Document-Term-Matrix). + Every cell value is one entry in the matrix. For example, - `pd.Series([1, 2, 3], index=pd.MultiIndex.from_tuples([("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")]))` - is a valid RepresentationSeries. + `pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["word1", "word2", "word3"])`. + """ @staticmethod - def check_series(s: pd.Series, input_output="") -> bool: + def check_type(df: pd.DataFrame, input_output="") -> Tuple[bool, str]: """ - Check if a given Pandas Series has the properties of a RepresentationSeries. + Check if the given object is a Pandas DataFrame. """ error_string = ( - "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex." - " See help(hero.HeroSeries) for more information." + "should be DataFrame: The input should be a Pandas DataFrame" + " representing a matrix, where every cell is one entry of the matrix." + " See help(hero.HeroTypes) for more information." 
) - if not isinstance(s.index, pd.MultiIndex) or s.index.nlevels != 2: - raise TypeError(error_string) + if not isinstance(df, pd.DataFrame): + return False, error_string + else: + return True, "" -def InputSeries(allowed_hero_series_type): +def InputSeries(allowed_hero_series_types): """ Check if first argument of function has / fulfills type allowed_hero_series_type - Examples -------- >>> from texthero._types import * @@ -235,6 +253,11 @@ def InputSeries(allowed_hero_series_type): >>> # throws a type error with a nice explaination >>> f(pd.Series([["I", "am", "tokenized"]])) >>> # passes + With several possible types: + + >>> @InputSeries([DataFrame, VectorSeries]) + ... def g(x): + ... pass """ def decorator(func): @@ -242,7 +265,33 @@ def decorator(func): def wrapper(*args, **kwargs): s = args[0] # The first input argument will be checked. # Check if input series can fulfill type. - allowed_hero_series_type.check_series(s) + + # list -> several possible types + if isinstance(allowed_hero_series_types, list): + + # Output of check_type is always Bool, Error_String where the Bool is True + # if the type is fulfilled, else false. + # if no type is fulfilled (so check_type first output is False for all allowed types), + # combine all the error strings to show the user all allowed types in the TypeError. + if not any( + allowed_type.check_type(s)[0] + for allowed_type in allowed_hero_series_types + ): + + error_string = ( + "Possible types:\n\nEither " + + allowed_hero_series_types[0].check_type(s)[1] + ) + + for allowed_type in allowed_hero_series_types[1:]: + error_string += "\n\nOr " + allowed_type.check_type(s)[1] + + raise TypeError(error_string) + + else: # only one possible type + fulfills, error_string = allowed_hero_series_types.check_type(s) + if not fulfills: + raise TypeError(error_string) # If we get here, the type can be fulfilled -> execute function as usual. 
return func(*args, **kwargs) diff --git a/texthero/representation.py b/texthero/representation.py index 5d610134..4ef7e0a9 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -15,6 +15,13 @@ from scipy.sparse import coo_matrix from typing import Optional, Union, Any +from texthero._types import ( + TextSeries, + TokenSeries, + VectorSeries, + DataFrame, + InputSeries, +) from texthero import preprocessing @@ -42,8 +49,9 @@ """ +@InputSeries([TokenSeries, TextSeries]) def count( - s: pd.Series, + s: Union[TokenSeries, TextSeries], max_features: Optional[int] = None, min_df=1, max_df=1.0, @@ -457,7 +465,7 @@ def tsne( n_iter=1000, random_state=None, n_jobs=-1, -) -> pd.Series: +) -> VectorSeries: """ Performs t-Distributed Stochastic Neighbor Embedding on the given input. @@ -563,6 +571,7 @@ def tsne( """ +@InputSeries([VectorSeries, DataFrame]) def kmeans( input_matrix: Union[pd.Series, pd.DataFrame], n_clusters=5, @@ -570,7 +579,7 @@ def kmeans( max_iter=300, random_state=None, algorithm="auto", -): +) -> VectorSeries: """ Performs K-means clustering algorithm on the given input. @@ -668,6 +677,7 @@ def kmeans( ).astype("category") +@InputSeries([VectorSeries, DataFrame]) def dbscan( input_matrix: Union[pd.Series, pd.DataFrame], eps=0.5, @@ -676,7 +686,7 @@ def dbscan( metric_params=None, leaf_size=30, n_jobs=-1, -): +) -> VectorSeries: """ Perform DBSCAN clustering on the given input. @@ -783,6 +793,7 @@ def dbscan( ).astype("category") +@InputSeries([VectorSeries, DataFrame]) def meanshift( input_matrix: Union[pd.Series, pd.DataFrame], bandwidth=None, @@ -791,7 +802,7 @@ def meanshift( cluster_all=True, n_jobs=-1, max_iter=300, -): +) -> VectorSeries: """ Perform mean shift clustering on the given input. 
@@ -938,7 +949,7 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser See Also -------- - Representation Series link TODO add link to tutorial + DataFrame link TODO add link to tutorial `Norm on Wikipedia `_ diff --git a/texthero/visualization.py b/texthero/visualization.py index 94556a93..0b3893b2 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -71,11 +71,11 @@ def scatterplot( >>> df["pca"] = ( ... hero.tfidf(df["texts"]) ... .pipe(hero.pca, n_components=3) - ... ) # TODO: when others get Representation Support: remove flatten + ... ) >>> df["topics"] = ( ... hero.tfidf(df["texts"]) ... .pipe(hero.kmeans, n_clusters=2) - ... ) # TODO: when others get Representation Support: remove flatten + ... ) >>> hero.scatterplot(df, col="pca", color="topics", ... hover_data=["texts"]) # doctest: +SKIP """