Skip to content

Commit

Permalink
Implement columnar sorting in bar and heatmap. (ResidentMario#87)
Browse files Browse the repository at this point in the history
  • Loading branch information
ResidentMario committed Jul 8, 2019
1 parent 1d67f91 commit cbcc8d5
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 33 deletions.
46 changes: 20 additions & 26 deletions missingno/missingno.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,11 @@ def matrix(df,
"""
A matrix visualization of the nullity of the given DataFrame.
For optimal performance, please stay within 250 rows and 50 columns.
:param df: The `DataFrame` being mapped.
:param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
:param n: The max number of columns to include in the filtered DataFrame.
:param p: The max percentage fill of the columns in the filtered DataFrame.
:param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None (default).
:param sort: The row sort order to apply. Can be "ascending", "descending", or None.
:param figsize: The size of the figure to display.
:param fontsize: The figure's font size. Default to 16.
:param labels: Whether or not to display the column names. Defaults to the underlying data labels when there are
Expand All @@ -35,7 +33,7 @@ def matrix(df,
:return: If `inline` is False, the underlying `matplotlib.figure` object. Else, nothing.
"""
df = nullity_filter(df, filter=filter, n=n, p=p)
df = nullity_sort(df, sort=sort)
df = nullity_sort(df, sort=sort, axis='columns')

height = df.shape[0]
width = df.shape[1]
Expand Down Expand Up @@ -202,17 +200,17 @@ def bar(df, figsize=(24, 10), fontsize=16, labels=None, log=False, color='dimgra
:param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
:param n: The cap on the number of columns to include in the filtered DataFrame.
:param p: The cap on the percentage fill of the columns in the filtered DataFrame.
:param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None (default).
:param sort: The column sort order to apply. Can be "ascending", "descending", or None.
:param figsize: The size of the figure to display.
:param fontsize: The figure's font size. This default to 16.
:param labels: Whether or not to display the column names. Would need to be turned off on particularly large
displays. Defaults to True.
:param color: The color of the filled columns. Default to the RGB multiple `(0.25, 0.25, 0.25)`.
:return: If `inline` is False, the underlying `matplotlib.figure` object. Else, nothing.
"""
nullity_counts = len(df) - df.isnull().sum()
df = nullity_filter(df, filter=filter, n=n, p=p)
df = nullity_sort(df, sort=sort)
df = nullity_sort(df, sort=sort, axis='rows')
nullity_counts = len(df) - df.isnull().sum()

plt.figure(figsize=figsize)
(nullity_counts / len(df)).plot(kind='bar', figsize=figsize, fontsize=fontsize, log=log, color=color)
Expand Down Expand Up @@ -280,8 +278,7 @@ def heatmap(df, inline=False,
more information.
:param p: The cap on the percentage fill of the columns in the filtered DataFrame. See `nullity_filter()` for
more information.
:param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None. See
`nullity_sort()` for more information.
:param sort: The column sort order to apply. Can be "ascending", "descending", or None.
:param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to (20, 12).
:param fontsize: The figure's font size.
:param labels: Whether or not to label each matrix entry with its correlation (default is True).
Expand All @@ -294,7 +291,7 @@ def heatmap(df, inline=False,
"""
# Apply filters and sorts, set up the figure.
df = nullity_filter(df, filter=filter, n=n, p=p)
df = nullity_sort(df, sort=sort)
df = nullity_sort(df, sort=sort, axis='rows')

plt.figure(figsize=figsize)
gs = gridspec.GridSpec(1, 1)
Expand Down Expand Up @@ -347,14 +344,14 @@ def heatmap(df, inline=False,


def dendrogram(df, method='average',
filter=None, n=0, p=0, sort=None,
filter=None, n=0, p=0,
orientation=None, figsize=None,
fontsize=16, inline=False
):
"""
Fits a `scipy` hierarchical clustering algorithm to the given DataFrame's variables and visualizes the results as
a `scipy` dendrogram.
The default vertical display will fit up to 50 columns. If more than 50 columns are specified and orientation is
left unspecified the dendrogram will automatically swap to a horizontal display to fit the additional variables.
Expand All @@ -364,7 +361,6 @@ def dendrogram(df, method='average',
:param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
:param n: The cap on the number of columns to include in the filtered DataFrame.
:param p: The cap on the percentage fill of the columns in the filtered DataFrame.
:param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None.
:param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to `(25, 10)`.
:param fontsize: The figure's font size.
:param orientation: The way the dendrogram is oriented. Defaults to top-down if there are less than or equal to 50
Expand All @@ -377,14 +373,13 @@ def dendrogram(df, method='average',
if len(df.columns) <= 50 or orientation == 'top' or orientation == 'bottom':
figsize = (25, 10)
else:
figsize = (25, (25 + len(df.columns) - 50)*0.5)
figsize = (25, (25 + len(df.columns) - 50) * 0.5)

plt.figure(figsize=figsize)
gs = gridspec.GridSpec(1, 1)
ax0 = plt.subplot(gs[0])

df = nullity_filter(df, filter=filter, n=n, p=p)
df = nullity_sort(df, sort=sort)

# Link the hierarchical output matrix, figure out orientation, construct base dendrogram.
x = np.transpose(df.isnull().astype(int).values)
Expand All @@ -396,14 +391,15 @@ def dendrogram(df, method='average',
else:
orientation = 'bottom'

hierarchy.dendrogram(z,
orientation=orientation,
labels=df.columns.tolist(),
distance_sort='descending',
link_color_func=lambda c: 'black',
leaf_font_size=fontsize,
ax=ax0
)
hierarchy.dendrogram(
z,
orientation=orientation,
labels=df.columns.tolist(),
distance_sort='descending',
link_color_func=lambda c: 'black',
leaf_font_size=fontsize,
ax=ax0
)

# Remove extraneous default visual elements.
ax0.set_aspect('auto')
Expand Down Expand Up @@ -435,7 +431,7 @@ def dendrogram(df, method='average',


def geoplot(df,
filter=None, n=0, p=0, sort=None,
filter=None, n=0, p=0,
x=None, y=None, figsize=(25, 10), inline=False,
by=None, cmap='YlGn', **kwargs):
"""
Expand All @@ -447,7 +443,6 @@ def geoplot(df,
:param df: The DataFrame whose completeness is being geoplotted.
:param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
:param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None.
:param n: The cap on the number of columns to include in the filtered DataFrame.
:param p: The cap on the percentage fill of the columns in the filtered DataFrame.
:param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to `(25, 10)`.
Expand All @@ -466,7 +461,6 @@ def geoplot(df,
from shapely.geometry import Point

df = nullity_filter(df, filter=filter, n=n, p=p)
df = nullity_sort(df, sort=sort)

nullity = df.notnull().sum(axis='columns') / df.shape[1]
if x and y:
Expand Down
26 changes: 19 additions & 7 deletions missingno/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,32 @@
import numpy as np


def nullity_sort(df, sort=None, axis='columns'):
    """
    Sorts a DataFrame according to its nullity, in either ascending or descending order.

    :param df: The DataFrame object being sorted.
    :param sort: The sorting method: either "ascending", "descending", or None (default). When None, `df` is
    returned unchanged.
    :param axis: The axis along which non-null values are counted. With "columns" (default) the non-null count is
    taken across the columns of each row and the *rows* are reordered; with "rows" the count is taken down the rows
    of each column and the *columns* are reordered.
    :return: The nullity-sorted DataFrame.
    :raises ValueError: If `sort` is not None, "ascending" or "descending", or if `sort` is set and `axis` is
    neither "rows" nor "columns".
    """
    # No sort requested: hand the frame back untouched. `axis` is deliberately not validated on this path, so
    # callers that never ask for a sort are unaffected by it.
    if sort is None:
        return df
    elif sort not in ['ascending', 'descending']:
        raise ValueError('The "sort" parameter must be set to "ascending" or "descending".')

    if axis not in ['rows', 'columns']:
        raise ValueError('The "axis" parameter must be set to "rows" or "columns".')

    # Count non-null entries along the requested axis (1 == across columns, 0 == down rows) and turn the counts
    # into a positional ordering, least-filled first.
    count_axis = 1 if axis == 'columns' else 0
    order = np.argsort(df.count(axis=count_axis).values)
    if sort == 'descending':
        order = np.flipud(order)

    # Counting across columns yields one value per row, so that ordering is applied to the rows, and vice versa.
    if axis == 'columns':
        return df.iloc[order, :]
    return df.iloc[:, order]


def nullity_filter(df, filter=None, p=0, n=0):
Expand Down Expand Up @@ -44,4 +56,4 @@ def nullity_filter(df, filter=None, p=0, n=0):
df = df.iloc[:, [c <= p for c in df.count(axis='rows').values / len(df)]]
if n:
df = df.iloc[:, np.sort(np.argsort(df.count(axis='rows').values)[:n])]
return df
return df

0 comments on commit cbcc8d5

Please sign in to comment.