Skip to content

Commit

Permalink
Fixing ResampleRowsWeighted
Browse files Browse the repository at this point in the history
  • Loading branch information
AllenDowney committed Oct 13, 2014
1 parent 62fb91c commit d77b6fd
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 25 deletions.
137 changes: 113 additions & 24 deletions code/thinkstats2.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@
import numpy as np
import pandas

import scipy.stats
import scipy.special
import scipy.ndimage
import scipy
from scipy import stats
from scipy import special
from scipy import ndimage

from io import open

ROOT2 = math.sqrt(2)

Expand Down Expand Up @@ -567,6 +570,17 @@ def Var(self, mu=None):
var += p * (x - mu) ** 2
return var

def Std(self, mu=None):
"""Computes the standard deviation of a PMF.
mu: the point around which the variance is computed;
if omitted, computes the mean
returns: float standard deviation
"""
var = self.Var(mu)
return math.sqrt(var)

def MaximumLikelihood(self):
"""Returns the value with the highest probability.
Expand Down Expand Up @@ -651,6 +665,70 @@ def SubPmf(self, other):
pmf.Incr(v1 - v2, p1 * p2)
return pmf

def __mul__(self, other):
"""Computes the Pmf of the product of values drawn from self and other.
other: another Pmf
returns: new Pmf
"""
try:
return self.MulPmf(other)
except AttributeError:
return self.MulConstant(other)

def MulPmf(self, other):
"""Computes the Pmf of the diff of values drawn from self and other.
other: another Pmf
returns: new Pmf
"""
pmf = Pmf()
for v1, p1 in self.Items():
for v2, p2 in other.Items():
pmf.Incr(v1 * v2, p1 * p2)
return pmf

def MulConstant(self, other):
"""Computes the Pmf of the product of a constant and values from self.
other: a number
returns: new Pmf
"""
pmf = Pmf()
for v1, p1 in self.Items():
pmf.Set(v1 * other, p1)
return pmf

def __div__(self, other):
"""Computes the Pmf of the ratio of values drawn from self and other.
other: another Pmf
returns: new Pmf
"""
try:
return self.DivPmf(other)
except AttributeError:
return self.MulConstant(1/other)

__truediv__ = __div__

def DivPmf(self, other):
"""Computes the Pmf of the ratio of values drawn from self and other.
other: another Pmf
returns: new Pmf
"""
pmf = Pmf()
for v1, p1 in self.Items():
for v2, p2 in other.Items():
pmf.Incr(v1 / v2, p1 * p2)
return pmf

def Max(self, k):
"""Computes the CDF of the maximum of k selections from this dist.
Expand Down Expand Up @@ -972,7 +1050,8 @@ def Shift(self, term):
term: how much to add
"""
new = self.Copy()
new.xs = [x + term for x in self.xs]
# don't use +=, or else an int array + float yields int array
new.xs = new.xs + term
return new

def Scale(self, factor):
Expand All @@ -981,7 +1060,8 @@ def Scale(self, factor):
factor: what to multiply by
"""
new = self.Copy()
new.xs = [x * factor for x in self.xs]
# don't use *=, or else an int array * float yields int array
new.xs = new.xs * factor
return new

def Prob(self, x):
Expand Down Expand Up @@ -1072,8 +1152,8 @@ def Random(self):
def Sample(self, n):
"""Generates a random sample from this distribution.
Args:
n: int length of the sample
n: int length of the sample
returns: NumPy array
"""
ps = np.random.random(n)
return self.ValueArray(ps)
Expand Down Expand Up @@ -1469,7 +1549,7 @@ def Density(self, xs):
returns: float or NumPy array of probability density
"""
return scipy.stats.norm.pdf(xs, self.mu, self.sigma)
return stats.norm.pdf(xs, self.mu, self.sigma)


class ExponentialPdf(Pdf):
Expand Down Expand Up @@ -1502,7 +1582,7 @@ def Density(self, xs):
returns: float or NumPy array of probability density
"""
return scipy.stats.expon.pdf(xs, scale=1.0/self.lam)
return stats.expon.pdf(xs, scale=1.0/self.lam)


class EstimatedPdf(Pdf):
Expand All @@ -1515,7 +1595,7 @@ def __init__(self, sample, label=None):
label: string
"""
self.label = label if label is not None else '_nolegend_'
self.kde = scipy.stats.gaussian_kde(sample)
self.kde = stats.gaussian_kde(sample)
low = min(sample)
high = max(sample)
self.linspace = np.linspace(low, high, 101)
Expand Down Expand Up @@ -1642,7 +1722,7 @@ def EvalNormalPdf(x, mu, sigma):
returns: float probability density
"""
return scipy.stats.norm.pdf(x, mu, sigma)
return stats.norm.pdf(x, mu, sigma)


def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
Expand All @@ -1667,11 +1747,20 @@ def MakeNormalPmf(mu, sigma, num_sigmas, n=201):


def EvalBinomialPmf(k, n, p):
"""Evaluates the binomial pmf.
"""Evaluates the binomial PMF.
Returns the probabily of k successes in n trials with probability p.
"""
return scipy.stats.binom.pmf(k, n, p)
return stats.binom.pmf(k, n, p)


def EvalHypergeomPmf(k, N, K, n):
"""Evaluates the hypergeometric PMF.
Returns the probabily of k successes in n trials from a population
N with K successes in it.
"""
return stats.hypergeom.pmf(k, N, K, n)


def EvalPoissonPmf(k, lam):
Expand All @@ -1684,8 +1773,8 @@ def EvalPoissonPmf(k, lam):
"""
# don't use the scipy function (yet). for lam=0 it returns NaN;
# should be 0.0
# return scipy.stats.poisson.pmf(k, lam)
return lam ** k * math.exp(-lam) / math.factorial(k)
# return stats.poisson.pmf(k, lam)
return lam ** k * math.exp(-lam) / special.gamma(k+1)


def MakePoissonPmf(lam, high, step=1):
Expand Down Expand Up @@ -1765,7 +1854,7 @@ def EvalNormalCdf(x, mu=0, sigma=1):
Returns:
float
"""
return scipy.stats.norm.cdf(x, loc=mu, scale=sigma)
return stats.norm.cdf(x, loc=mu, scale=sigma)


def EvalNormalCdfInverse(p, mu=0, sigma=1):
Expand All @@ -1783,7 +1872,7 @@ def EvalNormalCdfInverse(p, mu=0, sigma=1):
Returns:
float
"""
return scipy.stats.norm.ppf(p, loc=mu, scale=sigma)
return stats.norm.ppf(p, loc=mu, scale=sigma)


def EvalLognormalCdf(x, mu=0, sigma=1):
Expand All @@ -1795,7 +1884,7 @@ def EvalLognormalCdf(x, mu=0, sigma=1):
Returns: float or sequence
"""
return scipy.stats.lognorm.cdf(x, loc=mu, scale=sigma)
return stats.lognorm.cdf(x, loc=mu, scale=sigma)


def RenderExpoCdf(lam, low, high, n=101):
Expand All @@ -1810,7 +1899,7 @@ def RenderExpoCdf(lam, low, high, n=101):
"""
xs = np.linspace(low, high, n)
ps = 1 - np.exp(-lam * xs)
#ps = scipy.stats.expon.cdf(xs, scale=1.0/lam)
#ps = stats.expon.cdf(xs, scale=1.0/lam)
return xs, ps


Expand All @@ -1826,7 +1915,7 @@ def RenderNormalCdf(mu, sigma, low, high, n=101):
returns: numpy arrays (xs, ps)
"""
xs = np.linspace(low, high, n)
ps = scipy.stats.norm.cdf(xs, mu, sigma)
ps = stats.norm.cdf(xs, mu, sigma)
return xs, ps


Expand All @@ -1845,7 +1934,7 @@ def RenderParetoCdf(xmin, alpha, low, high, n=50):
low = xmin
xs = np.linspace(low, high, n)
ps = 1 - (xs / xmin) ** -alpha
#ps = scipy.stats.pareto.cdf(xs, scale=xmin, b=alpha)
#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
return xs, ps


Expand Down Expand Up @@ -1914,7 +2003,7 @@ def MakePmf(self, steps=101, label=None):
def MakeCdf(self, steps=101):
"""Returns the CDF of this distribution."""
xs = [i / (steps - 1.0) for i in range(steps)]
ps = [scipy.special.betainc(self.alpha, self.beta, x) for x in xs]
ps = [special.betainc(self.alpha, self.beta, x) for x in xs]
cdf = Cdf(xs, ps)
return cdf

Expand Down Expand Up @@ -2581,7 +2670,7 @@ def ResampleRowsWeighted(df, column='finalwgt'):
returns: DataFrame
"""
weights = df[column]
cdf = Pmf(weights.iteritems()).MakeCdf()
cdf = Cdf(dict(weights))
indices = cdf.Sample(len(weights))
sample = df.loc[indices]
return sample
Expand Down Expand Up @@ -2626,7 +2715,7 @@ def Smooth(xs, sigma=2, **options):
xs: sequence
sigma: standard deviation of the filter
"""
return scipy.ndimage.filters.gaussian_filter1d(xs, sigma, **options)
return ndimage.filters.gaussian_filter1d(xs, sigma, **options)


class HypothesisTest(object):
Expand Down
50 changes: 49 additions & 1 deletion code/thinkstats2_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

import unittest
import random

Expand Down Expand Up @@ -133,6 +135,36 @@ def testPmf(self):
xs, ys = pmf.Render()
self.assertEqual(tuple(xs), tuple(sorted(pmf.Values())))

def testPmfAddSub(self):
pmf = thinkstats2.Pmf([1, 2, 3, 4, 5, 6])

pmf1 = pmf + 1
self.assertAlmostEqual(pmf1.Mean(), 4.5)

pmf2 = pmf + pmf
self.assertAlmostEqual(pmf2.Mean(), 7.0)

pmf3 = pmf - 1
self.assertAlmostEqual(pmf3.Mean(), 2.5)

pmf4 = pmf - pmf
self.assertAlmostEqual(pmf4.Mean(), 0)

def testPmfMulDiv(self):
pmf = thinkstats2.Pmf([1, 2, 3, 4, 5, 6])

pmf1 = pmf * 2
self.assertAlmostEqual(pmf1.Mean(), 7)

pmf2 = pmf * pmf
self.assertAlmostEqual(pmf2.Mean(), 12.25)

pmf3 = pmf / 2
self.assertAlmostEqual(pmf3.Mean(), 1.75)

pmf4 = pmf / pmf
self.assertAlmostEqual(pmf4.Mean(), 1.4291667)

def testPmfProbLess(self):
d6 = thinkstats2.Pmf(range(1,7))
self.assertEqual(d6.ProbLess(4), 0.5)
Expand Down Expand Up @@ -249,6 +281,18 @@ def testCdf(self):
self.assertEqual(cdf2.Prob(2), 0.6)
self.assertEqual(cdf2.Value(0.6), 2)

def testShift(self):
t = [1, 2, 2, 3, 5]
cdf = thinkstats2.Cdf(t)
cdf2 = cdf.Shift(1)
self.assertEqual(cdf[1], cdf2[2])

def testScale(self):
t = [1, 2, 2, 3, 5]
cdf = thinkstats2.Cdf(t)
cdf2 = cdf.Scale(2)
self.assertEqual(cdf[2], cdf2[4])

def testCdfRender(self):
t = [1, 2, 2, 3, 5]
cdf = thinkstats2.Cdf(t)
Expand Down Expand Up @@ -330,6 +374,10 @@ def testEvalNormalCdf(self):
x = thinkstats2.EvalNormalCdfInverse(0.05, 0, 1)
self.assertAlmostEqual(x, -1.64485362695)

def testEvalPoissonPmf(self):
p = thinkstats2.EvalPoissonPmf(2, 1)
self.assertAlmostEqual(p, 0.1839397205)

def testCov(self):
t = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
a = np.array(t)
Expand Down

0 comments on commit d77b6fd

Please sign in to comment.