Fixing ResampleRowsWeighted

wleftwich · Oct 13, 2014 · d77b6fd · d77b6fd
1 parent 62fb91c
commit d77b6fd
Show file tree

Hide file tree

Showing 2 changed files with 162 additions and 25 deletions.
diff --git a/code/thinkstats2.py b/code/thinkstats2.py
@@ -36,9 +36,12 @@
 import numpy as np
 import pandas
 
-import scipy.stats
-import scipy.special
-import scipy.ndimage
+import scipy
+from scipy import stats
+from scipy import special
+from scipy import ndimage
+
+from io import open
 
 ROOT2 = math.sqrt(2)
 
@@ -567,6 +570,17 @@ def Var(self, mu=None):
             var += p * (x - mu) ** 2
         return var
 
+    def Std(self, mu=None):
+        """Computes the standard deviation of a PMF.
+
+        mu: the point around which the variance is computed;
+                if omitted, computes the mean
+
+        returns: float standard deviation
+        """
+        var = self.Var(mu)
+        return math.sqrt(var)
+
     def MaximumLikelihood(self):
         """Returns the value with the highest probability.
 
@@ -651,6 +665,70 @@ def SubPmf(self, other):
                 pmf.Incr(v1 - v2, p1 * p2)
         return pmf
 
+    def __mul__(self, other):
+        """Computes the Pmf of the product of values drawn from self and other.
+
+        other: another Pmf
+
+        returns: new Pmf
+        """
+        try:
+            return self.MulPmf(other)
+        except AttributeError:
+            return self.MulConstant(other)
+
+    def MulPmf(self, other):
+        """Computes the Pmf of the diff of values drawn from self and other.
+
+        other: another Pmf
+
+        returns: new Pmf
+        """
+        pmf = Pmf()
+        for v1, p1 in self.Items():
+            for v2, p2 in other.Items():
+                pmf.Incr(v1 * v2, p1 * p2)
+        return pmf
+
+    def MulConstant(self, other):
+        """Computes the Pmf of the product of a constant and values from self.
+
+        other: a number
+
+        returns: new Pmf
+        """
+        pmf = Pmf()
+        for v1, p1 in self.Items():
+            pmf.Set(v1 * other, p1)
+        return pmf
+
+    def __div__(self, other):
+        """Computes the Pmf of the ratio of values drawn from self and other.
+
+        other: another Pmf
+
+        returns: new Pmf
+        """
+        try:
+            return self.DivPmf(other)
+        except AttributeError:
+            return self.MulConstant(1/other)
+
+    __truediv__ = __div__
+
+    def DivPmf(self, other):
+        """Computes the Pmf of the ratio of values drawn from self and other.
+
+        other: another Pmf
+
+        returns: new Pmf
+        """
+        pmf = Pmf()
+        for v1, p1 in self.Items():
+            for v2, p2 in other.Items():
+                pmf.Incr(v1 / v2, p1 * p2)
+        return pmf
+
     def Max(self, k):
         """Computes the CDF of the maximum of k selections from this dist.
 
@@ -972,7 +1050,8 @@ def Shift(self, term):
         term: how much to add
         """
         new = self.Copy()
-        new.xs = [x + term for x in self.xs]
+        # don't use +=, or else an int array + float yields int array
+        new.xs = new.xs + term
         return new
 
     def Scale(self, factor):
@@ -981,7 +1060,8 @@ def Scale(self, factor):
         factor: what to multiply by
         """
         new = self.Copy()
-        new.xs = [x * factor for x in self.xs]
+        # don't use *=, or else an int array * float yields int array
+        new.xs = new.xs * factor
         return new
 
     def Prob(self, x):
@@ -1072,8 +1152,8 @@ def Random(self):
     def Sample(self, n):
         """Generates a random sample from this distribution.
         
-        Args:
-            n: int length of the sample
+        n: int length of the sample
+        returns: NumPy array
         """
         ps = np.random.random(n)
         return self.ValueArray(ps)
@@ -1469,7 +1549,7 @@ def Density(self, xs):
 
         returns: float or NumPy array of probability density
         """
-        return scipy.stats.norm.pdf(xs, self.mu, self.sigma)
+        return stats.norm.pdf(xs, self.mu, self.sigma)
 
 
 class ExponentialPdf(Pdf):
@@ -1502,7 +1582,7 @@ def Density(self, xs):
 
         returns: float or NumPy array of probability density
         """
-        return scipy.stats.expon.pdf(xs, scale=1.0/self.lam)
+        return stats.expon.pdf(xs, scale=1.0/self.lam)
 
 
 class EstimatedPdf(Pdf):
@@ -1515,7 +1595,7 @@ def __init__(self, sample, label=None):
         label: string
         """
         self.label = label if label is not None else '_nolegend_'
-        self.kde = scipy.stats.gaussian_kde(sample)
+        self.kde = stats.gaussian_kde(sample)
         low = min(sample)
         high = max(sample)
         self.linspace = np.linspace(low, high, 101)
@@ -1642,7 +1722,7 @@ def EvalNormalPdf(x, mu, sigma):
     
     returns: float probability density
     """
-    return scipy.stats.norm.pdf(x, mu, sigma)
+    return stats.norm.pdf(x, mu, sigma)
 
 
 def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
@@ -1667,11 +1747,20 @@ def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
 
 
 def EvalBinomialPmf(k, n, p):
-    """Evaluates the binomial pmf.
+    """Evaluates the binomial PMF.
 
     Returns the probabily of k successes in n trials with probability p.
     """
-    return scipy.stats.binom.pmf(k, n, p)
+    return stats.binom.pmf(k, n, p)
+
+
+def EvalHypergeomPmf(k, N, K, n):
+    """Evaluates the hypergeometric PMF.
+
+    Returns the probabily of k successes in n trials from a population
+    N with K successes in it.
+    """
+    return stats.hypergeom.pmf(k, N, K, n)
 
 
 def EvalPoissonPmf(k, lam):
@@ -1684,8 +1773,8 @@ def EvalPoissonPmf(k, lam):
     """
     # don't use the scipy function (yet).  for lam=0 it returns NaN;
     # should be 0.0
-    # return scipy.stats.poisson.pmf(k, lam)
-    return lam ** k * math.exp(-lam) / math.factorial(k)
+    # return stats.poisson.pmf(k, lam)
+    return lam ** k * math.exp(-lam) / special.gamma(k+1)
 
 
 def MakePoissonPmf(lam, high, step=1):
@@ -1765,7 +1854,7 @@ def EvalNormalCdf(x, mu=0, sigma=1):
     Returns:
         float
     """
-    return scipy.stats.norm.cdf(x, loc=mu, scale=sigma)
+    return stats.norm.cdf(x, loc=mu, scale=sigma)
 
 
 def EvalNormalCdfInverse(p, mu=0, sigma=1):
@@ -1783,7 +1872,7 @@ def EvalNormalCdfInverse(p, mu=0, sigma=1):
     Returns:
         float
     """
-    return scipy.stats.norm.ppf(p, loc=mu, scale=sigma)
+    return stats.norm.ppf(p, loc=mu, scale=sigma)
 
 
 def EvalLognormalCdf(x, mu=0, sigma=1):
@@ -1795,7 +1884,7 @@ def EvalLognormalCdf(x, mu=0, sigma=1):
                 
     Returns: float or sequence
     """
-    return scipy.stats.lognorm.cdf(x, loc=mu, scale=sigma)
+    return stats.lognorm.cdf(x, loc=mu, scale=sigma)
 
 
 def RenderExpoCdf(lam, low, high, n=101):
@@ -1810,7 +1899,7 @@ def RenderExpoCdf(lam, low, high, n=101):
     """
     xs = np.linspace(low, high, n)
     ps = 1 - np.exp(-lam * xs)
-    #ps = scipy.stats.expon.cdf(xs, scale=1.0/lam)
+    #ps = stats.expon.cdf(xs, scale=1.0/lam)
     return xs, ps
 
 
@@ -1826,7 +1915,7 @@ def RenderNormalCdf(mu, sigma, low, high, n=101):
     returns: numpy arrays (xs, ps)
     """
     xs = np.linspace(low, high, n)
-    ps = scipy.stats.norm.cdf(xs, mu, sigma)
+    ps = stats.norm.cdf(xs, mu, sigma)
     return xs, ps
 
 
@@ -1845,7 +1934,7 @@ def RenderParetoCdf(xmin, alpha, low, high, n=50):
         low = xmin
     xs = np.linspace(low, high, n)
     ps = 1 - (xs / xmin) ** -alpha
-    #ps = scipy.stats.pareto.cdf(xs, scale=xmin, b=alpha)
+    #ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
     return xs, ps
 
 
@@ -1914,7 +2003,7 @@ def MakePmf(self, steps=101, label=None):
     def MakeCdf(self, steps=101):
         """Returns the CDF of this distribution."""
         xs = [i / (steps - 1.0) for i in range(steps)]
-        ps = [scipy.special.betainc(self.alpha, self.beta, x) for x in xs]
+        ps = [special.betainc(self.alpha, self.beta, x) for x in xs]
         cdf = Cdf(xs, ps)
         return cdf
 
@@ -2581,7 +2670,7 @@ def ResampleRowsWeighted(df, column='finalwgt'):
     returns: DataFrame
     """
     weights = df[column]
-    cdf = Pmf(weights.iteritems()).MakeCdf()
+    cdf = Cdf(dict(weights))
     indices = cdf.Sample(len(weights))
     sample = df.loc[indices]
     return sample
@@ -2626,7 +2715,7 @@ def Smooth(xs, sigma=2, **options):
     xs: sequence
     sigma: standard deviation of the filter
     """
-    return scipy.ndimage.filters.gaussian_filter1d(xs, sigma, **options)
+    return ndimage.filters.gaussian_filter1d(xs, sigma, **options)
 
 
 class HypothesisTest(object):

diff --git a/code/thinkstats2_test.py b/code/thinkstats2_test.py
@@ -1,10 +1,12 @@
 """This file contains code for use with "Think Stats",
 by Allen B. Downey, available from greenteapress.com
 
-Copyright 2010 Allen B. Downey
+Copyright 2014 Allen B. Downey
 License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
 """
 
+from __future__ import print_function, division
+
 import unittest
 import random
 
@@ -133,6 +135,36 @@ def testPmf(self):
         xs, ys = pmf.Render()
         self.assertEqual(tuple(xs), tuple(sorted(pmf.Values())))        
 
+    def testPmfAddSub(self):
+        pmf = thinkstats2.Pmf([1, 2, 3, 4, 5, 6])
+
+        pmf1 = pmf + 1
+        self.assertAlmostEqual(pmf1.Mean(), 4.5)
+
+        pmf2 = pmf + pmf
+        self.assertAlmostEqual(pmf2.Mean(), 7.0)
+
+        pmf3 = pmf - 1
+        self.assertAlmostEqual(pmf3.Mean(), 2.5)
+
+        pmf4 = pmf - pmf
+        self.assertAlmostEqual(pmf4.Mean(), 0)
+
+    def testPmfMulDiv(self):
+        pmf = thinkstats2.Pmf([1, 2, 3, 4, 5, 6])
+
+        pmf1 = pmf * 2
+        self.assertAlmostEqual(pmf1.Mean(), 7)
+
+        pmf2 = pmf * pmf
+        self.assertAlmostEqual(pmf2.Mean(), 12.25)
+
+        pmf3 = pmf / 2
+        self.assertAlmostEqual(pmf3.Mean(), 1.75)
+
+        pmf4 = pmf / pmf
+        self.assertAlmostEqual(pmf4.Mean(), 1.4291667)
+
     def testPmfProbLess(self):
         d6 = thinkstats2.Pmf(range(1,7))
         self.assertEqual(d6.ProbLess(4), 0.5)
@@ -249,6 +281,18 @@ def testCdf(self):
         self.assertEqual(cdf2.Prob(2), 0.6)
         self.assertEqual(cdf2.Value(0.6), 2)
 
+    def testShift(self):
+        t = [1, 2, 2, 3, 5]
+        cdf = thinkstats2.Cdf(t)
+        cdf2 = cdf.Shift(1)
+        self.assertEqual(cdf[1], cdf2[2])
+
+    def testScale(self):
+        t = [1, 2, 2, 3, 5]
+        cdf = thinkstats2.Cdf(t)
+        cdf2 = cdf.Scale(2)
+        self.assertEqual(cdf[2], cdf2[4])
+
     def testCdfRender(self):
         t = [1, 2, 2, 3, 5]
         cdf = thinkstats2.Cdf(t)
@@ -330,6 +374,10 @@ def testEvalNormalCdf(self):
         x = thinkstats2.EvalNormalCdfInverse(0.05, 0, 1)
         self.assertAlmostEqual(x, -1.64485362695)
 
+    def testEvalPoissonPmf(self):
+        p = thinkstats2.EvalPoissonPmf(2, 1)
+        self.assertAlmostEqual(p, 0.1839397205)
+
     def testCov(self):
         t = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
         a = np.array(t)