Gaussian curve fit

added function to curve fit data to a gaussian curve and added missing #include in basicfxn
bytefoot · Mar 31, 2024 · bd4915f · bd4915f
1 parent 5b4239e
commit bd4915f
Show file tree

Hide file tree

Showing 5 changed files with 184 additions and 18 deletions.
diff --git a/include/diceforge_distributions.h b/include/diceforge_distributions.h
@@ -136,6 +136,16 @@ namespace DiceForge {
             real_t get_mu() const;
             /// @brief Returns standard deviation of the distribution
             real_t get_sigma() const;
+
+            /// @brief Fits the given sample points (x, y=pdf(x)) to a Gaussian distribution using non-linear least squares regression
+            /// following Gauss-Newton methods
+            /// @param x list of x coordinates
+            /// @param y list of corresponding y coordinates where y = pdf(x)
+            /// @param max_iter maximum iterations to attempt to fit the data (higher to try for better fits)
+            /// @param epsilon minimum acceptable error tolerance while attempting to fit the data (smaller to try for better fits)
+            /// @return A Gaussian distribution fit to the given sample points
+            Gaussian fitToGaussian(const std::vector<real_t>& x, const std::vector<real_t>& y, int max_iter = 10000, real_t epsilon = 1e-6);
+
     };
 
     /// @brief DiceForge::Maxwell - A Continuous Probability Distribution (Maxwell) 

diff --git a/src/Core/basicfxn.cpp b/src/Core/basicfxn.cpp
@@ -1,4 +1,5 @@
 #include "basicfxn.h"
+#include <limits>
 
 namespace DiceForge
 {

diff --git a/src/Distributions/Continuous/Gaussian/Gaussian.cpp b/src/Distributions/Continuous/Gaussian/Gaussian.cpp
@@ -1,60 +1,61 @@
 #include "Gaussian.h"
-#include <vector>
+#include "basicfxn.h"
 
 namespace DiceForge
-{            
+{
     Gaussian::Gaussian(real_t mu, real_t sigma)
-    : mu(mu), sigma(sigma)
+        : mu(mu), sigma(sigma)
     {
         if (sigma < std::numeric_limits<real_t>().epsilon())
         {
             std::cerr << "Error :"
-                        "\n\tDiceForge::Gaussian::Gaussian(real_t mu, real_t sigma) : "
-                        "\n\t\tValue of sigma must be positive\n" << std::endl;
+                         "\n\tDiceForge::Gaussian::Gaussian(real_t mu, real_t sigma) : "
+                         "\n\t\tValue of sigma must be positive\n"
+                      << std::endl;
             exit(EXIT_FAILURE);
         }
     }
 
-    real_t Gaussian::next(real_t r1, real_t r2) 
+    real_t Gaussian::next(real_t r1, real_t r2)
     {
         return (sqrt(-2.0 * log(r1)) * cos(2 * M_PI * r2)) * sigma + mu;
     }
 
-    real_t Gaussian::variance() const 
+    real_t Gaussian::variance() const
     {
         return sigma * sigma;
     }
 
-    real_t Gaussian::expectation() const 
+    real_t Gaussian::expectation() const
     {
         return mu;
     }
 
-    real_t Gaussian::minValue() const 
+    real_t Gaussian::minValue() const
     {
         return std::numeric_limits<real_t>().min();
     }
 
-    real_t Gaussian::maxValue() const 
+    real_t Gaussian::maxValue() const
     {
         return std::numeric_limits<real_t>().max();
     }
 
-    real_t Gaussian::pdf(real_t x) const 
+    real_t Gaussian::pdf(real_t x) const
     {
-       return exp(-0.5 * pow((x - mu) / sigma, 2) )/ (sqrt(2.0 * M_PI) * sigma);
+        return exp(-0.5 * pow((x - mu) / sigma, 2)) / (sqrt(2.0 * M_PI) * sigma);
     }
 
-    real_t Gaussian::cdf(real_t x) const 
-    {   
+    real_t Gaussian::cdf(real_t x) const
+    {
         real_t erf = myerf((x - mu) / (sigma * sqrt(2)));
-        return 0.5 * (1 + erf );
+        return 0.5 * (1 + erf);
     }
 
     real_t Gaussian::myerf(real_t x) const
     {
-        real_t erf; 
-        erf = tanh((2/sqrt(M_PI)) *(x + (11 / 123) * pow(x, 3)));
+        real_t erf;
+        erf = tanh((2 / sqrt(M_PI)) * (x + (11 / 123) * pow(x, 3)));
         return erf;
     }
 
@@ -67,4 +68,108 @@ namespace DiceForge
     {
         return sigma;
     }
-} 
+
+    Gaussian fitToGaussian(const std::vector<real_t> &x, const std::vector<real_t> &y, int max_iter, real_t epsilon)
+    {
+        if (x.size() != y.size())
+        {
+            throw "Number of x-coordinates and y-coordinates provided in the data do not match!";
+        }
+
+        const int N = x.size();
+
+        // Initial guessing of mu, sigma
+        real_t mu = 1, sigma = 1;
+
+        real_t ysum = 0;
+        real_t y2sum = 0;
+        for (size_t i = 0; i < N; i++)
+        {
+            ysum += y[i];
+            y2sum += y[i] * y[i];
+        }
+        real_t mu_guess = ysum / N;
+        real_t sigma_guess = sqrt((y2sum / N) - mu_guess * mu_guess);
+
+        real_t ymax = -INFINITY;
+        for (size_t i = 0; i < N; i++)
+        {
+            // Check for outliers
+            if (y[i] > ymax && y[i] - mu_guess < 3 * sigma_guess)
+            {
+                ymax = y[i];
+
+                // Guess mu as the x value corresponding to maximum y
+                mu = x[i];
+            }
+        }
+        //computing max and min x values from given data
+        real_t xmax = x[0], xmin = x[0];
+        for (size_t i = 0; i < N; i++)
+        {
+            if (x[i] > xmax)
+                xmax = x[i];
+            if (x[i] < xmin)
+                xmin = x[i];
+        }
+        //setting initial guess of sigma
+        sigma = (xmax - xmin) / 6;
+
+        // Start iterative updation
+        for (size_t iter = 0; iter < max_iter; iter++)
+        {
+            // Compute Jacobian matrix and error vector
+            matrix_t J(N, 2); // Jacobian matrix
+            matrix_t R(N, 1); // Error vector
+
+            for (size_t i = 0; i < N; i++)
+            {
+                real_t pdf = exp(-(x[i] - mu) * (x[i] - mu) / (2 * sigma * sigma)) / (sqrt(2 * M_PI) * sigma);
+                // std::cout << sigma << std::endl;
+                //  Partial derivatives of the Gaussian function with respect to mu and sigma
+                real_t dpdf_dmu = (x[i] - mu) / (sigma * sigma) * pdf;
+                real_t dpdf_dsigma = ((x[i] - mu) * (x[i] - mu) / (sigma * sigma * sigma) - 1 / sigma) * pdf;
+
+                J[i][0] = -dpdf_dmu;
+                J[i][1] = -dpdf_dsigma;
+
+                R[i][0] = y[i] - pdf;
+            }
+
+            // Compute the transpose of the Jacobian matrix
+            matrix_t JT = J.transpose();
+
+            // Compute the move direction using the Gauss-Newton method
+            matrix_t d = inverse2x2(JT * J) * JT * R;
+
+            // Stop when error minimization is too little
+            if (fabs(d[0][0]) < epsilon && fabs(d[1][0]) < epsilon)
+            {
+                break;
+            }
+
+            real_t alpha = 0.001;
+
+            real_t pred_mu, pred_sigma;
+            pred_mu = mu - alpha * d[0][0];
+            pred_sigma = sigma - alpha * d[1][0];
+
+            // prevent possible incorrect jumping
+            if ((pred_mu / mu > 10 || pred_sigma / sigma > 10 || pred_mu / mu < 0.1 || pred_sigma / sigma < 0.1))
+            {
+                alpha = alpha * 0.0001;
+            }
+
+            // Gauss-Newton method
+            mu = mu - alpha * d[0][0];
+            sigma = sigma - alpha * d[1][0];
+        }
+
+        if (sigma < 0)
+        {
+            throw "Could not fit data to Gaussian!";
+        }
+
+        return Gaussian(mu, sigma);
+    }
+}
diff --git a/src/Distributions/Continuous/Gaussian/Gaussian.h b/src/Distributions/Continuous/Gaussian/Gaussian.h
@@ -38,6 +38,14 @@ namespace DiceForge {
             /// @brief Returns standard deviation of the distribution
             real_t get_sigma() const;
     };
+    /// @brief Fits the given sample points (x, y=pdf(x)) to a Gaussian distribution using non-linear least squares regression
+    /// following Gauss-Newton methods
+    /// @param x list of x coordinates
+    /// @param y list of corresponding y coordinates where y = pdf(x)
+    /// @param max_iter maximum iterations to attempt to fit the data (higher to try for better fits)
+    /// @param epsilon minimum acceptable error tolerance while attempting to fit the data (smaller to try for better fits)
+    /// @return A Gaussian distribution fit to the given sample points
+    Gaussian fitToGaussian(const std::vector<real_t>& x, const std::vector<real_t>& y, int max_iter = 10000, real_t epsilon = 1e-6);
 }
 
 

diff --git a/testing/test_fitting_gaussian.cpp b/testing/test_fitting_gaussian.cpp
@@ -0,0 +1,42 @@
+#include "diceforge.h"
+#include <iostream>
+#include <fstream>
+
+#define NUM_POINTS 50
+#define NOISE_AMP 0.01
+
+int main(int argc, char const *argv[])
+{
+    DiceForge::XORShift32 rng = DiceForge::XORShift32(time(NULL));
+
+    DiceForge::Gaussian noise_gen = DiceForge::Gaussian(1);
+    DiceForge::Gaussian gaussian = DiceForge::Gaussian(23, 1);
+
+    std::cout << "original: mu = " << gaussian.get_mu() << ", sigma = " << gaussian.get_sigma() << std::endl;
+
+    std::vector<double> x;
+    std::vector<double> y;
+
+    std::ofstream out = std::ofstream("noisy_data_gaussian.dat");
+    for (int i = 0; i < NUM_POINTS; i++)
+    {
+        double x1 = rng.next_in_crange(10, 50);
+        double y1 = gaussian.pdf(x1) + NOISE_AMP * M_1_PI * noise_gen.next(rng.next_unit(), rng.next_unit()) / gaussian.get_sigma();
+        x.push_back(x1);
+        y.push_back(y1);
+        out << x1 << " " << y1 << std::endl;
+    }
+    out.close();
+
+    DiceForge::Gaussian fit = DiceForge::fitToGaussian(x, y, 10000, 0.0000001);
+
+    std::cout << "fit: mu = " << fit.get_mu() << ", sigma = " << fit.get_sigma() << std::endl;
+
+    FILE* gnuplot = popen("gnuplot -persist", "w");
+    fprintf(gnuplot, "plot 'noisy_data_gaussian.dat' title 'samples', exp(-(x - %f) * (x - %f) / (2 * %f * %f)) / (%f) title 'Gaussian'\n",
+    fit.get_mu(), fit.get_mu(), fit.get_sigma(), fit.get_sigma(), sqrt(2 * M_PI) * fit.get_sigma());
+    //fprintf(gnuplot, "plot 'noisy_data_gaussian.dat' title 'samples'\n");
+    fclose(gnuplot);
+
+    return 0;
+}