Skip to content

Commit

Permalink
reading data and conversion
Browse files Browse the repository at this point in the history
- code for reading mnist
- conversion to one hot encoding
  • Loading branch information
kubershahi committed Oct 23, 2021
1 parent 7fe7185 commit 08d2963
Show file tree
Hide file tree
Showing 10 changed files with 247 additions and 0 deletions.
24 changes: 24 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

# Build rules for the `nn` executable, linked from read_data.o, utils.o and nn.o.
#
# Compiler flags that can be enabled through CFLAGS:
#   -g     adds debugging information to the executable file
#   -Wall  turns on most, but not all, compiler warnings

CC = g++
# CFLAGS is commented out below, so $(CFLAGS) expands to nothing in the rules
# unless it is supplied by the environment or on the make command line.
# CFLAGS = -g -Wall

# Link step: combines the three object files into the final executable.
nn: read_data.o utils.o nn.o
$(CC) $(CFLAGS) read_data.o utils.o nn.o -o nn

# Each object file is rebuilt when its source or one of the listed headers changes.
read_data.o: read_data.cpp read_data.hpp
$(CC) $(CFLAGS) -c read_data.cpp

utils.o: utils.cpp utils.hpp
$(CC) $(CFLAGS) -c utils.cpp

nn.o: nn.cpp define.hpp read_data.hpp utils.hpp
$(CC) $(CFLAGS) -c nn.cpp


# To start over from scratch, type 'make clean'. This removes the `nn` executable,
# a file literally named `file`, old .o object files and *~ backup files:
clean:
$(RM) nn file *.o *~
20 changes: 20 additions & 0 deletions define.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef DEFINE_HPP
#define DEFINE_HPP

#include <cstdint> // std::uint64_t (previously only available through Eigen's transitive includes)

#include <Eigen/Dense>

// Scaling factor 2^13 = 8192, i.e. 13 bits.
// NOTE(review): presumably the fixed-point scale for values stored in the
// 64-bit integer matrices declared below -- confirm once it is actually used.
#define SCALING_FACTOR 8192

// Parameters for the Neural Network.
// These are declarations only; exactly one translation unit has to define them
// at namespace scope before they can be referenced.
// NOTE(review): nn.cpp currently declares locals with the same names inside
// main(), which shadow these globals rather than define them.
extern int N_train;    // Number of Training Samples
extern int N_test;     // Number of Testing Samples
extern int d;          // Number of Features
extern int m;          // Number of Output Classes
extern int B;          // Batch Size
extern int NUM_EPOCHS; // Number of Epochs

// Dynamically sized Eigen containers holding unsigned 64-bit integers.
using MatrixXi64 = Eigen::Matrix<std::uint64_t, Eigen::Dynamic, Eigen::Dynamic>; // rows x cols
using RowVectorXi64 = Eigen::Matrix<std::uint64_t, 1, Eigen::Dynamic>;           // 1 x cols
using ColVectorXi64 = Eigen::Matrix<std::uint64_t, Eigen::Dynamic, 1>;           // rows x 1

#endif
Empty file added neural_network.cpp
Empty file.
Empty file added neural_network.hpp
Empty file.
Binary file added nn
Binary file not shown.
96 changes: 96 additions & 0 deletions nn.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include <iostream>
#include <string>
#include <vector>

#include <Eigen/Dense>
#include "define.hpp"
#include "read_data.hpp"
#include "utils.hpp"

using namespace std;
using namespace Eigen;

int main()
{
cout<<"Select Dataset (enter corresponding digit):"<<endl;
cout<<"\t [1] MNIST"<<endl;

int selection = 0;
cout<<"Enter selection: ";
cin>>selection;

int N_train; // Number of Training Samples
int N_test; // Number of Testing Samples
int d; // Number of Features
int m; // Number of classes
int B; // Batch Size
int NUM_EPOCHS; // Number of Epochs

IOFormat CleanFmt(4, 0, ", ", "\n", "[", "]"); // formatting option while printing Eigen Matrices

MatrixXd X_train,Y_train,Y_train_onehot, X_test,Y_test, Y_test_onehot;

if (selection==1)
{
N_train = 10000;
N_test = 1000;
d = 784;
m = 10;
B = 128;
NUM_EPOCHS = 1;

cout<<"Reading Data:"<<endl;
vector<vector<double> > X_train_load; // dim: 60000 x 784, 60000 training samples with 784 features
vector<double> Y_train_load; // dim: 60000 x 1 , the true label of each training sample

read_data("datasets/mnist/mnist_train.csv", X_train_load, Y_train_load);

MatrixXd X_train_1(N_train, d);
MatrixXd Y_train_1(N_train, 1);

for (int i = 0; i < N_train; i++)
{
X_train_1.row(i) = Map<RowVectorXd>(&X_train_load[i][0], d)/256.0;
Y_train_1.row(i) = Map<RowVectorXd>(&Y_train_load[i],1)/10.0;
}

vector<vector<double> > X_test_load; // dim: 10000 x 784, 10000 testing samples with 784 features
vector<double> Y_test_load; // dim: 10000 x 1 , the true label of each testing sample

read_data("datasets/mnist/mnist_test.csv", X_test_load, Y_test_load); // for MNIST dataset

MatrixXd X_test_1(N_test, d); // 1000, 784
MatrixXd Y_test_1(N_test, 1); // 1000, 1

for (int i = 0; i < N_test; i++)
{
X_test_1.row(i) = Map<RowVectorXd>(&X_test_load[i][0], d)/256.0;
Y_test_1.row(i) = Map<RowVectorXd>(&Y_test_load[i],1)/10.0;
}
X_train = X_train_1;
Y_train = Y_train_1;
X_test = X_test_1;
Y_test = Y_test_1;

Y_train_onehot = onehot_Encoding(Y_train_1,m);
Y_test_onehot = onehot_Encoding(Y_test_1,m);

}



cout << X_train.rows() << "," << X_train.cols() << endl;
cout << X_test.rows() << "," << X_test.cols() << endl;
cout << Y_train.rows() << "," << Y_train.cols() << endl;
cout << Y_train_onehot.rows() << "," << Y_train_onehot.cols() << endl;
cout << Y_test.rows() << "," << Y_test.cols() << endl;
cout << Y_test_onehot.rows() << "," << Y_test_onehot.cols() << endl;


cout << Y_train.block(0,0,10,1) << endl;
cout << Y_train_onehot.block(0,0,10,10) << endl;

// cout << X_train.row(2).format(CleanFmt) <<endl;

return 0;
}
67 changes: 67 additions & 0 deletions read_data.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@

#include "read_data.hpp" // read_data header file
#include <Eigen/Dense> // Eigen Library

#include <vector> // for vector operations
#include <string> // for string operations
#include <iostream> // input output operation: cout
#include <fstream> // file stream operation: ifstream
#include <sstream> // string stream operation: istringstream
#include <algorithm> // replace functionality


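/*
Input: path to a dataset file containing comma-separated numeric values.
Output: nothing is returned; the data is stored in the caller's containers,
        a two-dimensional vector of feature rows and a vector of labels.
*/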

using namespace std;

// Reads a dataset made only of comma-separated numerical values (e.g. MNIST as CSV).
//
// Each line is expected to hold 784 feature values followed by one label.
// For every well-formed line the 784 features are appended to X as one row and
// the label is appended to Y, so X and Y always grow together and stay aligned.
// Lines that are blank, too short, or contain a value that cannot be parsed as
// a double are reported on standard output and skipped. Columns after the
// label are ignored.
//
// inputfile: path of the file to read.
// X:         receives one vector of 784 feature values per accepted line.
// Y:         receives the label of each accepted line.
//
// If the file cannot be opened a message is printed and X and Y are left untouched.
void read_data(std::string inputfile, std::vector<std::vector<double> > &X, std::vector<double> &Y) {

    const int kNumFeatures = 784;                       // feature columns that precede the label column

    std::ifstream fin(inputfile);                       // opening the inputfile
    if (!fin.is_open()) {
        std::cout << "Unable to open the specified file " << std::endl;
        return;
    }
    std::cout << "File opened successfully " << std::endl;

    int l = 0;                                          // number of lines read so far (1-based in messages)
    std::string line;                                   // holds the current line of the input file

    while (std::getline(fin, line)) {
        l++;
        std::istringstream linestream(line);            // lets the line be split on ','

        std::vector<double> row;                        // feature values of the current line
        row.reserve(kNumFeatures);
        double label = 0.0;                             // label of the current line, valid once has_label is set
        bool has_label = false;
        bool row_ok = true;                             // cleared as soon as a value fails to parse
        int val = 0;                                    // number of feature values accepted so far
        std::string row_value;                          // holds one comma-separated field

        // Stop at the label so that any trailing columns are ignored.
        while (row_ok && !has_label && std::getline(linestream, row_value, ',')) {
            try {
                const double parsed = std::stod(row_value);
                if (val < kNumFeatures) {
                    row.push_back(parsed);
                    val++;
                }
                else {
                    label = parsed;
                    has_label = true;
                }
            }
            // std::stod throws std::invalid_argument or std::out_of_range; both derive from std::logic_error.
            catch (const std::logic_error &err) {
                std::cout << "Invalid value found in the file: " << inputfile << " line: " << l
                          << " value: " << val << " (" << err.what() << ")" << std::endl;
                row_ok = false;
            }
        }

        // Store features and label together so that X[i] always belongs to Y[i].
        if (row_ok && has_label) {
            X.push_back(row);
            Y.push_back(label);
        }
        else {
            std::cout << "Skipping incomplete or malformed line: " << l << std::endl;
        }
    }
    std::cout << "Lines read successfully: " << l << std::endl; // number of lines read from the input file
}
9 changes: 9 additions & 0 deletions read_data.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#ifndef READ_DATA_HPP
#define READ_DATA_HPP

#include <vector>
#include <string>

// Reads the comma-separated numeric dataset stored at `inputfile`.
//
// inputfile: path of the file to read.
// X:         output; feature values are appended as one inner vector per line.
// Y:         output; label values are appended to this vector.
//
// Nothing is returned. Whether the file could be opened is reported on
// standard output. See read_data.cpp for the expected column layout.
void read_data(std::string inputfile, std::vector<std::vector<double> > &X, std::vector<double> &Y);

#endif
20 changes: 20 additions & 0 deletions utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "utils.hpp"

#include <cmath>
#include <iostream>
#include <stdexcept>

#include <Eigen/Dense>

using namespace std;
using namespace Eigen;

// Converts a column of scaled class labels into a one-hot matrix.
//
// X: matrix whose first column holds one label per row, stored as
//    class_index / 10 (the value is multiplied by 10 to recover the index).
// m: number of classes, i.e. the number of columns of the result.
//
// Returns an X.rows() x m matrix of zeros with a single 1 per row, placed in
// the column of that row's class index.
// Throws std::out_of_range when a recovered class index is not in [0, m).
MatrixXd onehot_Encoding(MatrixXd X, int m)
{
    MatrixXd res = MatrixXd::Zero(X.rows(), m);

    for (int i = 0; i < X.rows(); i++)
    {
        // Round instead of truncating: the product of a floating-point label and
        // 10 can land just below the intended integer, and truncation would then
        // select the previous class.
        const long index = std::lround(X(i,0) * 10);

        // An unchecked index is an Eigen assertion failure in debug builds and
        // an out-of-bounds write when assertions are disabled.
        if (index < 0 || index >= m)
        {
            throw std::out_of_range("onehot_Encoding: class index outside [0, m)");
        }
        res(i,index) = 1.0;
    }

    return res;
}
11 changes: 11 additions & 0 deletions utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef UTILS_HPP
#define UTILS_HPP

#include <Eigen/Dense>

// No using-directives here on purpose: a `using namespace` in a header leaks
// into every file that includes it. Names are qualified explicitly instead.

// Builds a one-hot matrix from a column of scaled class labels.
//
// X: matrix whose first column holds one label per row; the class index of a
//    row is its label value multiplied by 10.
// m: number of classes, i.e. the number of columns of the result.
//
// Returns an X.rows() x m matrix of zeros with a single 1 per row, placed in
// the column of that row's class index.
Eigen::MatrixXd onehot_Encoding(Eigen::MatrixXd X, int m);

#endif

0 comments on commit 08d2963

Please sign in to comment.