Skip to content

Commit

Permalink
High-Dimensional Feature Selection of Medical Data
Browse files Browse the repository at this point in the history
Optimized LASSO feature selection
  • Loading branch information
SuperXiang authored May 3, 2017
1 parent 8185347 commit c9597ec
Show file tree
Hide file tree
Showing 16 changed files with 6,446 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
%% Read dataset and preprocessing
% csvread offsets/ranges are zero-based: columns 0-128 are used as the
% predictors (x) and columns 129-140 as the 12 response columns (y).
x=csvread('train_data1.csv',0,0,[0 0 699 128]);
y=csvread('train_data1.csv',0,129,[0 129 699 140]);
y(y==-1) = 0;                 % recode -1 as 0 so the responses are 0/1

numLabels = size(y,2);

%% T test
% For every response column, run a Welch two-sample t-test per feature
% between the rows labelled 0 and the rows labelled 1, and add up the
% p-values over all response columns.
pSum = zeros(1,size(x,2));
for i=1:numLabels
    nowY = y(:,i);
    X0 = x(nowY==0,:);
    X1 = x(nowY==1,:);
    [~,p] = ttest2(X0,X1,'Vartype','unequal');
    pSum = pSum+p;
end

% Keep the 50 features with the smallest summed p-value.
[~,featureIdxSortbyP]= sort(pSum);
x=x(:,featureIdxSortbyP(1:50));

%% LASSO regression feature selection
opts = statset('UseParallel',true);
featureSum = zeros(numLabels,1);        % number of non-zero coefficients per label
featureWeight = zeros(size(x,2),1);     % accumulated coefficient magnitude per feature
for i=1:numLabels
    [B,S] = lassoglm(x,y(:,i),'binomial','DFmax',30,'CV',10,'Alpha',0.5,'Options',opts);
    coef = B(:,S.IndexMinDeviance);
    featureSum(i,1) = sum(coef~=0);
    % Accumulate magnitudes: adding signed coefficients lets opposite
    % signs from different labels cancel each other out.
    featureWeight = featureWeight+abs(coef);
end

%% Filter
% Keep as many features as LASSO selected on average, choosing the ones
% with the largest accumulated magnitude. The count is stored in
% numSelected (not in a variable called "sum") so the builtin sum stays
% callable on a re-run and in later scripts.
numSelected = floor(sum(featureSum)/numLabels);
[~,featureIdxSortbyLasso]= sort(featureWeight,'descend');
x=x(:,featureIdxSortbyLasso(1:numSelected));
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
%% Read dataset
% csvread offsets/ranges are zero-based. Training file: columns 0-128 are
% used as predictors, columns 129-140 as the 12 response columns.
% Test file: column 0 is kept as the row identifier, the rest as predictors.
x=csvread('train_data1.csv',0,0,[0 0 699 128]);
y=csvread('train_data1.csv',0,129,[0 129 699 140]);
testX = csvread('test_feature_data1.csv',0,1);
number = csvread('test_feature_data1.csv',0,0,[0 0 203 0]);

%% Preprocessing
y(y==-1) = 0;                 % recode -1 as 0 so the responses are 0/1

numTrain   = size(x,1);       % training rows (700 with the ranges above)
numTest    = size(testX,1);   % test rows
numLabels  = size(y,2);       % independent binary sub-problems
numRepeats = 5;               % networks trained and averaged per sub-problem

% NOTE(review): each network is trained on a single target row; confirm
% that patternnet's default output layer behaves as intended in that case.
net = patternnet(15);

net.trainFcn = 'trainrp';
net.trainParam.max_fail = 20;
net.trainParam.epochs = 1000;

net.divideParam.trainRatio = 70/100;
net.divideParam.valRatio = 15/100;
net.divideParam.testRatio = 15/100;

results = zeros(numTest,numLabels+1);
results(:,1)=number;
% Accumulated absolute training residuals. Rows are re-shuffled before
% every training run, so the positions do not map to fixed samples; only
% the total is meaningful.
errors = zeros(1,numTrain);

%% Divide into 12 sub problems
for i=1:numLabels

    % T Test feature selection
    nowY = y(:,i);
    X0 = x(nowY==0,:);
    X1 = x(nowY==1,:);
    [~,p] = ttest2(X0,X1,'Vartype','unequal');
    [~,featureIdxSortbyP]= sort(p);
    nowX = x(:,featureIdxSortbyP(1:70));

    % LASSO regression feature selection
    [B,S] = lassoglm(nowX,nowY,'binomial','DFmax',30,'CV',10,'Alpha',0.5);
    model = B(:,S.IndexMinDeviance)~=0;
    nowX = nowX(:,model);

    % Apply the same two selection steps to the test predictors
    nowTestX = testX(:,featureIdxSortbyP(1:70));
    nowTestX = nowTestX(:,model);

    % Train and use NN array
    outputs = zeros(1,numTest);
    for j=1:numRepeats
        % Shuffle predictors and responses with the same permutation.
        % The permutation is not stored in a variable called "rand", so
        % the builtin rand is not shadowed in the workspace.
        perm = randperm(numTrain);
        nowX = nowX(perm,:);
        nowY = nowY(perm,:);
        nowNet = train(net,nowX',nowY');
        % Take the absolute value per run: summing signed residuals first
        % lets positive and negative errors cancel and under-reports the
        % mean absolute error.
        errors = errors+abs(gsubtract(nowY',nowNet(nowX')));
        outputs = outputs+nowNet(nowTestX');
    end

    % Generate result: average of the repeated networks
    results(:,i+1) = outputs'/numRepeats;
end

%% Output result
% Mean absolute training error over all samples, labels and repeats
disp(sum(errors)/(numTrain*numLabels*numRepeats));
csvwrite('result1.csv',results);
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
%% Read dataset and preprocessing
% csvread offsets/ranges are zero-based: columns 0-409 go to x, column 410 to y.
x = csvread('train_data2.csv', 0, 0,   [0 0   3999 409]);
y = csvread('train_data2.csv', 0, 410, [0 410 3999 410]);
rowsClass0 = x(y == 0, :);
rowsClass1 = x(y == 1, :);
rowsClass2 = x(y == 2, :);

%% T Test
% Welch t-test per feature for each pair of classes; the three p-values
% are added and features are ranked by the total (smallest first).
[~, pPair01] = ttest2(rowsClass0, rowsClass1, 'Vartype', 'unequal');
[~, pPair12] = ttest2(rowsClass1, rowsClass2, 'Vartype', 'unequal');
[~, pPair02] = ttest2(rowsClass0, rowsClass2, 'Vartype', 'unequal');
pTotal = pPair01 + pPair12 + pPair02;
[~, featureIdxSortbyP] = sort(pTotal);

%% LASSO regression feature selection

% Start from the 100 best-ranked features.
x = x(:, featureIdxSortbyP(1:100));
opts = statset('UseParallel', true);
% Linear Regression
[coefPath, fitInfo] = lasso(x, y, 'DFmax', 50, 'CV', 10, 'Alpha', 0.5, 'Options', opts);
% Poisson Regression
% [coefPath, fitInfo] = lassoglm(x, y, 'poisson', 'DFmax', 50, 'CV', 10, 'Alpha', 0.5, 'Options', opts);
% Keep the features with a non-zero coefficient at the Index1SE lambda.
model = coefPath(:, fitInfo.Index1SE) ~= 0;
x = x(:, model);

clear rowsClass0 rowsClass1 rowsClass2 pTotal pPair01 pPair12 pPair02 coefPath fitInfo

%% Process class label
% One indicator column per class value 0, 1, 2.
y = double(bsxfun(@eq, y, 0:2));

%% Build NN parameters
% Hidden layer size is half the number of selected features (rounded down).
numSelected = nnz(model);
net = patternnet(floor(numSelected / 2));

net.trainFcn = 'trainrp';
net.trainParam.max_fail = 20;
net.trainParam.epochs = 1000;

net.divideParam.trainRatio = 70/100;
net.divideParam.valRatio = 15/100;
net.divideParam.testRatio = 15/100;

%% Train NN array
trainErr = zeros(1, 20);
nets = cell(1, 20);

for i = 1:20
    % Shuffle the sample order (x and y with the same permutation)
    shuffle = randperm(4000);
    x = x(shuffle, :);
    y = y(shuffle, :);

    nets{i} = train(net, x', y');
    neti = nets{i};
    % Mean absolute difference between targets and network outputs
    residual = gsubtract(y', neti(x'));
    trainErr(i) = sum(sum(abs(residual))) / 12000;
end

%% Select 10 best NN by error information

[~, ranking] = sort(trainErr);
best = ranking(1:10);
nets = nets(best);
disp(mean(trainErr(best)));
clear residual trainErr i net neti numSelected opts shuffle ranking best

%% Read test data
% Column 0 is kept as the row identifier; the remaining columns get the
% same two selection steps as the training data.
testX = csvread('test_feature_data2.csv', 0, 1);
number = csvread('test_feature_data2.csv', 0, 0, [0 0 1030 0]);
testX = testX(:, featureIdxSortbyP(1:100));
testX = testX(:, model);

%% Classify by NN
% Average the outputs of the 10 kept networks, then convert the winning
% row index back to a 0-based class value.
testY = zeros(3, 1031);
for i = 1:10
    neti = nets{i};
    testY = testY + neti(testX');
end
testY = (vec2ind(testY / 10) - 1)';
testY = [number, testY];
csvwrite('result2.csv', testY);
12 changes: 12 additions & 0 deletions High-Dimensional Feature Selection of Medical Data/Code/lasso.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
%% Read dataset
% csvread offsets/ranges are zero-based: columns 0-409 go to x, column 410 to y.
x=csvread('train_data2.csv',0,0,[0 0 3999 409]);
y=csvread('train_data2.csv',0,410,[0 410 3999 410]);

%% LASSO regression feature selection

opts = statset('UseParallel',true);
% Linear Regression (disabled)
% This fit was followed immediately by the Poisson fit below, which
% overwrote B and S before they were read, so the cross-validated linear
% fit was wasted work. In addition, this script is itself named lasso.m,
% which takes precedence over the toolbox function of the same name when
% the script's folder is the current folder or ahead on the path.
% NOTE(review): consider renaming this file so that other scripts calling
% lasso(...) from the same folder are not affected.
% [B,S] = lasso(x,y,'DFmax',100,'CV',10,'Alpha',0.5,'Options',opts);
% Poisson Regression
[B,S] = lassoglm(x,y,'poisson','DFmax',100,'CV',10,'Alpha',0.5,'Options',opts);
% Features with a non-zero coefficient at the Index1SE lambda
model = B(:,S.Index1SE)~=0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
%% Read dataset and preprocessing
% csvread offsets/ranges are zero-based: columns 0-409 go to x, column 410 to y.
x = csvread('train_data2.csv', 0, 0,   [0 0   3999 409]);
y = csvread('train_data2.csv', 0, 410, [0 410 3999 410]);
% Split the rows by class value (0, 1, 2)
X1 = x(y == 0, :);
X2 = x(y == 1, :);
X3 = x(y == 2, :);

%% T test
% Welch t-test per feature for each pair of classes. Note the naming:
% p13 holds the X2-vs-X3 comparison and p23 the X1-vs-X3 comparison.
[~, p12] = ttest2(X1, X2, 'Vartype', 'unequal');
[~, p13] = ttest2(X2, X3, 'Vartype', 'unequal');
[~, p23] = ttest2(X1, X3, 'Vartype', 'unequal');
% Rank features by the summed p-value, smallest first
p = p12 + p13 + p23;
[~, featureIdxSortbyP] = sort(p);

%% LASSO regression feature selection

% Start from the 100 best-ranked features
x = x(:, featureIdxSortbyP(1:100));
opts = statset('UseParallel', true);
% Linear Regression
[B, S] = lasso(x, y, 'DFmax', 100, 'CV', 10, 'Alpha', 0.5, 'Options', opts);
% Poisson Regression
% [B, S] = lassoglm(x, y, 'poisson', 'DFmax', 100, 'CV', 10, 'Alpha', 0.5, 'Options', opts);
% Keep the features with a non-zero coefficient at the Index1SE lambda
model = (B(:, S.Index1SE) ~= 0);
x = x(:, model);

%% SVM
14 changes: 14 additions & 0 deletions High-Dimensional Feature Selection of Medical Data/Code/useNN.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
%% Read dataset
% Expects featureIdxSortbyP, model and net to exist in the workspace
% already; this script does not create them.

% Column 0 is kept as the row identifier, the remaining columns as predictors.
testX = csvread('test_feature_data2.csv', 0, 1);
number = csvread('test_feature_data2.csv', 0, 0, [0 0 1030 0]);
% Apply the two feature-selection steps in one statement
testX = testX(:, featureIdxSortbyP(1:100));
testX = testX(:, model);

%% Neural Network
% Run the network and convert the winning row index to a 0-based class value
testY = (vec2ind(net(testX')) - 1)';
testY = [number, testY];

%% Output data
csvwrite('result2.csv', testY);
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
1,0.82,0.53,0.81,0.77,0.12,0.53,0.81,0.47,0.12,0.53,0.83,0.11
2,0.72,0.53,0.81,0.71,0.1,0.51,0.74,0.77,0.72,0.93,0.83,0.11
3,0.68,0.53,0.81,0.77,0.12,0.52,0.81,0.37,0.12,0.53,0.83,0.11
4,0.62,0.53,0.83,0.77,0.12,0.53,0.63,0.27,0.72,0.63,0.83,0.11
5,0.21,0.53,0.81,0.77,0.12,0.56,0.81,0.77,0.12,0.53,0.83,0.11
6,0.12,0.53,0.81,0.77,0.11,0.53,0.22,0.17,0.92,0.43,0.83,0.11
7,0.22,0.53,0.82,0.76,0.12,0.53,0.29,0.67,0.12,0.53,0.83,0.11
8,0.12,0.53,0.81,0.77,0.14,0.58,0.71,0.37,0.12,0.73,0.83,0.11
9,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.83,0.11
10,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.83,0.11
11,0.82,0.53,0.81,0.77,0.12,0.53,0.81,0.47,0.12,0.53,0.83,0.11
12,0.72,0.53,0.81,0.71,0.1,0.51,0.74,0.77,0.72,0.93,0.83,0.11
13,0.68,0.53,0.81,0.77,0.12,0.52,0.81,0.37,0.12,0.53,0.83,0.11
14,0.62,0.53,0.83,0.77,0.12,0.53,0.63,0.27,0.72,0.63,0.83,0.11
15,0.21,0.53,0.81,0.77,0.12,0.56,0.81,0.77,0.12,0.53,0.83,0.11
16,0.12,0.53,0.81,0.77,0.11,0.53,0.22,0.17,0.92,0.43,0.83,0.11
17,0.22,0.53,0.82,0.76,0.12,0.53,0.29,0.67,0.12,0.53,0.83,0.11
18,0.12,0.53,0.81,0.77,0.14,0.58,0.71,0.37,0.12,0.73,0.83,0.11
19,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.83,0.11
20,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.83,0.11
21,0.82,0.53,0.81,0.77,0.12,0.53,0.81,0.47,0.12,0.53,0.83,0.11
22,0.72,0.53,0.81,0.71,0.1,0.51,0.74,0.77,0.72,0.93,0.75,0.11
23,0.68,0.53,0.81,0.77,0.12,0.52,0.81,0.37,0.12,0.53,0.75,0.11
24,0.62,0.53,0.83,0.77,0.12,0.53,0.63,0.27,0.72,0.63,0.75,0.11
25,0.21,0.53,0.81,0.77,0.12,0.56,0.81,0.77,0.12,0.53,0.75,0.11
26,0.12,0.53,0.81,0.77,0.11,0.53,0.22,0.17,0.92,0.43,0.75,0.11
27,0.22,0.53,0.82,0.76,0.12,0.53,0.29,0.67,0.12,0.53,0.75,0.11
28,0.12,0.53,0.81,0.77,0.14,0.58,0.71,0.37,0.12,0.73,0.75,0.11
29,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.75,0.11
30,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.75,0.11
31,0.82,0.53,0.81,0.77,0.12,0.53,0.81,0.47,0.12,0.53,0.75,0.11
32,0.72,0.53,0.81,0.71,0.1,0.51,0.74,0.77,0.72,0.93,0.75,0.11
33,0.68,0.53,0.81,0.77,0.12,0.52,0.81,0.37,0.12,0.53,0.75,0.11
34,0.62,0.53,0.83,0.77,0.12,0.53,0.63,0.27,0.72,0.63,0.75,0.11
35,0.21,0.53,0.81,0.77,0.12,0.56,0.81,0.77,0.12,0.53,0.75,0.11
36,0.12,0.53,0.81,0.77,0.11,0.53,0.22,0.17,0.92,0.43,0.75,0.11
37,0.22,0.53,0.82,0.76,0.12,0.53,0.29,0.67,0.12,0.53,0.75,0.11
38,0.12,0.53,0.81,0.77,0.14,0.58,0.71,0.37,0.12,0.73,0.64,0.11
39,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.64,0.11
40,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.64,0.11
41,0.82,0.53,0.81,0.77,0.12,0.53,0.81,0.47,0.12,0.53,0.64,0.11
42,0.72,0.53,0.81,0.71,0.1,0.51,0.74,0.77,0.72,0.93,0.64,0.11
43,0.68,0.53,0.81,0.77,0.12,0.52,0.81,0.37,0.12,0.53,0.64,0.11
44,0.62,0.53,0.83,0.77,0.12,0.53,0.63,0.27,0.72,0.63,0.64,0.11
45,0.21,0.53,0.81,0.77,0.12,0.56,0.81,0.77,0.12,0.53,0.64,0.11
46,0.12,0.53,0.81,0.77,0.11,0.53,0.22,0.17,0.92,0.43,0.64,0.11
47,0.22,0.53,0.82,0.76,0.12,0.53,0.29,0.67,0.12,0.53,0.64,0.11
48,0.12,0.53,0.81,0.77,0.14,0.58,0.71,0.37,0.12,0.73,0.64,0.11
49,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.64,0.11
50,0.92,0.53,0.81,0.77,0.12,0.53,0.81,0.87,0.12,0.83,0.64,0.11
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
1,1
2,1
3,1
4,0
5,1
6,0
7,2
8,1
9,0
10,2
11,1
12,0
13,1
14,0
15,1
16,1
17,1
18,1
19,1
20,1
21,1
22,0
23,1
24,0
25,2
26,1
27,0
28,0
29,2
30,1
31,1
32,2
33,1
34,1
35,1
36,1
37,1
38,1
39,1
40,1
41,1
42,1
43,1
44,1
45,1
46,2
47,2
48,0
49,1
50,1
51,1
52,1
53,0
54,1
55,1
56,1
57,0
58,1
59,0
60,0
61,2
62,1
63,1
64,1
65,1
66,0
67,1
68,1
69,0
70,0
71,1
72,1
73,2
74,1
75,0
76,1
77,1
78,1
79,1
80,0
81,0
82,1
83,1
84,0
85,1
86,1
87,2
88,2
89,1
90,1
91,2
92,0
93,1
94,1
95,2
96,0
97,1
98,1
99,1
100,1
Loading

0 comments on commit c9597ec

Please sign in to comment.