lv_lda.m
function [mdl] = lv_lda(cfg)
% LDA as in Matthias' paper (FDA) for binary classification, primal form.
% Reference built-in equivalent:
% Mdl = fitcdiscr(TRAIN,TRAIN_GROUP);
% [outclass,posterior] = predict(Mdl,TEST);
% mdl is passed in empty during training.
% Uses the class-mean difference with shrinkage regularisation, or GED, and under
% the LDA assumptions computes the softmax of the class likelihoods.
%%%%
% cfg.use_GED=0;
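% A minimal usage sketch (the variable names trn_data, trn_labels and tst_data
% are illustrative; only the cfg fields below are read by this function):
%   cfg = []; cfg.mdl = []; cfg.phase = 'trn';
%   cfg.TRAIN = trn_data;           % trials x features
%   cfg.TRAIN_GROUP = trn_labels;   % vector of class labels 1 and 2
%   mdl = lv_lda(cfg);              % shrinkage LDA; set cfg.use_GED for the GED branch
%   cfg.mdl = mdl; cfg.phase = 'predict';
%   cfg.TEST = tst_data;            % trials x features
%   mdl = lv_lda(cfg);              % returns mdl.dval, mdl.outclass, mdl.posterior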
mdl = cfg.mdl;
if strcmp(cfg.phase,'trn')
if ~isfield(cfg,'use_GED')
TRAIN = cfg.TRAIN;
TRAIN_GROUP = cfg.TRAIN_GROUP;
sz = size(TRAIN); n=sz(1);
id1 = TRAIN_GROUP==1; id2 = TRAIN_GROUP==2;
n1 = sum(id1); n2 = sum(id2);
X = [ TRAIN(id1,:)-repmat(mean(TRAIN(id1,:),1),sum(id1),1); TRAIN(id2,:)-repmat(mean(TRAIN(id2,:),1),sum(id2),1)];
X = X - repmat(mean(X,1),n,1);
% Fisher discriminant analysis
% Ledoit-Wolf estimate of the shrinkage parameter lambda, which controls how
% strongly the covariance is pulled towards a scaled-identity (spherical) target:
% lambda = 1 gives a fully spherical estimate, lambda = 0 keeps the empirical covariance
S = (X'*X)/n; p1=0;
for i=1:n, p1 = p1 + norm(X(i,:)' * X(i,:) - S,'fro')^2; end
p2 = (n^2) * (trace(S^2)-(trace(S)^2)/sz(2));
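% note: trace(S^2)-trace(S)^2/sz(2) equals norm(S - trace(S)/sz(2)*eye(sz(2)),'fro')^2,
% so p2 measures (scaled by n^2) how far S is from its spherical target, while p1
% measures how much the per-trial outer products scatter around S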
lambda = p1/p2; lambda = min(max(lambda,0),1); % clip so that lambda always stays within [0 1] (avoids over- and undershooting)
Swithin = n1*(cov(TRAIN(id1,:),1)) + n2*(cov(TRAIN(id2,:),1));
Swithin_hat = (1-lambda)*Swithin + lambda * trace(Swithin)/sz(2) * eye(sz(2));
mdl.weights = (Swithin_hat^-1)*(mean(TRAIN(id1,:),1)' - mean(TRAIN(id2,:),1)');
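% an equivalent and numerically more stable alternative would be the backslash solve
% Swithin_hat \ (mean(TRAIN(id1,:),1)' - mean(TRAIN(id2,:),1)') instead of the explicit inverse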
mdl.bias = -mdl.weights' * ((mean(TRAIN(id1,:),1)')+(mean(TRAIN(id2,:),1)'))/2; % if the data were centred the impact of this bias would be small
% the sign of the decision value determines the class, and since the weights were built
% from mu1-mu2 class 1 is assumed to lie on the positive side; if the data are not centred
% the boundary will not sit at zero, so it has to be shifted: the midpoint of the class
% means, (mu1+mu2)/2, is used as the threshold, and projecting it onto the weights and
% negating gives the bias that puts the boundary back at zero, so that the sign of Z
% again reflects the class
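% compact form: Z(x) = w'*x - w'*(mu1+mu2)/2, which is zero exactly at the midpoint
% between the two class means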
mdl.mus = [mean(TRAIN(id1,:),1); mean(TRAIN(id2,:),1)];
mdl.Swithin_hat = Swithin_hat; mdl.sz=sz;
mdl.cov1 = cov(TRAIN(id1,:),1); mdl.cov2 = cov(TRAIN(id2,:),1);
else % GED: generalised eigendecomposition
TRAIN = cfg.TRAIN;
TRAIN_GROUP = cfg.TRAIN_GROUP;
sz = size(TRAIN); n=sz(1);
id1 = TRAIN_GROUP==1; id2 = TRAIN_GROUP==2;
Sbetween = cov( [mean(TRAIN(id1,:),1);mean(TRAIN(id2,:),1)] ,1 );
Swithin= cov(TRAIN(id1,:),1) + cov(TRAIN(id2,:),1);
if isfield(cfg,'tst_noise_for_within'), Swithin = cov(cfg.TEST,1); end % uses the test data for the within covariance;
% strictly, the true labels of the test set would have to be revealed to compute it correctly
[W,val] = eig(Sbetween,Swithin); % generalized decomposition
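% eig(Sbetween,Swithin) solves Sbetween*W = Swithin*W*diag(val), so the eigenvector with the
% largest eigenvalue maximises the Rayleigh quotient (w'*Sbetween*w)/(w'*Swithin*w)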
[~,id] = sort(diag(val),'descend');
mdl.weights = W(:,id(1));
% determine the class direction:
% if the projection of the class-1 mean is negative we flip the sign of the weights,
% because that means class 1 currently sits on the negative side of the boundary, while
% mdl.outclass = 1*(Z>0)+2*(Z<0) assumes class 1 is on the positive side;
% this was not needed in the previous branch because there the weights were built from the
% mean difference mu1-mu2, which already points from class 2 towards class 1 and therefore
% fixes the sign of the result
if mean(TRAIN(id1,:),1)*mdl.weights < 0, mdl.weights=-mdl.weights; end
mdl.bias = -mdl.weights' * ((mean(TRAIN(id1,:),1)')+(mean(TRAIN(id2,:),1)'))/2; % as above, if the data were centred the impact of this bias would be small
mdl.mus = [mean(TRAIN(id1,:),1); mean(TRAIN(id2,:),1)];
mdl.Swithin_hat = Swithin*n; mdl.sz=sz; % the *n is undone in the predict phase, where Swithin_hat is divided by n
mdl.cov1 = cov(TRAIN(id1,:),1); mdl.cov2 = cov(TRAIN(id2,:),1);
end
end
if strcmp(cfg.phase,'predict')
TEST = cfg.TEST;
Z = (TEST*mdl.weights) + mdl.bias;
mdl.dval = Z;
mdl.outclass = 1*(Z>0)+2*(Z<0);
% posterior probability: the likelihood comes from the class-conditional distribution,
% and the likelihoods are then normalised (softmax). The assumptions are a multivariate
% Gaussian per class whose covariance is the within-class covariance, shared by both
% classes. The difference from a naive Bayes classifier is that naive Bayes assumes
% independence of the features, takes the pdf of every feature and multiplies them to
% get the likelihood of a class, then normalises across classes to get the posterior,
% whereas LDA assumes a full multivariate Gaussian with the same covariance matrix for
% both classes
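% with equal class priors this reduces to p(class1|x) = N(x;mu1,S)/(N(x;mu1,S)+N(x;mu2,S)),
% which is what is computed below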
likelihood1 = mvnpdf(cfg.TEST, mdl.mus(1,:), mdl.Swithin_hat/mdl.sz(1)); % divide by the
% no. of trials because the within covariance was accumulated (summed) over trials
likelihood2 = mvnpdf(cfg.TEST, mdl.mus(2,:), mdl.Swithin_hat/mdl.sz(1)); % same scaling as above
% normalise the likelihoods (softmax) to get the posterior
mdl.posterior = [likelihood1./(likelihood1+likelihood2) likelihood2./(likelihood1+likelihood2)];
mdl.outclass = 1*(mdl.posterior(:,1)>mdl.posterior(:,2))+2*(mdl.posterior(:,1)<mdl.posterior(:,2));
% showing that exp(Z) is not the posterior for LDA (for a neural network with a softmax output it would be)
% post = zeros(size(TEST,1),2);
% post(Z>0,1) = exp(Z(Z>0)); post(Z<0,2) = exp(Z(Z<0));
% mdl.posterior = post./sum(post,2);
% % % t-distribution LDA instead of the normal distribution
% % df=mdl.sz(1)-1; % degrees of freedom .. equals a Gaussian when df=inf and a Cauchy distribution when df=1 .. a parameter to optimise
% % likelihood1 = mvtpdf(bsxfun(@minus,cfg.TEST,mdl.mus(1,:)),mdl.cov1,df); % the class mean is subtracted because mvtpdf assumes a zero-mean t-distribution
% % % and in equation 2.159 the mean was subtracted
% % likelihood2 = mvtpdf(bsxfun(@minus,cfg.TEST,mdl.mus(2,:)),mdl.cov2,df);
% % mdl.posterior = [likelihood1./(likelihood1+likelihood2) likelihood2./(likelihood1+likelihood2)];
% % mdl.outclass = 1*(mdl.posterior(:,1)>mdl.posterior(:,2))+2*(mdl.posterior(:,1)<mdl.posterior(:,2));
end
end