%%% Rosenblatt's Perceptron learning.
% This is a simple example illustrating the classic perceptron algorithm
% A linear decision function parametrized by the weigh vector "w" and bias
% paramter "b" is learned by making small adjustements to these parameters
% every time the predicted label "f_i" mismatches the true label "y_i" of
% an input data point "x_i".
% The predicted label corresponds to the following function:
% f_i = sign( < w, x_i> + b),
% where "< . , . >" denotes the dot product operation.
% The update rule is given as follows:
% if "y_i ~= f_i" (predicted label f_i different from true label y_i) then
% w <-- w + y_i*x_i; and
% b <-- b + y_i;
% else continue with next data sample
% The above process is repeated over the set of samples several times.
% If data points are linearly separable the above rule is guaranteed to
% converge in a finite number of iterations.
% 2016 Luis G Sanchez Giraldo and Odelia Schwartz
close all
clear all
clc
%% Construct a simple data set based on MNIST images
% This is a data set of handwritten digits 0 to 9
load('data/mnist_all.mat');
% Create a simple two-class problem using images of digits 0 and 5 from
% MNIST test data set
pos_class = 0;
neg_class = 5;
%% get samples from positive and negative classes
pos_data = eval(strcat('test', num2str(pos_class)));
neg_data = eval(strcat('test', num2str(neg_class)));
%% Look at some digits from the classes
% Look at different samples from each class (here plotted just the first)
figure(1);
size(pos_data)
subplot(1,2,1)
imshow((reshape(pos_data(1,:),28,28))')
size(neg_data)
subplot(1,2,2)
imshow((reshape(neg_data(1,:),28,28))')
%% Gather the samples from the two classes into one matrix X
% Note that there area 936 samples from each class, and that
% these are appended to make up 1872 samples altogether
X = double([pos_data; neg_data])/255;
size(X)
%% Label the two classes with 1 and -1 respectively
Y = [ones(size(pos_data, 1), 1); -ones(size(neg_data, 1), 1)];
size(Y)
%% Choose random samples from data. To do so:
%% permute data samples to run the learning algorithm
% and take just n_samples from the permuted data (here 60 samples)
n_samples = 60;
size(X,1)
[p_idx] = randperm(size(X, 1));
X = X(p_idx(1:n_samples), :);
Y = Y(p_idx(1:n_samples));
%% Project the data onto the means of the two classes
% First look at the mean of the two class
figure(2);
subplot(1,2,1);
imshow(reshape(mean(X(Y == 1, :)),28,28)')
% mean of second class
subplot(1,2,2);
imshow(reshape(mean(X(Y == -1, :)),28,28)')
% Now project the data
V(1, :) = mean(X(Y == 1, :))' ;
V(1, :) = V(1, :)/norm(V(1, :));
V(2, :) = mean(X(Y == -1, :));
V(2, :) = V(2, :)/norm(V(2, :));
Z = X*V';
figure(3);
gscatter(Z(:,1), Z(:,2), Y);
%% Simple Learning algorithm for Perceptron
% here we denote two classes: the positive class by label "1" and the negative
% class by label "-1."
% Any point in the plane colored as green will be classified as positive
% class and any point falling within the red region as negative class.
% Training samples are denoted by the green crosses (positive) and red dots
% (negative). A missclassified training point, that is "f_i ~= y_i" is
% marked with a circle
lr = 0.1; % Learning rate parameter
w = randn(size(Z, 2), 1); % Initial guess for the hyperplane parameters
b = 0; % bias is initially zero
max_epoch = 100; % Number of epoch (complete loops trough all data)
epoch = 1; % epoch counter
h = figure();
while epoch <= max_epoch
% loop trough all data points one time (an epoch)
for iSmp = 1:n_samples
z_i = Z(iSmp, :)';
% compute prediction with current w and b using logistic sigmoid
f_i = 1/(1 + exp(-(w'*z_i + b)));
% compute delta from cross-entropy loss
delta_i = f_i - (1 + Y(iSmp))/2;
% update w and b if missclassified
w = w - lr*delta_i*z_i;
b = b - lr*delta_i;
end
% diplay current decision hyperplane (partition of the space)
display2DPartition(Z, Y, w, b, h);
pause(0.1)
epoch = epoch + 1;
end
% EXERCISE: After trying the code for the given classes,
% try running the code again, but this time changing learning rate "lr" parameter
% What are the effects of increasing / decreasing lr?