function [cost, time, grad, preds, probs] = cnnCostCPU(theta,images,labels,numClasses,...
                                filterDim1,numFilters1, filterDim2, numFilters2,...
                                poolDim1, poolDim2, pred, lambda, costWeights)
% CNNCOSTCPU Cost, gradient and predictions of a two-stage convolutional
% network (conv -> pool -> conv -> pool -> 2 rectifier layers -> sigmoid).
%
% Inputs:
%   theta        - unrolled parameter vector, unpacked by cnnParamsToStack
%   images       - image stack; size(images,1) is used as the (square) image
%                  dimension and size(images,3) as the number of images
%   labels       - class labels encoded as 1 / 2 (the code subtracts 1 to
%                  obtain the 0/1 sigmoid target). Assumed to be a
%                  numImages x 1 column -- TODO confirm against callers
%   numClasses   - forwarded to cnnParamsToStack
%   filterDim1, numFilters1 - first convolution layer filter size / count
%   filterDim2, numFilters2 - second convolution layer filter size / count
%   poolDim1, poolDim2      - pooling region sizes after each convolution
%   pred         - (optional, default false) when true, only the forward
%                  pass is run and predictions are returned
%   lambda       - (optional, default 0) L2 weight-decay coefficient applied
%                  to the weights (not the biases)
%   costWeights  - (optional) only referenced by the disabled softmax path
%
% Outputs:
%   cost  - mean sigmoid cross-entropy plus lambda/(2*numImages) * sum(W.^2)
%   time  - toc reading taken since the start of forward propagation
%   grad  - unrolled gradient in the order
%           [Wc1; Wc2; Wd1; Wd2; Wd3; bc1; bc2; bd1; bd2; bd3];
%           0 when pred is true
%   preds - when pred is true: (probs > 0.5) + 1, transposed; otherwise 0
%   probs - sigmoid outputs of the last layer

if ~exist('pred','var') || isempty(pred)
    pred = false;
end

% lambda is used unconditionally in the cost, so give it a neutral default
% instead of failing with an undefined-variable error.
if ~exist('lambda','var') || isempty(lambda)
    lambda = 0;
end

% Only the (currently disabled) softmax path consumes costWeights.
if ~exist('costWeights', 'var') || isempty(costWeights)
    costWeights = ones(numClasses, 1);
end

% Dropout is switched off; the keep-probabilities below only matter when
% dropout is set to true.
dropout = false;
prob1st = 0.99;
prob2nd = 0.95;
prob3rd = 0.5;
prob4th = 0.5;

% Constant for the disabled hinged cross-entropy variant:
% hinge = -log(2-sqrt(3));

imageDim = size(images,1); % height/width of image
numImages = size(images,3); % number of images

[Wc1, Wc2, Wd1, Wd2, Wd3, bc1, bc2, bd1, bd2, bd3] = cnnParamsToStack(theta,imageDim,filterDim1,...
                                 numFilters1,filterDim2,numFilters2,poolDim1,...
                                 poolDim2,numClasses);

% At prediction time the parameters are scaled by the keep-probabilities
% instead of sampling masks.
if pred && dropout
    Wc2 = Wc2 * prob1st;
    bc2 = bc2 * prob1st;
    Wd1 = Wd1 * prob2nd;
    bd1 = bd1 * prob2nd;
    Wd2 = Wd2 * prob3rd;
    bd2 = bd2 * prob3rd;
    Wd3 = Wd3 * prob4th;
    bd3 = bd3 * prob4th;
end

% Masks are sampled only while training with dropout enabled.
trainDropout = ~pred && dropout;

%%======================================================================
%% Forward Propagation
tic
convDim1 = imageDim-filterDim1+1; % dimension of first convolved output
inputDim1 = (convDim1)/poolDim1;  % dimension of first subsampled output

convDim2 = inputDim1-filterDim2+1; % dimension of second convolved output
inputDim2 = (convDim2)/poolDim2;   % dimension of second subsampled output

activations1 = cnnConvolve(filterDim1, numFilters1, images, Wc1, bc1);
if trainDropout
    activations1Mask = rand(convDim1, convDim1, numFilters1) < prob1st;
    activations1 = bsxfun(@times, activations1, activations1Mask);
end

activationsPooled1 = cnnPool(poolDim1, activations1);

% Every pooled first-layer map of every image is fed to the second
% convolution as if it were a separate single-channel image.
activations2 = cnnConvolve(filterDim2, numFilters2,...
             reshape(activationsPooled1,inputDim1,inputDim1,[]),...
             Wc2, bc2);
if trainDropout
    activations2Mask = rand(convDim2, convDim2, numFilters2) < prob2nd;
    activations2 = bsxfun(@times, activations2, activations2Mask);
end
activationsPooled2 = cnnPool(poolDim2, activations2);

% Flatten the pooled features to one column per image and restore the
% (filter2, filter1, image) axes of the second-layer activations.
activationsPooled2 = reshape(activationsPooled2,[],numImages);
activations2 = reshape(activations2,convDim2,convDim2,numFilters2,numFilters1,numImages);

z2 = bsxfun(@plus, Wd1*activationsPooled2, bd1);
a2 = rectifier(z2);
if trainDropout
    a2mask = rand(size(a2, 1), 1) < prob3rd;
    a2 = bsxfun(@times, a2, a2mask);
end
z3 = bsxfun(@plus, Wd2*a2, bd2);
a3 = rectifier(z3);
if trainDropout
    a3mask = rand(size(a3, 1), 1) < prob4th;
    a3 = bsxfun(@times, a3, a3mask);
end
z4 = bsxfun(@plus, Wd3*a3, bd3);

% Disabled softmax output:
% h = bsxfun(@minus, z4, logsumexp(z4));
% probs = min(exp(h), 1);

% Sigmoid output:
probs = sigmoid(z4);

%%======================================================================
%% STEP 1b: Calculate Cost

% Disabled softmax cost:
% slabels = sparse(labels, 1:numImages, 1, numClasses, numImages, numImages);
% slabels = bsxfun(@times, slabels, costWeights);
% cost = -sum(sum(slabels .* h)) / numImages;

% Labels are 1/2; shift to a 0/1 target. The cast keeps the arithmetic in
% floating point even if labels arrive as an integer type.
targets = double(labels) - 1;

% Clamp probabilities away from exactly 0 and 1 so that a saturated sigmoid
% cannot produce 0 * log(0) = NaN (or Inf) in the cost. Values strictly
% inside (tiny, 1 - tiny) are left untouched.
tiny = eps(class(probs));
probsSafe = min(max(probs', tiny), 1 - tiny);

dataCost = sum(-targets .* log(probsSafe) - (1 - targets) .* log(1 - probsSafe)) / numImages;
weightDecay = lambda * (sum(Wc1(:).^2) + sum(Wc2(:).^2) + sum(Wd1(:).^2) + ...
                        sum(Wd2(:).^2) + sum(Wd3(:).^2)) / (2*numImages);
cost = dataCost + weightDecay;

time = toc;

% Prediction mode: return thresholded outputs without backpropagating.
preds = 0;
if pred
    preds = (probs > 0.5) + 1;
    preds = preds';
    grad = 0;
    return;
end

%%======================================================================
%% STEP 1c: Backpropagation

% Sigmoid + cross-entropy output error. Subtracting the 0/1 target directly
% avoids the cancellation of (probs - labels') + 1 for very small probs.
% Disabled alternatives:
% delta_out = bsxfun(@times, probs - slabels, costWeights);
% delta_out = arrayfun(@hingeFilter, probs - labels' + 1, z4, labels' - 1, hinge);
delta_out = probs - targets';

delta_hidden2 = (Wd3' * delta_out) .* drectifier(a3);
if trainDropout
    delta_hidden2 = bsxfun(@times, delta_hidden2, a3mask);
end
delta_hidden = (Wd2' * delta_hidden2) .* drectifier(a2);
if trainDropout
    delta_hidden = bsxfun(@times, delta_hidden, a2mask);
end

% Error at the second pooling layer, reshaped back to map form.
delta_pool2 = (Wd1' * delta_hidden);
delta_pool2 = reshape(delta_pool2,inputDim2,inputDim2,numFilters2,numFilters1,numImages);

% Upsample through the pooling layer: each pooled error is spread evenly
% over its poolDim2 x poolDim2 region (this matches mean pooling --
% presumably what cnnPool implements; confirm against cnnPool).
upsample2 = ones(poolDim2) / (poolDim2 ^ 2);
delta_cnv2 = zeros(convDim2,convDim2,numFilters2,numFilters1,numImages);
parfor imageNum=1:numImages
    for filterNum1=1:numFilters1
        for filterNum2=1:numFilters2
            delta_cnv2(:,:,filterNum2,filterNum1,imageNum) = kron(...
                delta_pool2(:,:,filterNum2,filterNum1,imageNum), upsample2);
        end
    end
end
delta_cnv2 = drectifier(activations2) .* delta_cnv2;

if trainDropout
    delta_cnv2 = bsxfun(@times, delta_cnv2, activations2Mask);
end

% Propagate through the second convolution: accumulate, per first-layer
% map, the full convolution of every second-layer filter with its error.
delta_pool1 = zeros(inputDim1, inputDim1, numFilters1, numImages);
parfor imageNum=1:numImages
    for filterNum1=1:numFilters1
        poolErr = zeros(inputDim1, inputDim1);
        for filterNum2=1:numFilters2
            poolErr = poolErr + ...
                conv2(Wc2(:,:,filterNum2), delta_cnv2(:,:,filterNum2,filterNum1,imageNum), 'full');
        end
        delta_pool1(:,:,filterNum1,imageNum) = poolErr;
    end
end

% Upsample through the first pooling layer (same scheme as above).
upsample1 = ones(poolDim1) / (poolDim1 ^ 2);
delta_cnv1 = zeros(convDim1, convDim1, numFilters1, numImages);
parfor imageNum=1:numImages
    for filterNum1=1:numFilters1
        delta_cnv1(:,:,filterNum1,imageNum) = kron(delta_pool1(:,:,filterNum1,imageNum), upsample1);
    end
end
delta_cnv1 = drectifier(activations1) .* delta_cnv1;

if trainDropout
    delta_cnv1 = bsxfun(@times, delta_cnv1, activations1Mask);
end

%%======================================================================
%% STEP 1d: Gradient Calculation

% Dense layers. Biases are not regularised. sum(..., 2) is used (rather
% than sum(x')') so the bias gradients keep one entry per unit even when
% the batch holds a single image.
Wd3_grad = delta_out * a3' ./ numImages + lambda .* Wd3 ./ numImages;
bd3_grad = sum(delta_out, 2) ./ numImages;
Wd2_grad = delta_hidden2 * a2' ./ numImages + lambda * Wd2 ./ numImages;
bd2_grad = sum(delta_hidden2, 2) ./ numImages;
Wd1_grad = delta_hidden * activationsPooled2' ./ numImages + lambda * Wd1 ./ numImages;
bd1_grad = sum(delta_hidden, 2) ./ numImages;

% Second convolution layer: correlate each pooled first-layer map with the
% matching error map and average over images.
Wc2_grad = zeros(size(Wc2));
bc2_grad = zeros(size(bc2));
parfor filter2Num=1:numFilters2
    filterGrad = zeros(size(Wc2, 1), size(Wc2, 2));
    for imageNum=1:numImages
        for filter1Num=1:numFilters1
            im = activationsPooled1(:, :, filter1Num, imageNum);
            flippedErr = rot90(delta_cnv2(:,:,filter2Num,filter1Num,imageNum), 2);
            filterGrad = filterGrad + conv2(im, flippedErr, 'valid') ./ numImages;
        end
    end
    Wc2_grad(:,:,filter2Num) = filterGrad;
    % Trailing colon folds the first-layer-filter and image axes together.
    delta_filter = delta_cnv2(:,:,filter2Num,:);
    bc2_grad(filter2Num) = sum(delta_filter(:))/numImages;
end

Wc2_grad = Wc2_grad + lambda * Wc2 ./numImages;

% First convolution layer: same scheme, correlating with the input images.
Wc1_grad = zeros(size(Wc1));
bc1_grad = zeros(size(bc1));
parfor filterNum = 1:numFilters1
    filterGrad = zeros(size(Wc1, 1), size(Wc1, 2));
    for imageNum = 1:numImages
        im = images(:, :, imageNum);
        flippedErr = rot90(delta_cnv1(:,:,filterNum, imageNum), 2);
        filterGrad = filterGrad + conv2(im, flippedErr, 'valid') ./ numImages;
    end
    Wc1_grad(:,:,filterNum) = filterGrad;
    delta_filter = delta_cnv1(:,:,filterNum,:);
    bc1_grad(filterNum) = sum(delta_filter(:))/numImages;
end

Wc1_grad = Wc1_grad + lambda * Wc1 ./numImages;

%% Unroll gradient into grad vector for minFunc
grad = [Wc1_grad(:) ; Wc2_grad(:) ; Wd1_grad(:) ; Wd2_grad(:) ; Wd3_grad(:) ; bc1_grad(:) ; bc2_grad(:) ; bd1_grad(:) ; bd2_grad(:) ; bd3_grad(:)];

time = toc;
end
