% ---- Experiment configuration -------------------------------------------
% Directory that holds the per-dataset result files.
results_dir = '../results/';

% Datasets to process and the clustering methods compared on each of them.
datasets     = {'20News', 'COIL20', 'MNIST', 'USPS', 'UMIST', 'ORL'};
ndatasets    = length(datasets);

method_names = {'NCut', 'NJW', 'denoise+NJW'};
nmethods     = length(method_names);

% Sampling parameters.
cluster_sizes  = 2:5;   % numbers of classes drawn per trial
n_shuffle_sets = 1;     % sub-samples drawn per class subset
N_keep         = 200;   % maximum number of samples kept per trial
nmonte         = 20;    % random class subsets per cluster size

% Recorded statistics. full_stat_signs is +1 where larger is better and -1
% where smaller is better; full_best_funcs returns the best value along
% dimension 2 for the statistic at the same position.
full_stats       = {'ACC', 'NMI', 'VI', 'time (s)'};
full_stat_signs  = [1, 1, -1, -1];
full_best_funcs  = {@(x) max(x,[],2), ...
                    @(x) max(x,[],2), ...
                    @(x) min(x,[],2), ...
                    @(x) min(x,[],2)};
nstats           = length(full_stats);

% Run the clustering benchmark for every dataset that has no cached result
% file yet. For each cluster size, nmonte random class subsets are drawn; for
% each subset up to N_keep samples are kept, an affinity matrix A is built,
% and every method in method_names produces an embedding X that is clustered
% with kmeans. The statistics [ACC, NMI, VI, time] are stored per run.
for dataset_ii=1:ndatasets
    dataset_name=datasets{dataset_ii};
    % Skip datasets whose results have already been saved.
    if ~exist([results_dir,'denoise_clustering_',dataset_name,'.mat'],'file')
        getData(dataset_name) % helper defined elsewhere; presumably makes the data files below available -- TODO confirm
        text_data=0;
        % Each case leaves the feature matrix in `fea` (one row per sample),
        % the labels in `Y`, and the number of classes in `num_topics`.
        switch dataset_name
            case '20News'
                load ../data/20newsgroups/20news.mat
                % Collapse Y to one label per row (assumes Y is a one-hot
                % indicator matrix -- TODO confirm), then sort rows by label.
                Y=sum(bsxfun(@times,1:size(Y,2),Y),2);
                [Y,resort]=sort(Y);
                fea=X(resort,:);
                clear X
                num_topics=20;
                doc_count_min=4;
                text_data=1;
            case 'ORL'
                load ../data/orl_faces/orl.mat
                num_topics=max(fig_label);
                Y=fig_label;
                clear fig_label
                fea=reshape(images,numel(Y),[]);
                clear images;
            case 'MNIST'
                load ../data/mnist/mnist_test.mat
                Y=test_labels;
                clear test_labels
                num_topics=max(Y);
                fea=test_X;
                clear test_X;
            case 'COIL20'
                load ../data/coil-20-proc/coil20.mat
                Y=fig_label(:,1);
                clear fig_label
                num_topics=max(Y);
                fea=images;
                clear images;
            case 'UMIST'
                load('../data/umist/umist_cropped.mat','facedat')
                % facedat is a cell array with one image stack per class:
                % the label of a sample is the index of its cell, and each
                % image (slice along dim 3) becomes one row of fea.
                Y=cell2mat(cellfun(@(x,y) y*ones(size(x,3),1),facedat(:),num2cell((1:numel(facedat))'),'uni',0));
                fea=cell2mat(cellfun(@(x) reshape(double(x),[],size(x,3))',facedat(:),'uni',0));
                clear facedat
                num_topics=max(Y);
            case 'USPS'
                load '../data/usps/usps.mat'
                fea=images;
                clear images;
                Y=Y+1;% zeros are class 1
                num_topics=max(Y);
        end
        alltopics=cell(numel(cluster_sizes),1);
        allresults=cell(numel(cluster_sizes),1);
        for cluster_ii=1:numel(cluster_sizes)
            kcluster=cluster_sizes(cluster_ii);
            topic_sets=cell(nmonte,1);
            rng(880); % fixed seed: the random draws restart identically for every cluster size
            
            % stats(shuffle, class subset, method, statistic)
            stats=zeros(n_shuffle_sets,numel(topic_sets),nmethods,nstats);
            for topic_ii=1:numel(topic_sets)
                % Draw kcluster distinct classes at random.
                tt=randperm(num_topics);
                topic_sets{topic_ii}=tt(1:kcluster);
                current_topics=topic_sets{topic_ii};
                
                for shuffle_ii=1:n_shuffle_sets
                    % Keep at most N_keep random samples of the chosen
                    % classes, preserving their original order.
                    included_ind= ismember(Y,current_topics);
                    Y_subset=Y(included_ind);
                    outoforder=randperm(numel(Y_subset));
                    inc_idx = sort(outoforder(1:min(numel(Y_subset),N_keep)));
                    Y_subset=Y_subset(inc_idx);
                    
                    x=fea(included_ind,:);
                    x=x(inc_idx,:);
                    if text_data==1
                        x=x(:,sum(x>0)>=doc_count_min);% words that appear in at least doc_count_min  docs
                        % Drop documents left without any word.
                        non_empty=sum(x,2)>0;
                        x=x(non_empty,:);
                        Y_subset=Y_subset(non_empty);
                        % Weight each word by -log(document frequency),
                        % scale every document to unit Euclidean norm, and
                        % use the inner products as the affinity.
                        df=mean(x>0,1);
                        not_empty=find(df);
                        idf_w=sparse(1,not_empty,-log(df(not_empty)),1,size(x,2));
                        x=x*spdiags(idf_w.',0,size(x,2),size(x,2));
                        x=spdiags(1./sqrt(sum(x.^2,2)),0,size(x,1),size(x,1))*x;
                        A=full(x*x');
                    else
                        % Squared Euclidean distances from the Gram matrix.
                        XX=x*x';
                        D2=bsxfun(@plus,diag(XX),bsxfun(@plus,diag(XX)',-2*XX));
                        % Bandwidth grid between twice the smallest nonzero
                        % distance and the largest distance.
                        Dtemp=sort(D2);
                        sig_max = sqrt(max(Dtemp(end,:)));
                        Dtemp(Dtemp==0)=inf;
                        dt=sqrt(min(reshape(Dtemp(2:end,:),[],1)));
                        sig_min = 2*dt;
                        N_sigma=15;
                        sigmas=linspace(sig_min,sig_max,N_sigma)';
                        sigma_ii=4; % fixed choice of grid point
                        % Gaussian kernel affinity.
                        A=exp(-D2/(2*sigmas(sigma_ii)^2));
                    end
                    A=(A+A')/2;
                    A(1:1+size(A,1):end)=1;%ensure diagonal
                    
                    for method_ii=1:nmethods
                        switch method_names{method_ii}
                            case 'NCut'% Normalized cut via normalized Laplacian
                                d=sum(A,2)-diag(A); % degrees without self-loops
                                Lap=-A;
                                Lap(1:size(A,1)+1:end)=d;
                                %normalized spectral
                                nLap=bsxfun(@rdivide,bsxfun(@rdivide,Lap,sqrt(d)),sqrt(d'));
                                nLap=(nLap+nLap')/2;
                                tic;
                                [U,E]=eigs(nLap,kcluster+1,'sa');
                                e=diag(E);[~,idx]=sort(e);
                                % All kcluster+1 eigenvectors are kept and
                                % rescaled by 1/sqrt(d).
                                X=bsxfun(@rdivide,U(:,idx(1:end)),sqrt(d));
                                trun=toc;
                            case 'NJW'% NJW
                                %Ng Jordan Weiss
                                d=sum(A,2)-diag(A);
                                K=bsxfun(@rdivide,bsxfun(@rdivide,A,sqrt(d)),sqrt(d'));
                                K(1:size(A,1)+1:end)=0;
                                K=(K+K')/2;
                                tic;
                                [Ua,S]=eigs(K,kcluster,'la');
                                s=diag(S);[~,idx]=sort(s,'descend');
                                Ua=Ua(:,idx);
                                U=Ua(:,1:kcluster);
                                % Rows of the embedding are scaled to unit norm.
                                X=bsxfun(@rdivide,U,sqrt(sum(U.^2,2)));
                                trun=toc;
                            case 'denoise+NJW'% denoised to NJW
                                % informativeness-based denoising
                                tic
                                % Sizes are taken from A itself so this
                                % branch does not rely on variables left
                                % behind by another method.
                                n_pts=size(A,1);
                                W=denoiseInfo(A,n_pts,ceil(sqrt(n_pts)));
                                
                                Adn=W*W';%denoised correlation matrix
                                [I,J,S]=find(Adn.*(Adn>0)); %non-negative thresholding
                                Adn=sparse(I,J,S,size(Adn,1),size(Adn,1));
                                
                                Adn=(Adn+Adn')/2;
                                Adn(1:1+size(Adn,1):end)=1;%ensure diagonal
                                
                                %Ng Jordan Weiss
                                d=sum(Adn,2)-diag(Adn);
                                K=bsxfun(@rdivide,bsxfun(@rdivide,Adn,sqrt(d)),sqrt(d'));
                                K(1:size(A,1)+1:end)=0;
                                K=(K+K')/2;
                                
                                [Ua,S]=eigs(K,kcluster,'la');
                                s=diag(S);[~,idx]=sort(s,'descend');
                                Ua=Ua(:,idx);
                                WU=Ua(:,1:kcluster);
                                X=bsxfun(@rdivide,WU,sqrt(sum(WU.^2,2)));
                                trun=toc; % includes the denoising step
                        end
                        % Cluster the embedding and score it against the
                        % true labels (scoring helpers are defined elsewhere).
                        cluster_labels=kmeans(X,kcluster);
                        [ MInorm,VI]=compare_clusterings(Y_subset,cluster_labels);
                        ACC=brute_force_accuracy(Y_subset,cluster_labels);
                        stats(shuffle_ii,topic_ii,method_ii,:)=[ACC,MInorm,VI,trun];
                    end
                end
            end
            allresults{cluster_ii}=stats;
            % Record which classes were sampled in each trial.
            alltopics{cluster_ii}=topic_sets;
        end
        save([results_dir,'denoise_clustering_',dataset_name],'allresults','alltopics','cluster_sizes');
    end
end
%%

% Collect the saved per-dataset results into one array
%   M(cluster size, method, dataset, selected statistic, run)
% restricted to the statistics listed in use_stats (indices into full_stats).
use_stats=[1  3 ];
stat_strs=full_stats(:,use_stats);
best_perf_func=full_best_funcs(use_stats);
stat_signs=full_stat_signs(use_stats);

% Same cluster sizes as used when the experiments were run.
num_clusters=cluster_sizes;
n_sizes=numel(num_clusters);
nmethods=numel(method_names);
% Each stored stats array is (shuffle, class subset, method, statistic); the
% first two dimensions are flattened into one run axis below, so the run
% axis must hold n_shuffle_sets*nmonte entries.
M=zeros(n_sizes,nmethods,numel(datasets),numel(use_stats),n_shuffle_sets*nmonte);

for dataset_ii=1:ndatasets
    % Loads allresults (plus alltopics and cluster_sizes) into the workspace.
    load([results_dir,'denoise_clustering_',datasets{dataset_ii}])
    for cluster_ii=1:numel(allresults)
        stats=allresults{cluster_ii};
        for stat_ii=1:numel(use_stats)
            stat_idx=use_stats(stat_ii);
            X=stats(:,:,:,stat_idx);
            % runs-by-methods, transposed to methods-by-runs to match M.
            M(cluster_ii,:,dataset_ii,stat_ii,:)=reshape(X,[],nmethods)';
        end
    end
end

%%
% Render one table per selected statistic, both as plain text on the console
% and as LaTeX (a tabular inside a minipage). Each cell shows
% "mean (mean gap to the best method)" over the runs. A cell is set in bold
% when one-sided signrank tests against every other method all give p < 0.05
% in the favourable direction. The concatenated LaTeX is written to
% Table_9.txt in results_dir.
C=numel(method_names);
nstats=numel(best_perf_func);
table_codes=cell(nstats,1); % LaTeX source of each statistic's table
Sall=[];
for div=1:nstats
    % Absolute gap between each method and the best method of the same run.
    diff_func=@(X) abs(bsxfun(@minus,best_perf_func{div}(X),X));
    stat_name=cell2mat(stat_strs(:,div)');
    
    % Only the first table carries the dataset and k columns.
    if div==1
        Ctemp=C+2;
    else
        Ctemp=C;
    end
    S=sprintf('\\begin{minipage}{.45\\textwidth}\\begin{tabular}{%s}\n',repmat('l ',1,Ctemp));
    S=cat(2,S,sprintf('\\multicolumn{%i}{c}{%s}\\\\ \\hline\n',Ctemp,stat_name));
    fprintf('%s\n%s\n',stat_name,repmat('-',1,100));
    
    % Header row.
    if div==1
        str=sprintf('%20s &','&$k$');
    else
        str=[];
    end
    for col=1:C
        str=cat(2,str,sprintf('%20s &',method_names{col}));
    end
    fprintf('%s\n',strrep(str(1:end-1),'&','|'));
    S=cat(2,S,sprintf('%s\\\\\n',str(1:end-1)));
    S=cat(2,S,sprintf('\\hline\n'));
    fprintf('%s\n',repmat('-',1,100));
    for dataset_ii=1:ndatasets
        row_headers=cell(n_sizes,nstats);
        % Averages over the run dimension (dim 5 of M).
        M1=squeeze(mean(M(:,:,dataset_ii,:,:),5));
        M3=squeeze(mean(diff_func(M(:,:,dataset_ii,:,:)),5));
        % Row labels: dataset name on the first row only, then k.
        for row=1:n_sizes
            if row==1
                str=sprintf('%15s &',datasets{dataset_ii});
            else
                str=sprintf('%15s &','');
            end
            str=sprintf('%s %2i',str,num_clusters(row));
            row_headers{row,div}=str;
        end
        for row=1:n_sizes
            if div==1
                str=sprintf('%20s &',row_headers{row,div});
            else
                str=[];
            end
            for col=1:C
                val1=M1(row,col,div);
                val3=M3(row,col,div);
                a_str=sprintf('%.2f (%.2f)',val1,val3);
                % Every cell is tested. (An earlier pre-filter that only
                % tested the cell with the best rounded mean and smallest
                % rounded gap had been switched off in the original code.)
                x_this=squeeze(M(row,col,dataset_ii,div,:));
                x_rest=squeeze(M(row,:,dataset_ii,div,:))';
                pval=ones(1,C-1);
                col_ii=1;
                for col2=[1:col-1,col+1:C]
                    % Tail follows the statistic's sign: 'right' when
                    % larger is better, 'left' when smaller is better.
                    if stat_signs(div)==1
                        pval(col_ii) = signrank(x_this,x_rest(:,col2),'tail','right');
                    else
                        pval(col_ii) = signrank(x_this,x_rest(:,col2),'tail','left');
                    end
                    col_ii=col_ii+1;
                end
                if all(pval < 0.05)
                    a_str=sprintf('\\textbf{%s}',a_str);
                end
                str=cat(2,str,sprintf('%20s &',a_str));
            end
            fprintf('%s\n',strrep(str(1:end-1),'&','|'));
            str=strrep(str,'%','\%'); % escape percent signs for LaTeX
            S=cat(2,S,sprintf('%s\\\\\n',str(1:end-1)));
        end
        S=cat(2,S,sprintf('\\hline\n'));
        fprintf('%s\n',repmat('-',1,100));
    end
    S=cat(2,S,sprintf('\\end{tabular}'));
    S=cat(2,S,sprintf('\\end{minipage}\\hfill\n'));
    table_codes{div}=S;
    Sall=cat(2,Sall,S);
end
out_file=[results_dir,'Table_9.txt'];
fid=fopen(out_file,'w');
if fid<0
    error('Could not open %s for writing.',out_file);
end
fwrite(fid,Sall);
fclose(fid);
