Commit 954ea2e6 authored by Wen Yao Jin's avatar Wen Yao Jin
Browse files

all_the_works

parents
File added
This diff is collapsed.
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 08 22:03:25 2017
@author: les trois petits copains
"""
import numpy as np
import matplotlib.pyplot as plt
# task 2
def data_gen(dimen, numGau, n_samples, mu, std):
    """Draw samples from a Gaussian mixture with equally-weighted components.

    For each sample a component index is drawn uniformly from the numGau
    components, then a point is drawn from that component's multivariate
    normal distribution.

    Parameters
    ----------
    dimen : int
        Ambient dimension of each sample (each row of the output).
    numGau : int
        Number of Gaussian components in the mixture.
    n_samples : int
        Number of rows to generate.
    mu : sequence of array-like
        Mean vector of each component, length numGau.
    std : sequence of array-like
        Covariance matrix of each component, length numGau.

    Returns
    -------
    numpy.ndarray of shape (n_samples, dimen)
    """
    samples = np.zeros((n_samples, dimen))
    for row in range(n_samples):
        component = np.random.randint(numGau)
        samples[row, :] = np.random.multivariate_normal(mu[component], std[component])
    return samples
def vector_gen(dimen, magnitude=None, normalize=True):
    """Generate a random vector, optionally scaled componentwise.

    Parameters
    ----------
    dimen : int
        Length of the generated vector.
    magnitude : array-like of shape (dimen,), optional
        Per-component scale. Defaults to all ones.
    normalize : bool
        If True, entries are uniform in [-magnitude, magnitude];
        if False, entries are uniform in [0, magnitude).

    Returns
    -------
    numpy.ndarray of shape (dimen,)
    """
    # Use `is None`, not `== None`: comparing a NumPy array with `== None`
    # is an elementwise comparison, which is ambiguous (and raises) when
    # used as a boolean — and callers here do pass arrays (np.repeat(...)).
    if magnitude is None:
        magnitude = np.ones(dimen)
    rand = np.random.rand(dimen)
    if normalize:
        # shift [0,1) to [-1,1) before scaling
        return (rand - 0.5) * 2 * magnitude
    else:
        return rand * magnitude
if __name__ == "__main__":
    # Experiment (task 2): sample two Gaussian mixtures P and Q, where Q is P
    # with every component mean translated by a random vector, then plot both.
    dimen = 2    # ambient dimension
    numGau = 4   # number of Gaussian components
    # magnitude of the translation vector applied to each component mean
    trans_magnitude = [1, 2, 3, 4]
    trans = [vector_gen(dimen, np.repeat(trans_magnitude[i], dimen))
             for i in range(numGau)]
    mu = [vector_gen(dimen) for i in range(numGau)]
    # diagonal covariance per component, entries drawn in [0, 1)
    std = [np.diag(vector_gen(dimen, normalize=False)) for i in range(numGau)]
    n_samples = 10000
    # task 2: Q has the same covariances as P but shifted means
    mu_trans = [m + t for m, t in zip(mu, trans)]
    data_p = data_gen(dimen, numGau, n_samples, mu=mu, std=std)
    data_q = data_gen(dimen, numGau, n_samples, mu=mu_trans, std=std)
    print(mu)
    plt.scatter(data_p[:, 0], data_p[:, 1])
    plt.scatter(data_q[:, 0], data_q[:, 1], color='r')
    plt.show()
task1:
paper[10]2.2.2
theorem 1: kn-NN estimate using Euclidean distance is strongly pointwise consistent
http://web.stanford.edu/class/ee378a/books/book1.pdf
p95: the kNN regressor has the optimal rate of convergence for class D(1,C)
ppt[11]
p11-12 best convergent rate when dimension is low
%This function implements the MMD two-sample test using a bootstrap
%approach to compute the test threshold.
%Arthur Gretton
%07/12/08
%Inputs:
% X contains dx columns, m rows. Each row is an i.i.d sample
% Y contains dy columns, m rows. Each row is an i.i.d sample
% alpha is the level of the test
% params.sig is kernel size. If -1, use median distance heuristic.
% params.shuff is number of bootstrap shuffles used to
% estimate null CDF
% params.bootForce: if this is 1, do bootstrap, otherwise
% look for previously saved threshold
%Outputs:
% thresh: test threshold for level alpha test
% testStat: test statistic: m * MMD_b (biased)
function [testStat,thresh,params] = mmdTestBoot(X,Y,alpha,params);
%MMD two-sample test; threshold estimated by bootstrap (permutation of the
%pooled sample). See the header comment above for inputs/outputs.
m=size(X,1);
%Set kernel size to median distance between points in aggregate sample
if params.sig == -1
Z = [X;Y]; %aggregate the sample
size1=size(Z,1);
%cap the median heuristic at the first 100 pooled points to bound cost
if size1>100
Zmed = Z(1:100,:);
size1 = 100;
else
Zmed = Z;
end
%pairwise squared distances via ||a||^2 + ||b||^2 - 2 a'b
G = sum((Zmed.*Zmed),2);
Q = repmat(G,1,size1);
R = repmat(G',size1,1);
dists = Q + R - 2*Zmed*Zmed';
%keep strict upper triangle only (each pair counted once, no diagonal)
dists = dists-tril(dists);
dists=reshape(dists,size1^2,1);
params.sig = sqrt(0.5*median(dists(dists>0))); %rbf_dot has factor two in kernel
end
%kernel matrices within X, within Y, and across X/Y
K = rbf_dot(X,X,params.sig);
L = rbf_dot(Y,Y,params.sig);
KL = rbf_dot(X,Y,params.sig);
%MMD statistic. Here we use biased
%v-statistic. NOTE: this is m * MMD_b
testStat = 1/m * sum(sum(K + L - KL - KL'));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%threshold is cached on disk per sample size m
threshFileName = strcat('mmdTestThresh',num2str(m));
%pooled kernel matrix: permuting its rows/columns is equivalent to
%permuting the pooled sample (kernels need not be recomputed)
Kz = [K KL; KL' L];
if ~exist(strcat(threshFileName,'.mat'),'file') || params.bootForce==1
% disp(strcat('Generating new threshold: ',threshFileName))
MMDarr = zeros(params.shuff,1);
for whichSh=1:params.shuff
%sorting uniform randoms yields a random permutation of 1:2m
[notUsed,indShuff] = sort(rand(2*m,1));
KzShuff = Kz(indShuff,indShuff);
%re-split the permuted pooled sample into two halves of size m
K = KzShuff(1:m,1:m);
L = KzShuff(m+1:2*m,m+1:2*m);
KL = KzShuff(1:m,m+1:2*m);
MMDarr(whichSh) = 1/m * sum(sum(K + L - KL - KL'));
end
MMDarr = sort(MMDarr);
%empirical (1-alpha) quantile of the permutation null
thresh = MMDarr(round((1-alpha)*params.shuff));
save(threshFileName,'thresh','MMDarr');
else
%reuse cached threshold (loads thresh and MMDarr)
load(threshFileName);
end
%This function implements the MMD two-sample test using a Gamma approximation
%to the test threshold
%Arthur Gretton
%07/12/08
%Inputs:
% X contains dx columns, m rows. Each row is an i.i.d sample
% Y contains dy columns, m rows. Each row is an i.i.d sample
% alpha is the level of the test
% params.sig is kernel size. If -1, use median distance heuristic.
%Outputs:
% thresh: test threshold for level alpha test
% testStat: test statistic m*MMD (biased)
function [testStat,thresh,params] = mmdTestGamma(X,Y,alpha,params);
%MMD two-sample test; null distribution approximated by a two-parameter
%Gamma fit to the first two moments of MMD under H0. See header above.
m=size(X,1);
%Set kernel size to median distance between points IN AGGREGATE SAMPLE
if params.sig == -1
Z = [X;Y]; %aggregate the sample
size1=size(Z,1);
%cap the median heuristic at the first 100 pooled points to bound cost
if size1>100
Zmed = Z(1:100,:);
size1 = 100;
else
Zmed = Z;
end
%pairwise squared distances via ||a||^2 + ||b||^2 - 2 a'b
G = sum((Zmed.*Zmed),2);
Q = repmat(G,1,size1);
R = repmat(G',size1,1);
dists = Q + R - 2*Zmed*Zmed';
%keep strict upper triangle only (each pair counted once, no diagonal)
dists = dists-tril(dists);
dists=reshape(dists,size1^2,1);
params.sig = sqrt(0.5*median(dists(dists>0))); %rbf_dot has factor of two in kernel
end
%MMD statistic. Here we use biased
%statistic and get Gamma fit.
K = rbf_dot(X,X,params.sig);
L = rbf_dot(Y,Y,params.sig);
KL = rbf_dot(X,Y,params.sig);
testStat = 1/m^2 * sum(sum(K + L - KL - KL'));
testStat = testStat * m; %null distribution of m*MMD_b
%mean under H0 of MMD
meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) );
%remove diagonals before computing the H0 variance (U-statistic terms)
K = K - diag(diag(K));
L = L - diag(diag(L));
KL = KL - diag(diag(KL));
%Variance under H0 of MMD
varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 ));
%match Gamma(al,bet) to the H0 mean and variance of m*MMD_b
al = meanMMD^2 / varMMD;
bet = varMMD*m / meanMMD;
thresh = icdf('gam',1-alpha,al,bet); %%%% TEST THRESHOLD
%This function implements the MMD two-sample test using Pearson curves.
%Arthur Gretton
%07/12/08
%Inputs:
% X contains dx columns, m rows. Each row is an i.i.d sample
% Y contains dy columns, m rows. Each row is an i.i.d sample
% alpha is the level of the test
% params.sig is kernel size (set to -1 to use median
% distance between inputs)
% params.numNullSamp is number of samples from null Pearson MMD
% distribution used to estimate CDF
%Outputs:
% thresh: test threshold for level alpha test
% testStat: test statistic m*MMD (unbiased)
%Uses original test code from Malte Rasch
function [testStat,thresh,params] = mmdTestPears(X,Y,alpha,params);
%MMD two-sample test; null distribution approximated by a Pearson curve
%fitted to the first four moments of the unbiased MMD. See header above.
m=size(X,1);
%Set kernel size to median distance between points IN AGGREGATE SAMPLE
if params.sig == -1
  Z = [X;Y]; %aggregate the sample
  size1=size(Z,1);
  %cap the median heuristic at the first 100 pooled points to bound cost
  if size1>100
    Zmed = Z(1:100,:);
    size1 = 100;
  else
    Zmed = Z;
  end
  %pairwise squared distances via ||a||^2 + ||b||^2 - 2 a'b
  G = sum((Zmed.*Zmed),2);
  Q = repmat(G,1,size1);
  R = repmat(G',size1,1);
  dists = Q + R - 2*Zmed*Zmed';
  %keep strict upper triangle only (each pair counted once, no diagonal)
  dists = dists-tril(dists);
  dists=reshape(dists,size1^2,1);
  params.sig = sqrt(0.5*median(dists(dists>0))); %rbf_dot has factor two in kernel
end
K = rbf_dot(X,X,params.sig);
L = rbf_dot(Y,Y,params.sig);
KL = rbf_dot(X,Y,params.sig);
%%%% Get test statistic (unbiased)
MMDf = ( K + L - KL - KL' );
MMDf = MMDf - diag(diag(MMDf)); %zero the diagonal: U-statistic excludes i==j
testStat = 1/m/(m-1) * sum(sum( MMDf ));
testStat = testStat * m; %null distribution on m*MMD
%%%% Compute test threshold
%Get 2nd moment
MMDg = MMDf.^2 ;
MMD = MMDg / m/(m-1);
preVarMMD = sum(sum(MMD));
U_m2 = preVarMMD*2/m/(m-1);
%Get 3rd moment
pre3rdMMD = 0;
for i1=1:m
  for i2 = 1:m
    if i1~=i2
      intMean = 0; %intermediate mean
      for i3 = 1:m
        %&& (short-circuit) is the idiomatic scalar conjunction; the
        %original used elementwise &
        if i3~=i1 && i3~=i2
          intMean = intMean + MMDf(i1,i3)*MMDf(i2,i3);
        end
      end
      %fixed casing: the original assigned to a new variable `intmean`
      %(lowercase) and read it on the next line, relying on MATLAB's
      %implicit variable creation
      intMean = intMean/(m-2);
      pre3rdMMD = pre3rdMMD + intMean*MMDf(i1,i2);
    end
  end
end
pre3rdMMD = pre3rdMMD/ (m*(m-1));
U_m3 = 8*(m-2)/(m*(m-1))^2 * pre3rdMMD;
%Rescale 2nd and 3rd moments, since they were computed for MMD and
%must be applied to m*MMD
U_m3 = m^3 * U_m3;
U_m2 = m^2 * U_m2;
%Approx 4th moment
kurt = 2*((U_m3)^2/U_m2^3 + 1); %kurtosis >= skewness^2 + 1
%Sample the distribution
U_moments = {0,sqrt(U_m2),U_m3/U_m2^(3/2),kurt};
[r1] = pearsrnd(U_moments{:},params.numNullSamp,1);
%Compute empirical CDF to get the 95% quantile
%keyboard
[fi,xi] = ecdf(r1);
thresh= xi(find(fi<1-alpha,1,'last'));
%This function implements the MMD two-sample test using the Gram
%matrix spectrum to estimate the coefficients of the infinite sum
%of chi^2 (which constitutes the null distribution)
%Arthur Gretton
%07/12/08
%Inputs:
% X contains dx columns, m rows. Each row is an i.i.d sample
% Y contains dy columns, m rows. Each row is an i.i.d sample
% alpha is the level of the test
% params.sig is kernel size. If -1, use median distance heuristic.
% params.numEigs is number of eigenvalues used to compute
% the null distribution.If it is -1, then use
% 2*m - 2
% params.numNullSamp is number of samples from null spectral MMD
% distribution used to estimate CDF
%Outputs:
% thresh: test threshold for level alpha test
% testStat: test statistic: m * MMD_b (biased)
%Set kernel size to median distance between points, if no kernel specified
function [testStat,thresh,params] = mmdTestSpec(X,Y,alpha,params);
%MMD two-sample test; null approximated by the weighted chi^2 sum whose
%weights are the Gram-matrix eigenvalues. See the header comment above.
m=size(X,1);
%-1 means "use all informative eigenvalues" (the centered 2m x 2m Gram
%matrix has rank at most 2m-2... TODO confirm rationale for 2m-2)
if params.numEigs==-1
params.numEigs = 2*m-2;
end
%Set kernel size to median distance between points in aggregate sample
if params.sig == -1
Z = [X;Y]; %aggregate the sample
size1=size(Z,1);
%cap the median heuristic at the first 100 pooled points to bound cost
if size1>100
Zmed = Z(1:100,:);
size1 = 100;
else
Zmed = Z;
end
%pairwise squared distances via ||a||^2 + ||b||^2 - 2 a'b
G = sum((Zmed.*Zmed),2);
Q = repmat(G,1,size1);
R = repmat(G',size1,1);
dists = Q + R - 2*Zmed*Zmed';
%keep strict upper triangle only (each pair counted once, no diagonal)
dists = dists-tril(dists);
dists=reshape(dists,size1^2,1);
params.sig = sqrt(0.5*median(dists(dists>0))); %rbf_dot has factor two in kernel
end
%kernel matrices within X, within Y, and across X/Y
K = rbf_dot(X,X,params.sig);
L = rbf_dot(Y,Y,params.sig);
KL = rbf_dot(X,Y,params.sig);
%MMD statistic. Here we use biased
%v-statistic. NOTE: this is m * MMD_b
testStat = 1/m * sum(sum(K + L - KL - KL'));
%Draw samples from null distribution
Kz = [K KL; KL' L]; %pooled Gram matrix over both samples
H = eye(2*m) - 1/(2*m)*ones(2*m,2*m); %centering matrix
Kz = H*Kz*H;
kEigs = eigs(Kz,params.numEigs); %note: this retains only largest magnitude eigenvalues
%empirical eigenvalues scaled by 1/2/m: see p. 2 Shawe-Taylor
%et al. (2005)
kEigs = 1/2/m * abs(kEigs);
numEigs = length(kEigs);
%DEBUG: plot the eigenspectrum to ensure it is not truncated
if params.plotEigs
stem(kEigs);
keyboard
end
%each null sample is a weighted sum of independent chi^2(1) variables
nullSampMMD = zeros(1,params.numNullSamp);
for whichSamp = 1:params.numNullSamp
nullSampMMD(whichSamp) = 2*sum(kEigs.*(randn(length(kEigs),1)).^2);
end
nullSampMMD = sort(nullSampMMD);
%empirical (1-alpha) quantile of the sampled null
thresh = nullSampMMD(round((1-alpha)*params.numNullSamp));
%Radial basis function inner product
%Arthur Gretton
%Pattern input format : [pattern1 ; pattern2 ; ...]
%Output : p11*p21 p11*p22 ... ; p12*p21 ...
%Deg is kernel size
function [H]=rbf_dot(patterns1,patterns2,deg);
%Gaussian (RBF) kernel matrix between two sample sets.
%Each row of patterns1/patterns2 is one sample; deg is the kernel width.
%Returns H with H(i,j) = exp(-||p1_i - p2_j||^2 / (2*deg^2)).
n1 = size(patterns1,1);
n2 = size(patterns2,1);
%squared norms of each row
sqNorms1 = sum(patterns1.^2,2);
sqNorms2 = sum(patterns2.^2,2);
%pairwise squared distances: ||a||^2 + ||b||^2 - 2 a'b
distSq = repmat(sqNorms1,1,n2) + repmat(sqNorms2',n1,1) - 2*(patterns1*patterns2');
H = exp(-distSq/2/deg^2);
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 3 of the License, or
% (at your option) any later version.
%
% Written (W) 2012 Heiko Strathmann, Dino Sejdinovic, Arthur Gretton
MATLAB code to reproduce experiments described in
Optimal kernel choice for large-scale two-sample tests.
Advances in Neural Information Processing Systems 2012.
Gretton, A., Sriperumbudur, B., Sejdinovic, D., Strathmann, H.,
Balakrishnan, S., Pontil, M., & Fukumizu, K.
Start script is standalone.m, see file for instructions.
Written by Heiko Strathmann under GPLv3, see License.txt for a copy.
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 3 of the License, or
% (at your option) any later version.
%
% Written (W) 2012 Heiko Strathmann
% for a vector of indices, returns training/validation indices the i-th fold
% of a k-fold cross validation
function [train, validation]=cross_val_fold(indices, k, i)
% Splits a vector of indices into train/validation parts for the i-th fold
% of a k-fold cross validation. Folds 1..k-1 have round(length/k) elements;
% the last fold absorbs any remainder.
fold_len=round(length(indices)/k);
first=(i-1)*fold_len+1;
if i<k
    last=i*fold_len;
else
    % final fold runs to the end to catch a possibly different fold size
    last=length(indices);
end
validation=indices(first:last);
% training set is everything outside the validation slice
remaining=indices;
remaining(first:last)=[];
train=remaining;
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 3 of the License, or
% (at your option) any later version.
%
% Written (W) 2012 Heiko Strathmann
% Produces the 'blobs' dataset which consists of a 2D grid of Gaussians which
% have a different shape
function X=gaussian_blobs(stretch, angle, blob_distance, num_blobs, num_samples)
% rotation matrix
R=[cos(angle),-sin(angle);sin(angle),cos(angle)];
% construct covariance matrix where ratio of Eigenvalues corresponds to stretch
% NOTE(review): no covariance matrix is actually formed here; the rotation R
% and the eigenvalue vector are passed separately to normal_sample, which
% presumably combines them — verify against normal_sample's definition.
X=zeros(num_samples, 2);
eigenvalues=ones(2,1);
eigenvalues(1,1)=stretch; % first axis scaled by `stretch`, second stays 1
for i=1:num_samples
% random sample from blob number
% blob centers lie on a num_blobs x num_blobs grid with spacing blob_distance
mu=[randi(num_blobs)*blob_distance; randi(num_blobs)*blob_distance];
% generate multivariate random sample with rotation and stretch
X(i,:)=normal_sample(mu, R, eigenvalues, 1);
end
%Generates an AM signal from an input sound file.
%Arthur Gretton
%01/08/12
function [sigAM] = genAMsig(sig,fs,fMultiple,fTransmitMultiple,offset,envelopeScale);
%Amplitude-modulates the input signal onto a sinusoidal carrier.
%sig: input samples; fs: input sample rate; fMultiple: carrier frequency as
%a multiple of fs; fTransmitMultiple: extra upsampling factor; offset and
%envelopeScale shape the modulation envelope.
%Upsample so the output rate is fTransmitMultiple times the carrier rate
upsampled = interp(sig,fMultiple*fTransmitMultiple,1);
%Carrier frequency
fCarrier = fs*fMultiple;
%time axis at the upsampled rate, and the carrier itself
tAxis = (1:length(upsampled))/(fs*fMultiple*fTransmitMultiple);
carrier = sin(2*pi*fCarrier*tAxis);
%amplitude modulated signal: carrier scaled by the (offset) envelope
sigAM = carrier' .*(offset + upsampled*envelopeScale);
%Generates two samples from various artificial datasets.
%The datasets are as follows:
%INPUT:
% n: number of samples
% d: dataset parameter. Dimensionality for the first two datasets,
% frequency of the sinusoid for the third.
% whichData: which toy dataset to generate. Can be one of:
%"mean": Gaussians in high dimension which have different means in a
% single dimension.
%"var": Gaussians in high dimension which have different variances in a
% single dimension.
%"sine" A 1-D Gaussian, and a Gaussian with a sinusoidal perturbation to
% the density
%dataParam.meanShift: how much the mean is shifted
%dataParam.stdScale: how much the standard deviation is scaled
%dataParam.nuArr: frequency of perturbation for 'sine'
%OUTPUT:
% Y,X each of dimension n rows * d columns. Each row is an
% i.i.d. sample. Each column is a different dimension
%Arthur Gretton
%SINE code by Bharath Sriperumbudur, taken from:
%/Users/arthur/Documents/gatsby/kernelDistributionComparison/COLT-09/Experiments/sampling.m
% sine code modified for large scale applications by Heiko Strathmann
% 9 Dec 2012
function [X,Y] = genArtificialData(n,d,whichData,dataParam)
switch whichData
case 'mean'
X = randn(n,dataParam.dim(d));
Y = randn(n,dataParam.dim(d));
Y(:,1) = Y(:,1) + ones(n,1)*dataParam.meanShift;
case 'var'
X = randn(n,dataParam.dim(d));
Y = randn(n,dataParam.dim(d));
Y(:,1) = Y(:,1) *dataParam.stdScale;
case 'sine'