% Laboratory in Oceanography - Data and Methods % SMAST, UMass Dartmouth % % Examples from statistics toolbox: % chi-squared random test, bootstrap, z-test, t-test, ANOVA % % Written by Miles A. Sundermeyer, 3/9/09 % set a breakpoint - use F10 to step line by line in debug mode keyboard %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Statistics Toolbox / Descriptive Statistics %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Examples of moments % create a chi squared random data set to show skewness & kurtosis figure(1) clf ndata = 1000; disttype = {'randn'; 'chi2rnd'; 'chi2rnd'}; for n=1:3 if n==1 eval(['a = ',disttype{n},'(1,ndata);']); elseif n==2 eval(['a = ',disttype{n},'(1,1,ndata);']); elseif n==3 eval(['a = ',disttype{n},'(1,1,ndata);']); a = -a+max(a); end subplot(2,3,n) hist(a) axis tight xlabel({['mean=',num2str(mean(a),2)]; ... ['var=',num2str(var(a,1),2),'=',num2str(moment(a,2),2)]; ... ['skewness=',num2str(skewness(a),2),'=',num2str(moment(a,3)/std(a,1)^3,2)]; ... ['kurtosis=',num2str(kurtosis(a),2),'=',num2str(moment(a,4)/std(a,1)^4,2)]}); end %print -djpeg stats_toolbox_1.jpg %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Example of Bootstrap method figure(1) clf; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % suppose we have 1000000 points representing the 'true' distribution of data ntrue = 1000000; atrue = randn(1,ntrue); subplot(2,2,1) hist(atrue,20); xlabel('value') ylabel('# occurances') title('"True" mean') xlim(5*[-1 1]); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % consider a subsample of 100 points nsample = 100; asample = atrue(ceil(rand(1,nsample)*ntrue)); subplot(2,2,2) hist(asample,10); xlabel('value') ylabel('# occurances') title('"Sample" mean') xlim(5*[-1 1]); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % compute 'true' mean and 95% confidence limits truemean = mean(atrue); true95pct = (1.96 * std(atrue)/sqrt(ntrue)); disp('True mean, 95% confidence interval'); disp([' ',num2str([truemean truemean+true95pct*[-1 1]],2)]); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % wish to compute mean(asample), with 95% confidence limits samplemean = mean(asample); sample95pct = (1.96 * std(asample)/sqrt(nsample)); disp('Sample mean, 95% confidence interval'); disp([' ',num2str([samplemean samplemean+sample95pct*[-1 1]],2)]); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % plot the means and std errors subplot(2,2,1) hold on plot(truemean*[1 1],get(gca,'ylim'),'r-') plot([1;1]*(truemean+true95pct*[-1 1]), get(gca,'ylim')'*[1 1],'m-') subplot(2,2,2) hold on plot(samplemean*[1 1],get(gca,'ylim'),'r-') plot([1;1]*(samplemean+sample95pct*[-1 1]), get(gca,'ylim')'*[1 1],'m-') %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% subplot(2,2,3) meanbs = bootstrp(1000,'mean',asample); hist(meanbs,20); hold on xlabel('value') ylabel('# occurances') title('Bootstrap mean') xlim(1*[-1 1]); ci = bootci(1000,@mean,asample); disp('Bootstrap mean, 95% confidence interval'); disp([' ',num2str([mean(meanbs) ci'],2)]); for n=1:3 subplot(2,2,n) plot(mean(meanbs)*[1 1],get(gca,'ylim'),'c--') plot([1;1]*ci', get(gca,'ylim')'*[1 1],'b-') end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Distribution Plots % Example of normally distributd data ... figure(1) clf x = normrnd(10,1,25,1); normplot(x) % vs when distribution is not normal ... x = exprnd(10,100,1); normplot(x) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Quantile-quantile plots % two data sets from same distribution x = poissrnd(10,50,1); y = poissrnd(5,100,1); qqplot(x,y); % vs. when underlying distributions are not the same x = normrnd(5,1,100,1); y = wblrnd(2,0.5,100,1); qqplot(x,y); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Cumulative distribution plots y = evrnd(0,3,100,1); cdfplot(y) hold on x = -20:0.1:10; f = evcdf(x,0,3); plot(x,f,'m') legend('Empirical','Theoretical','Location','NW') %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Example of t-test and z-test on DO data % Are the DO data of the different Green Pond sites from the same distribution? GreenUpper = load('GreenPondUpper15min'); GreenMiddle = load('GreenPondMiddle15min'); GreenLowest = load('GreenPondLowest15min'); % get all the data to the same time base jdaymax = min([max(GreenUpper.jday) max(GreenMiddle.jday) max(GreenLowest.jday)]); jdaymin = max([min(GreenUpper.jday) min(GreenMiddle.jday) min(GreenLowest.jday)]); keep = {find(GreenUpper.jday>=jdaymin & GreenUpper.jday<=jdaymax) ... find(GreenMiddle.jday>=jdaymin & GreenMiddle.jday<=jdaymax) ... find(GreenLowest.jday>=jdaymin & GreenLowest.jday<=jdaymax)}; % pull out just the DO records %DO = [GreenUpper.DO(keep{1}) GreenMiddle.DO(keep{2}) GreenLowest.DO(keep{3})]; DO = [GreenUpper.DO(keep{1}) GreenLowest.DO(keep{3})]; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% figure(1) clf normplot(DO) % perform Lillifors test for Gaussianity lillietest(GreenUpper.DO(keep{1})) lillietest(GreenLowest.DO(keep{3})) % both=1 => reject null hypothesis that data are Gaussian (note tails) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % now test to see if the two time series came from distributions with equal means % i.e., same distribution if also same variances % H=0 % indicates that the null hypothesis ("means are equal") cannot be % rejected at the 5% significance level. % H=1 indicates that the null % hypothesis can be rejected at the 5% level. By default the data are assumed % to come from normal distributions with unknown, but equal, variances. X % and Y can have different lengths. tail = 'both'; % both=means not equal, right=mean x>mean y, left vice versa alpha = 0.05; % significance level % unequal - specifies that the variance of the two distributions are also unequal [h,p,ci,stats] = ttest2(GreenUpper.DO,GreenLowest.DO, alpha, tail, 'unequal') %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % ttest2 yielded the following: % h = % 1 % % p = % p-value % 1.2826e-018 % % ci = % confidence interval on difference of population means % 0.3478 % 0.5458 % % stats = % tstat: 8.8484 % t statistic % df: 4.2166e+003 % degrees freedom % sd: 2.2899 % pooled standard deviation, or individual when not equal % 1.2509 % % h=1, confidence interval does not include zero => reject null % hypothesis that the two data sets are from same distribution, % NOTE: for equal variances, df = total obsv's - 2 mean(DO) abs(diff(mean(DO))) size(DO(:)); % plot these using boxplot figure(2) clf boxplot(DO,'notch','on','labels',{'Green Upper', 'Green Lowest'}) xlabel('Site') ylabel('DO (\mu/l)') %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Run through Matlab Statistics Toolbox example of ANOVA load hogg hogg %hogg = % % 24 14 11 7 19 % 15 7 9 7 24 % 21 12 7 4 19 % 27 17 13 7 15 % 33 14 12 12 10 % 23 16 18 18 20 [p,tbl,stats] = anova1(hogg) % pvalue << 0.05 => values are significantly different from one another %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % to see which are different, check this with multcompare load hogg [p,tbl,stats] = anova1(hogg); [c,m] = multcompare(stats) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % SEE ALSO Light_vs_DO example ... Light_vs_DO/Light_DO.m %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Example: Two-Way ANOVA % Determine the effect of car model and factory on the mileage rating of cars. load mileage mileage %mileage = % % 33.3000 34.5000 37.4000 % 33.4000 34.8000 36.8000 % 32.9000 33.8000 37.6000 % 32.6000 33.4000 36.6000 % 32.5000 33.7000 37.0000 % 33.0000 33.9000 36.7000 % % There are three models of cars (columns) and two factories (rows). % There are six rows in mileage instead of two since each factory provides % three cars of each model for the study - data from the first factory are % in the first three rows, the second factory is in the last three rows. cars = 3; [p,tbl,stats] = anova2(mileage,cars)