# Learning R: Harvard-Smithsonian Center for Astrophysics tutorial # Eric Feigelson # January 2014 # Adapted from R scripts of # Modern Statistical Methods for Astronomy With R Applications # Eric D. Feigelson & G. Jogesh Babu # http://astrostatistics.psu.edu/MSMA # These scripts may be cut and pasted into any R console. # They are self-contained. # I. Query the session environment and make a vector of three numbers # A hash mark (#) denotes comments in an R script # A semi-colon (;) has the same action as a carriage return () # All R functions use () to list arguments and parameters getwd() # find working directory. # setwd('/Users/myself/newdir') # set your personal working directory setwd('/Users/e5f/Desktop/CASt/Consulting/CfA_2014') sessionInfo() # interact with the host computer system citation() # quote this citation in any publication using R # II. Create and characterize a vector a <- c(33, 44, 92, 58) # combine numbers into a vector length(a) ls() # list names of objects in your environment class(a) # state the `class' of an R object str(a) # state the structure of an R object a # state the contents of an R object write(file='output', a) # write an ASCII file into the working directory save(file='output_bin', a) # write a binary file Save(file='output_bin', a) # error. R is case sensitive. mean(a) ; median(a) min(a) ; max(a) sum(a) ; sum(a>40) summary(a) which.max(a) match(44, a) a[1:4] ; a[3] # Make a data.frame, a 2D array with column names df.a <- data.frame(cbind(seq(1:4),a)) names(df.a) <- c('rows', 'value') df.a help(seq) # III. Simple math and a plot 5 + 3 ; 5-3 ; 5*3 ; 5/3 ; 5^3 x <- 5 ; y <- 3 x+y z <- seq(0.0, 0.5, 0.1) z H_0 <- 68 # km/s/Mpc, Planck value speed.light <- 3.0E5 dist <- speed.light*z / H_0 dist trunc(dist[4]) ; round(dist[4]) format(dist[4], digits=3, scientific=T) sprintf('%7.2e',dist[4]) sin(0) ; sin(pi/2) ang <- seq(0, pi/2, length=30) sin(ang) plot(ang, sin(ang)) plot(ang, log10(sin(ang)), pch=20, cex=1.2, xlab='Angle (radian)', ylab='log(sine)', main='Trig exercise', cex.lab=1.2, cex.main=1.7, col=2) lines(ang, log10(sin(ang)), lwd=2, lty=2, col='green3') dev.copy2eps(file='sine.eps') help(par) # IV. An astrophysical calculation using functions Omega_m <- (0.022068 + 0.12029) / (H_0/100)^2 Omega_Lambda <- 0.6825 E.H0 <- function(redshift) {sqrt(Omega_m*(1+redshift)^3 + Omega_Lambda)} lum.dist <- function(redshift) { dist = (speed.light/H_0) * integrate(E.H0, 0, redshift)\$value return(dist) } lum.dist(0.1) ; lum.dist(1.0) # V. Create a 500x2 bivariate dataset where the X values are evenly # distributed between 0 and 3 and the Y values have a nonlinear # dependence on X with heteroscedastic Gaussian scatter help(seq) # Learn some R functions. Use help files frequently! help(sample) help(rnorm) help(rbind) set.seed(1) x <- sample(seq(0.01, 3, length.out=500)) y <- 0.5*x + 0.3^(x^2) + rnorm(500, mean=0, sd=(0.05*(1+x^2))) # VI. Examine, summarize and plot univariate distributions summary(x) ; summary(y) # Summarizes properties of an R object str(x) # State the structure of an R object rbind(summary(x),summary(y)) # Concatenate rows. cbind concatenates columns. help(par) # An essential help file for formatting figures help(boxplot) # A common univariate plot par(mfrow=c(1,2)) # Set up a two-panel figure boxplot(x, notch=T, main='Boxplot for X') boxplot(y, notch=T, pch=20, cex=0.5, main='Boxplot for Y') dev.copy2eps(file='box.eps') # Copy device contents to an EPS formatted file. Also dev.copy2pdf. hist(y, breaks='FD', main='', xlab='Yvar', col='yellow') # Histogram with Freedman-Diaconis bin widths; see help(nclass.FD) qqnorm(y, pch=20, cex=0.3) # Quantile function of y compared to normal distribution qqline(y) # Expected relationship if y is normal plot(ecdf(x),pch=20,cex=0.3,main='',ylab='EDF',xlab='') plot(ecdf(y),pch=20,cex=0.3,add=T) text(2.0,0.5,"X") ; text(1.4,0.8,"Y") dev.copy2eps(file='edf.eps') # VII. Put X and Y into a data frame help(cbind) ; help(as.data.frame) ; help(names) # Learn more R functions xy <- cbind(x, y) ; str(xy) # Here xy is an `array' of numbers xy <- as.data.frame(xy) names(xy) <- c('Xvar', 'Yvar') ; str(xy) # Now xy is a 'data.frame' with names associated with columns attach(xy) # Attach allows use of variable names rather than column numbers. ls() # List current contents of environment xy[1:10,] # Filter the data.frame: first 10 rows xy[xy[,1]>2,] # Rows where the first column value exceeds 2 # VIII. Sampling and bootstrapping trials <- sample.int(length(Xvar),10) # 10 random rows xy[trials,] trials <- sample.int(length(Xvar),10, replace=T) # 10 bootstrap resamples xy[trials,] median(Yvar) # Estimate the standard error of the median of Yvar? mad(Yvar) / sqrt(500) # Median absolute deviation estimate of median s.e. library(boot) med <- function(x,ind) median(x[ind]) boot(Yvar, med, R=1000) # Bootstrap estimate of median s.e. hist(boot(Yvar, med, R=1000)\$t, breaks=50, xlab='Bootstrap median of Yvar') # IX. Bivariate plots and correlation tests help(plot) par(mfrow=c(1,2)) plot(xy, pch=20, cex=0.5) # Scatterplot. See help(points) for symbol shapes. cex specifies symbol size. plot(log10(xy), pch=20, cex=0.5, xlab='log(Xvar)', ylab='log(Yvar)') # Note `log' means `log_e' not `log_10' in R. par(mfrow=c(1,2)) length(x[x>2]) # State length of a vector. Use `dim' for an array or data.frame. cor.test(x[x>2],y[x>2], method='pearson') # Parametric hypothesis test for bivariate correlation cor.test(x[x>2],y[x>2], method='kendall') # Nonparametric hypothesis test for bivariate correlation # Note that parametric test is vulnerable to arbitrary variable transformations # while nonparametric test is not cor.test(log10(x[x>2]),log10(y[x>2]), method='pearson') cor.test(log10(x[x>2]),log10(y[x>2]), method='kendall')