# Density estimation and local regression # Eric Feigelson # Harvard-Smithsonian Center for Astrophysics tutorial, January 2014 # Regenerate bivariate dataset from first tutorial x <- sample(seq(0.01, 3, length.out=500)) y <- 0.5*x + 0.3^(x^2) + rnorm(500, mean=0, sd=(0.05*(1+x^2))) xy <- cbind(x, y) # I. Kernel density estimator with visualizations library(KernSmooth) dpik(x) ; dpik(y) smxy <- bkde2D(xy, bandwidth=c(dpik(x),dpik(y))) image(smxy\$x1, sm_xy\$x2, smxy\$fhat, col=topo.colors(30), xlab='Xvar', ylab='Yvar', cex.lab=1.2) contour(smxy\$x1, smxy\$x2, smxy\$fhat, add=T) persp(smxy\$x1,smxy\$x2, smxy\$fhat, theta=100, phi=40, shade=0.6, col='green1', xlab='Xvar', ylab='Yvar', zlab='Density') # II. Least-squares polynomial regression logx <- log10(x) ; logx2 <- (log10(x))^2 ; logx3 <- (log10(x))^3 # Set up independent variables logx4 <- (log10(x))^4 ; logx5 <- (log10(x))^5 yfit <- lm(log10(y) ~ logx + logx2 + logx3 + logx4 + logx5) # `lm' is an important function for linear modeling, '~' indicates a formula str(yfit) # Note the complicated output from lm plot(log10(x), log10(y), pch=20, col=grey(0.5),cex=0.3) # Plot data points lines(sort(log10(x)), yfit\$fit[order(log10(x))], lwd=3) # Add regression curve. Subtle difference between `sort' and `order' dev.copy2eps(file="smooth.eps") # III. Two spline fits # A cubic smoothing spline fit # This function is based on code in the GAMFIT Fortran program by T. Hastie and R. Tibshirani (http://lib.stat.cmu.edu/general/), which makes use of spline code by Finbarr O'Sullivan. [Chair of Statistics, Univ College Cork IE] cubsplxy <- smooth.spline(log10(xy)) plot(log10(xy), pch=20, cex=0.5, xlab='log(Xvar)', ylab='log(Yvar)') # Plot points plot(log10(xy), pch=20, cex=0.5, ylim=c(-0.7, 0.4), xlab='log(Xvar)', ylab='log(Yvar)', main='Two spline fits') lines(cubsplxy, lwd=2, col='darkgreen') # Plot the spline fit predict(cubsplxy, seq(-2.0,0.5, length.out=20)) # Evaluate the spline fit using R's generic `predict' function knotx <- cubsplxy\$fit\$knot*cubsplxy\$fit\$range + cubsplxy\$fit\$min # Find and plot the spline knots rug(knotx,col='darkgreen') # COnstrained B-Splines Nonparametric Regression Quantiles # Bartels, R. and Conn A. (1980) Linearly Constrained Discrete L_1 Problems, ACM Transaction on Mathematical Software 6, 594–608. # Ng, P. (1996) An Algorithm for Quantile Smoothing Splines, Computational Statistics & Data Analysis 22, 99–118. # He, X. and Ng, P. (1999) COBS: Qualitatively Constrained Smoothing via Linear Programming; Computational Statistics 14, 315–337. # Ng, P. and Maechler, M. (2007) A Fast and Efficient Implementation of Qualitatively Constrained Quantile Smoothing Splines, Statistical Modelling 7(4), 315-328. install.packages("cobs") ; library(cobs) help(cobs) cobsxy50 <- cobs(log10(x), log10(y), ic='BIC', tau=0.5) # Median regression fit lines(sort(cobsxy50\$x),cobsxy50\$fitted[order(cobsxy50\$x)], lwd=2, col=2) cobsxy25 <- cobs(log10(x), log10(y), ic='BIC', tau=0.25) lines(sort(cobsxy25\$x),cobsxy25\$fitted[order(cobsxy25\$x)], lwd=1, col=2) cobsxy75 <- cobs(log10(x), log10(y), ic='BIC', tau=0.75) lines(sort(cobsxy75\$x),cobsxy75\$fitted[order(cobsxy75\$x)], lwd=1, col=2) rug(cobsxy50\$knots, lwd=2, col=2) # III. Three well-established bivariate semi-parametric local regression estimators # LOESS, W. S. Cleveland, `Visualizing Data', Hobart Press 1993 sortx <- x[order(x)] ; sorty <- y[order(x)] local_fit <- loess(sorty ~ sortx, span=0.5, data.frame(x=x,y=y)) summary(local_fit) plot(x,y,pch=20, cex=0.5) lines(sortx, predict(local_fit), lwd=2, col=2) # Save evenly-spaced LOESS fit to a file x_seq <- seq(0.0, 3.0, by=0.03) loc_dat <- predict(local_fit, newdata=x_seq) write(rbind(x_seq,loc_dat), sep=' ', ncol=2, file='localfit.txt') # Nonlinear quantile regression (R. Koenker`Quantile Regression', Cambridge Univ Press 2005) install.packages('quantreg') ; library(quantreg) install.packages('MatrixModels') ; library(MatrixModels) plot(sortx, sorty, pch=20, cex=0.5, xlab='Xvar', ylab='Yvar') fit_rqss.25 <- rqss(sorty ~ qss(sortx), tau=0.25) lines(sortx[-1],fit_rqss.25\$coef[1]+fit_rqss.25\$coef[-1], col='red') fit_rqss.50 <- rqss(sorty ~ qss(sortx), tau=0.50) lines(sortx[-1],fit_rqss.50\$coef[1]+fit_rqss.50\$coef[-1], lwd=2) fit_rqss.75 <- rqss(sorty ~ qss(sortx), tau=0.75) lines(sortx[-1],fit_rqss.75\$coef[1]+fit_rqss.75\$coef[-1], col='red') # Nonparametric regression with bootstrap errors # Hayfield, T. & Racine, J. S. Nonparametric Econometrics: The np package, J. Statist. Software, 27(5), 2008 # http://www.jstatsoft.org/v27/i05/ install.packages("np") ; library(np) help(npplot) bw.NW <- npregbw(x, y, regtype='lc', bwtype='fixed') npplot(bws=bw.NW, ylim=c(0.0,2.5), plot.errors.method="bootstrap", plot.errors.bar='I', plot.errors.type='quantiles') points(x, y, pch=20, cex=0.5)