% File src/library/stats/man/ppr.Rd % Part of the R package, http://www.R-project.org % Copyright 1995-2013 R Core Team % Distributed under GPL 2 or later % file stats/man/ppr.Rd % copyright (C) 1995-8 B. D. Ripley % copyright (C) 2000-3 The R Core Team \newcommand{\CRANpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}} \newcommand{\sspace}{\ifelse{latex}{\out{~}}{ }} \name{ppr} \alias{ppr} \alias{ppr.default} \alias{ppr.formula} \title{Projection Pursuit Regression} \description{ Fit a projection pursuit regression model. } \usage{ ppr(x, \dots) \method{ppr}{formula}(formula, data, weights, subset, na.action, contrasts = NULL, \dots, model = FALSE) \method{ppr}{default}(x, y, weights = rep(1, n), ww = rep(1, q), nterms, max.terms = nterms, optlevel = 2, sm.method = c("supsmu", "spline", "gcvspline"), bass = 0, span = 0, df = 5, gcvpen = 1, \dots) } \arguments{ \item{formula}{ a formula specifying one or more numeric response variables and the explanatory variables. } \item{x}{ numeric matrix of explanatory variables. Rows represent observations, and columns represent variables. Missing values are not accepted. } \item{y}{ numeric matrix of response variables. Rows represent observations, and columns represent variables. Missing values are not accepted. } \item{nterms}{number of terms to include in the final model.} \item{data}{ a data frame (or similar: see \code{\link{model.frame}}) from which variables specified in \code{formula} are preferentially to be taken. } \item{weights}{a vector of weights \code{w_i} for each \emph{case}.} \item{ww}{ a vector of weights for each \emph{response}, so the fit criterion is the sum over case \code{i} and responses \code{j} of \code{w_i ww_j (y_ij - fit_ij)^2} divided by the sum of \code{w_i}. } \item{subset}{ an index vector specifying the cases to be used in the training sample. (NOTE: If given, this argument must be named.) 
} \item{na.action}{ a function to specify the action to be taken if \code{\link{NA}}s are found. The default action is given by \code{getOption("na.action")}. (NOTE: If given, this argument must be named.) } \item{contrasts}{ the contrasts to be used when any factor explanatory variables are coded. } \item{max.terms}{ maximum number of terms to choose from when building the model. } \item{optlevel}{ integer from 0 to 3 which determines the thoroughness of an optimization routine in the SMART program. See the \sQuote{Details} section. } \item{sm.method}{ the method used for smoothing the ridge functions. The default is to use Friedman's super smoother \code{supsmu}. The alternatives are to use the smoothing spline code underlying \code{smooth.spline}, either with specified (equivalent) degrees of freedom for each ridge function, or to allow the smoothness to be chosen by GCV. } \item{bass}{ super smoother bass tone control used with automatic span selection (see \code{supsmu}); the range of values is 0 to 10, with larger values resulting in increased smoothing. } \item{span}{ super smoother span control (see \code{supsmu}). The default, \code{0}, results in automatic span selection by local cross validation. \code{span} can also take a value in \code{(0, 1]}. } \item{df}{ if \code{sm.method} is \code{"spline"} specifies the smoothness of each ridge term via the requested equivalent degrees of freedom. } \item{gcvpen}{ if \code{sm.method} is \code{"gcvspline"} this is the penalty used in the GCV selection for each degree of freedom used. } \item{\dots}{arguments to be passed to or from other methods.} \item{model}{logical. If true, the model frame is returned.} } \value{ A list with the following components, many of which are for use by the method functions. 
\item{call}{the matched call} \item{p}{the number of explanatory variables (after any coding)} \item{q}{the number of response variables} \item{mu}{the argument \code{nterms}} \item{ml}{the argument \code{max.terms}} \item{gof}{the overall residual (weighted) sum of squares for the selected model} \item{gofn}{the overall residual (weighted) sum of squares against the number of terms, up to \code{max.terms}. Will be invalid (and zero) for fewer than \code{nterms} terms.} \item{df}{the argument \code{df}} \item{edf}{if \code{sm.method} is \code{"spline"} or \code{"gcvspline"} the equivalent number of degrees of freedom for each ridge term used.} \item{xnames}{the names of the explanatory variables} \item{ynames}{the names of the response variables} \item{alpha}{a matrix of the projection directions, with a column for each ridge term} \item{beta}{a matrix of the coefficients applied for each response to the ridge terms: the rows are the responses and the columns the ridge terms} \item{yb}{the weighted means of each response} \item{ys}{the overall scale factor used: internally the responses are divided by \code{ys} to have unit total weighted sum of squares.} \item{fitted.values}{the fitted values, as a matrix if \code{q > 1}.} \item{residuals}{the residuals, as a matrix if \code{q > 1}.} \item{smod}{internal work array, which includes the ridge functions evaluated at the training set points.} \item{model}{(only if \code{model = TRUE}) the model frame.} } \details{ The basic method is given by Friedman (1984), and is essentially the same code used by S-PLUS's \code{ppreg}. This code is extremely sensitive to the compiler used. The algorithm first adds up to \code{max.terms} ridge terms one at a time; it will use fewer if it is unable to find a term to add that makes a sufficient difference. It then removes the least important term at each step until \code{nterms} terms are left. 
The levels of optimization (argument \code{optlevel}) differ in how thoroughly the models are refitted during this process. At level 0 the existing ridge terms are not refitted. At level 1 the projection directions are not refitted, but the ridge functions and the regression coefficients are. % Levels 2 and 3 refit all the terms and are equivalent for one response; level 3 is more careful to re-balance the contributions from each regressor at each step and so is a little less likely to converge to a saddle point of the sum of squares criterion. } \source{ Friedman (1984): converted to double precision and added interface to smoothing splines by B. D. Ripley, originally for the \CRANpkg{MASS} package. } \references{ Friedman, J. H. and Stuetzle, W. (1981) Projection pursuit regression. \emph{Journal of the American Statistical Association}, \bold{76}, 817--823. Friedman, J. H. (1984) SMART User's Guide. Laboratory for Computational Statistics, Stanford University Technical Report No.\sspace{}1. Venables, W. N. and Ripley, B. D. (2002) \emph{Modern Applied Statistics with S.} Springer. } \seealso{ \code{\link{plot.ppr}}, \code{\link{supsmu}}, \code{\link{smooth.spline}} } \examples{ require(graphics) # Note: your numerical values may differ attach(rock) area1 <- area/10000; peri1 <- peri/10000 rock.ppr <- ppr(log(perm) ~ area1 + peri1 + shape, data = rock, nterms = 2, max.terms = 5) rock.ppr # Call: # ppr.formula(formula = log(perm) ~ area1 + peri1 + shape, data = rock, # nterms = 2, max.terms = 5) # # Goodness of fit: # 2 terms 3 terms 4 terms 5 terms # 8.737806 5.289517 4.745799 4.490378 summary(rock.ppr) # ..... (same as above) # ..... 
# # Projection direction vectors: # term 1 term 2 # area1 0.34357179 0.37071027 # peri1 -0.93781471 -0.61923542 # shape 0.04961846 0.69218595 # # Coefficients of ridge terms: # term 1 term 2 # 1.6079271 0.5460971 par(mfrow = c(3,2)) # maybe: , pty = "s") plot(rock.ppr, main = "ppr(log(perm)~ ., nterms=2, max.terms=5)") plot(update(rock.ppr, bass = 5), main = "update(..., bass = 5)") plot(update(rock.ppr, sm.method = "gcv", gcvpen = 2), main = "update(..., sm.method=\"gcv\", gcvpen=2)") cbind(perm = rock$perm, prediction = round(exp(predict(rock.ppr)), 1)) detach() } \keyword{regression}