Dummy-variable regression

Sometimes a data set exists in which fitting different relationships for subsets of the observations are in order.  This situation is exemplified by the Scandinavian EU preference data.

Shape file components for Scandinavian communes:  [scand_prov.dbf] [scand_prov.shp] [scand_prov.shx]

# Scandinavian EU Preference Vote -- bivariate regression
library(gpclib)
library(maptools)
library(RColorBrewer)

# read and plot shape file
# read and plot shape file
scand_prov.shp <- readShapePoly(file.choose(),
proj4string=CRS("+proj=longlat"))
plot(scand_prov.shp)

# get coordinates of district centroids
scand_prov.pts <- coordinates(scand_prov.shp)
District <- scand_prov.shp@data$District
points(scand_prov.pts, pch=16, cex=0.4)
text(scand_prov.pts, labels=District, cex=.7)

# get variable values from .dbf attributes
Y <- scand_prov.shp@data$Yes
X <- scand_prov.shp@data$Pop
Country <- scand_prov.shp@data$Country

# plot Yes votes
plotvar <- Y # gets data from shapefile .dbf
plotvar <- rank(plotvar)
nclr <- 8
plotclr <- brewer.pal(nclr,"PuOr")
colornum <- cut(plotvar, nclr, labels=FALSE)
colorcode <- plotclr[colornum]
brks <- round(quantile(plotvar, probs=seq(0,1,1/(nclr))), digits=1)
colorcode <- plotclr[colornum]
plot(scand_prov.shp, col=colorcode)
points(scand_prov.pts, type="n")
#text(scand_prov.pts, labels=scand_prov.shp@data$Yes, cex=.7)
text(scand_prov.pts, labels=Y, cex=.7)
legend(locator(1), legend=leglabs(brks), fill=plotclr, cex=0.6, bty="n")

# print the Yes votes
cbind(Y,as.character(Country),as.character(District))

Note the strong spatial gradient across the region in the number of Yes votes.

Here's the basic linear model with the number of Yes votes as the response and the log of population as a predictor.

# linear model, Yes ~ log10(Pop)
vote.lm1 <- lm(Y ~ log10(X))
vote.lm1

# examine the regression equation
plot(Y ~ log10(X))
abline(vote.lm1)
segments(log10(X), fitted(vote.lm1), log10(X), Y)

summary(vote.lm1)
oldpar <- par(mfrow = c(2, 2))
plot(vote.lm1, which=c(1,2,4,5))
par(oldpar)

# confidence intervals
pred.data <- data.frame(X=seq(1,1151,by=50))
pred.int <- predict(vote.lm1, int="p", newdata=pred.data)
conf.int <- predict(vote.lm1, int="c", newdata=pred.data)

plot(Y ~ log10(X), ylim=range(Y, pred.int, na.rm=T))
pred.X <- log10(pred.data$X)
matlines(pred.X, pred.int, lty=c(1,2,2), col="black")
matlines(pred.X, conf.int, lty=c(1,2,2), col="red")

The basic regression diagnostic plots and model summaries don't really hint at anything going on behind the scenes, although there is a little tendency for the residuals to be not normally distributed, and a few other patterns in the diagnostic plots.

However, plotting the residuals reveals a distinct pattern.

# map the residuals
plotvar <- vote.lm1$residuals # gets residuals
nclr <- 8
plotclr <- brewer.pal(nclr,"PuOr")
colornum <- cut(plotvar, nclr, labels=FALSE)
brks <- round(quantile(plotvar, probs=seq(0,1,1/(nclr))), digits=1)
colorcode <- plotclr[colornum]
plot(scand_prov.shp, col=colorcode)
points(scand_prov.pts, type="n")
text(scand_prov.pts, labels=scand_prov.shp@data$District, cex=.7)
legend(locator(1), legend=leglabs(brks), fill=plotclr, cex=0.6, bty="n")

# print the residuals
cbind(residuals(vote.lm1),as.character(Country),as.character(District))

The residual pattern suggests that there may be a "country-effect" on the number of Yes votes:

boxplot(Y ~ Country)

The residuals can be examined in light of the information that there may be inter-country differences.

# residual grouped boxplot
plot(residuals(vote.lm1) ~ Country)

# labelled scatter diagram
plot(Y ~ log10(X), type="n")
abline(vote.lm1)
text(log10(X), Y, labels=as.character(Country))

The information present in the obvious pattern in the residuals should be incorporated into the model.  One approach for doing that is to fit separate regression lines, one for each group.  Note that both the slope and intercept can vary among groups, here (based on inspection of the data), only the intercept will be made to vary among countries.

# Scandinavian EU Preference Vote -- dummy variable regression
# model with a factor as predictor
Y <- scand_prov.shp@data$Yes
X <- scand_prov.shp@data$Pop
Country <- scand_prov.shp@data$Country
vote.lm2 <- lm(Y ~ log10(X)+Country)
vote.lm2

# display the fitted lines
plot(Y ~ log10(X))
lines(log10(X)[Country=="N"],fitted(vote.lm2)[Country=="N"], col="red")
lines(log10(X)[Country=="F"],fitted(vote.lm2)[Country=="F"], col="blue")
lines(log10(X)[Country=="S"],fitted(vote.lm2)[Country=="S"], col="yellow")

# examine the model
summary(vote.lm2)
oldpar <- par(mfrow = c(2, 2))
plot(vote.lm2, which=c(1,2,4,5))
par(oldpar)

Note that the non-normality of the residuals and other patterns in the diagnostic plots are less obvious than in the first model. The residuals from the second, dummy-variable regression can be mapped, and reveal that the inclusion of a country effect has greatly diminished the pattern in the residuals.

# map residuals
plotvar <- vote.lm2$residuals # gets residuals
nclr <- 8
plotclr <- brewer.pal(nclr,"PuOr")
colornum <- cut(plotvar, nclr, labels=FALSE)
brks <- round(quantile(plotvar, probs=seq(0,1,1/(nclr))), digits=1)
colorcode <- plotclr[colornum]
plot(scand_prov.shp, col=colorcode)
points(scand_prov.pts, type="n")
text(scand_prov.pts, labels=scand_prov.shp@data$Country, cex=.7)
legend(locator(1), legend=leglabs(brks), fill=plotclr, cex=0.6, bty="n")

# print the residuals
cbind(residuals(vote.lm2),as.character(Country),as.character(District))