---
title: "Data Science for Actuaries"
author: "Arthur Charpentier, @freakonometrics"
highlighter: highlight.js
output: pdf_document
job: 2018
knit: slidify::knit2slides
mode: selfcontained
hitheme: tomorrow
framework: io2012
widgets : [mathjax, quiz, bootstrap, interactive] # {mathjax, quiz, bootstrap}
ext_widgets : {rCharts: [libraries/nvd3, libraries/leaflet, libraries/dygraphs]}
assets : {assets: /assets}
#- shiny
# library(devtools)
# install_github("slidify", "ramnathv")
# install_github("slidifyLibraries", "ramnathv")
---
# R & Packages
```{r set-options, echo=TRUE, cache=FALSE}
getOption("defaultPackages")
(.packages(all.available=TRUE))
rm(list=ls())
options(width = 80)
library(knitr); library(htmlwidgets); library(dplyr); library(leaflet)
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(pryr,verbose=FALSE,quietly=TRUE)
library(wordcloud,verbose=FALSE,quietly=TRUE)
library(tm,verbose=FALSE,quietly=TRUE)
```
---
# Using with RStudio
Download **RStudio** from http://www.rstudio.com
---
```{r, error=TRUE}
?rq
library(quantreg,verbose=FALSE,quietly=TRUE)
require(quantreg)
?rq
rq
detach(package:quantreg, unload=TRUE)
?rq
```
---
```{r}
apropos("norm")
?pnorm
```
---
# S3 vs S4
```{r}
# S3 constructor: bundle the four attributes in a named list and tag it
# with class "person3".
person3 <- function(name,age,weight,height){
  traits <- list(name = name, age = age, weight = weight, height = height)
  structure(traits, class = "person3")
}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)
JohnDoe3$age
# Body-mass index from weight (kg) and height (cm): w * 1e4 / h^2.
BMI3 <- function(object,...) {object$weight * 1e4 / object$height^2}
BMI3(JohnDoe3)
```
---
```{r}
JohnDoe3
```
---
```{r}
reg3 <- lm(dist~speed,data=cars)
reg3$coefficients
coef(reg3)
```
---
```{r}
summary(reg3)
```
---
```{r}
setClass("person4", representation(name="character",
age="numeric", weight="numeric", height="numeric"),where=.GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)
JohnDoe4
```
---
```{r}
JohnDoe4@age
setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")),
where=.GlobalEnv)
setMethod("BMI4", "person4",where=.GlobalEnv,
function(object){return(object@weight*1e4/object@height^2)})
BMI4(JohnDoe4)
```
---
```{r}
library(VGAM,verbose=FALSE,quietly=TRUE)
reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients
coef(reg4)
```
---
```{r}
pnorm(3.65102,mean=5,sd=2)
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
X <- Norm(mean=5,sd=2)
mean(X)
p(X)(3.65102)
```
---
```{r}
qnorm(0.25,mean=5,sd=2)
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
distr::q(X)
distr::q(X)(0.25)
```
---
```{r}
X1 <- Norm(mean=5,sd=2)
X2 <- Norm(mean=2,sd=1)
S <- X1+X2
distr::q(S)(.5)
```
---
```{r}
plot(S)
```
---
# R objects : Numbers
```{r}
set.seed(1)
U <- runif(20)
U[1:4]
U[1] == 0.2655087
```
---
```{r}
options(digits = 3)
U[1:4]
options(digits = 22)
U[1:4]
options(digits = 3)
```
---
```{r}
x <- exp(1)
y <- x
x <- 2
y
class(x)
```
---
```{r}
0/0
1/0
.Machine$double.xmax
2e+307
# NOTE(review): the source was garbled here (stray "0])" fragment); a
# vector x is needed for the subsetting examples below and on the next
# slide (x[2], x[[2]]) -- reconstructed as a plain integer vector.
x <- 1:10
x[-(3:8)]
# x[-3:8]
x[c(TRUE,FALSE)]
```
---
```{r}
x[c(TRUE,NA,FALSE,TRUE)]
x[2]
x[[2]]
```
---
```{r}
for(i in 1:2){
nom_var <- paste0("x", i)
assign(nom_var, rpois(5,7))
}
x1
x2
```
---
```{r}
x <- runif(4)
x
x
```
---
```{r, error=TRUE}
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
# NOTE(review): this line was garbled in the source; reconstructed as the
# scalar version of the runif(3) example on the next slide.
u <- runif(1)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
u
```
---
```{r}
u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
u
```
---
# Loops
```{r}
v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))
sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))
```
---
```{r}
sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])
sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))
```
---
```{r}
system.time(Vectorize(sqrt)(v_x))
sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))
```
---
```{r}
library(parallel,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
(allcores <- detectCores(all.tests=TRUE))
sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))
```
---
```{r}
f<-function(x) dlnorm(x)
integrate(f,0,Inf)
integrate(f,0,1e5)
integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value
```
---
# Apply
```{r}
rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)
set.seed(1)
rN.Poisson(5)
rX.Exponential(3)
```
---
```{r}
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
# Simulate n compound sums: V[i] = sum of N_i draws from rX, N_i ~ rN.
V <- rep(0,n)
# seq_len() instead of 1:n, so n = 0 returns numeric(0) instead of
# looping over c(1, 0).
for(i in seq_len(n)){
N <- rN(1)
if(N>0){V[i] <- sum(rX(N))}
}
return(V)}
rcpd1(3)
```
---
```{r}
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
# Compact version: sum(rX(0)) is sum(numeric(0)) = 0, so no N > 0 guard
# is needed.
V <- rep(0,n)
# seq_len() is safe for n = 0 (1:0 would iterate over c(1, 0)).
for (i in seq_len(n)) V[i] <- sum(rX(rN(1)))
return(V)}
rcpd1(3)
```
---
```{r}
# Fully vectorised compound sums: draw all counts at once, then all
# claim amounts in a single vector, and cross-tabulate.
rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
N <- rN(n)
X <- rX(sum(N))
# I maps each claim back to its policy; explicit levels keep policies
# with N = 0 claims in the table (with a zero sum).
I <- factor(rep(1:n,N),levels=1:n)
return(as.numeric(xtabs(X ~ I)))}
rcpd2(3)
```
---
```{r}
# Same vectorised scheme as rcpd2, aggregating with tapply() instead of
# xtabs().
rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
N <- rN(n)
X <- rX(sum(N))
I <- factor(rep(1:n,N),levels=1:n)
V <- tapply(X,I,sum)
# tapply() yields NA for empty groups (N = 0); those sums are zero.
V[is.na(V)] <- 0
return(as.numeric(V))}
rcpd3(3)
```
---
```{r}
rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
# One compound sum per simulated count. vapply() (rather than sapply())
# guarantees a numeric vector even when n = 0, where sapply() would
# return list().
return(vapply(rN(n), function(x) sum(rX(x)), numeric(1)))}
rcpd4(3)
```
---
```{r}
# Compound sums via Vectorize(): the vectorised rX returns one claim
# vector per count, and sapply(..., sum) reduces each to its total.
# NOTE(review): if all simulated counts happen to be equal, Vectorize()
# simplifies its result to a matrix and sapply() then sums single
# elements -- rcpd4() is the robust variant; confirm this edge case.
rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
return(sapply(Vectorize(rX)(rN(n)),sum))}
rcpd5(3)
```
---
```{r}
# Compound sums via nested lapply(): the inner lapply draws each
# policy's claim vector, the outer one sums it, unlist() flattens the
# result back to a numeric vector.
rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
return(unlist(lapply(lapply(t(rN(n)),rX),sum)))}
rcpd6(3)
```
---
```{r}
n <- 100
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
options(digits=1)
microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n))
options(digits=5)
```
---
# Vectorization
```{r}
# Ratio of the N(m, s^2) survival function at x to its integral over
# [x, Inf) -- a mean-residual-life-type quantity.
f <- function(x,m=0,s=1){
H<-function(t) 1-pnorm(t,m,s)
# integrate() requires scalar bounds, so f() is NOT vectorised in x;
# the slide calls f(0:1) precisely to demonstrate that failure.
integral<-integrate(H,lower=x,upper=Inf)$value
res<-H(x)/integral
return(res)
}
f(0)
f(0:1)
```
---
```{r}
y <- rep(NA,2)
x <- 0:1
for(i in 1:2) y[i] <- f(x[i])
y
sapply(x,"f")
Vectorize(f)(x)
```
---
```{r}
# Bivariate normal density with standard N(0,1) margins and correlation r.
binorm <- function(x1,x2,r=0){
  quad_form <- x1^2 + x2^2 - 2 * r * x1 * x2
  exp(-quad_form / (2 * (1 - r^2))) / (2 * pi * sqrt(1 - r^2))
}
u <- seq(-2,2)
binorm(u,u)
outer(u,u,binorm)
```
---
```{r}
(uv<-expand.grid(u,u))
matrix(binorm(uv$Var1,uv$Var2),5,5)
```
---
```{r}
u <- seq(-2,2,length=31)
persp(u,u,outer(u,u,binorm),theta=25)
```
---
# Computational Efficiency
```{r}
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
n <- 1000
A<-matrix(seq(1,n^2),n,n)
B<-matrix(seq(1,n^2),n,n)
C<-1:n
# BUG FIX: microbenchmark() has no `replications` argument (that belongs
# to rbenchmark::benchmark()); the named value was silently captured as
# an extra expression to benchmark. Use `times = 1` to run each
# expression once.
microbenchmark((A%*%B)%*%C, A%*%(B%*%C), times = 1)
```
---
```{r}
m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
group <- rep(c("A","B"), each = n / 2)
system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat)
system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat)
```
---
```{r}
t.test_1 <- function(x, grp){
  # BUG FIX: the body ignored its `grp` argument and read the global
  # `group` instead, so the function only worked by accident.
  t.test(x[grp == "A"], x[grp == "B"])$stat
}
# BUG FIX: `grp = grp` referenced an undefined object; the grouping
# vector defined in this script is `group`.
system.time(t1 <- apply(X, 1, t.test_1, grp = group))
```
---
```{r}
# Hand-rolled Welch two-sample t statistic for one row of data,
# comparing groups "A" and "B" of `grp`.
t.test_2 <- function(x, grp) {
  # Summarise one group: mean, size and unbiased variance.
  summarise_group <- function(v) {
    mu <- mean(v)
    k <- length(v)
    s2 <- sum((v - mu)^2) / (k - 1)
    list(m = mu, n = k, var = s2)
  }
  a <- summarise_group(x[grp == "A"])
  b <- summarise_group(x[grp == "B"])
  # Difference of means over the Welch standard error.
  (a$m - b$m) / sqrt(a$var / a$n + b$var / b$n)
}
system.time(apply(X, 1, t.test_2, grp = group))
```
---
```{r}
# Matrix version of the Welch t statistic: one statistic per row of X,
# computed entirely with vectorised matrix operations (no apply loop).
t.test_3 <- function(X, grp){
  # Row-wise group summaries: means, group size, unbiased variances.
  row_stats <- function(M) {
    mu <- rowMeans(M)
    k <- ncol(M)
    s2 <- rowSums((M - mu)^2) / (k - 1)
    list(m = mu, n = k, var = s2)
  }
  a <- row_stats(X[, grp == "A"])
  b <- row_stats(X[, grp == "B"])
  # Vector of Welch t statistics, one per row of X.
  (a$m - b$m) / sqrt(a$var / a$n + b$var / b$n)
}
system.time( t.test_3(X, group))
```
---
```{r}
# Infix operator: 90% normal band, x plus the 5% and 95% normal
# quantiles scaled by s.
"%pm%" <- function(x,s) x + qnorm(c(0.05, 0.95)) * s
100 %pm% 10
# Replacement function: `second(x) <- value` rewrites element 2 of x.
'second<-' <- function(x, value) {
  replace(x, 2, value)
}
x <- 1:10
second(x) <- 5
x
```
---
```{r}
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x <- 1:10
address(x)
second(x) <- 6
address(x)
```
---
```{r}
ls(globalenv())
find("x")
find("pi")
environment(sd)
```
---
```{r}
f <- function(x) x^2
f
formals(f)
body(f)
```
---
```{r}
e <- new.env()
e$d <- 1
e$f <- 1:5
e$g <- 1:5
ls(e)
str(e)
identical(globalenv(),e)
search()
```
---
```{r}
f1 <- function(x1){
f2 <- function(x2){
f3 <- function(x3){
x1+x2+x3
}
f3(3)
}
f2(2)
}
f1(1)
```
---
```{r, error=TRUE}
f <- function(x) log(x)
f("x")
try(f("x"))
inherits(try(f("x")), "try-error")
inherits(try(f("x"), silent=TRUE), "try-error")
```
---
```{r}
a=0
try(a<-f("x"), silent=TRUE)
a
try(a<-f(x), silent=TRUE)
a
```
---
```{r, error=TRUE}
x=1:10
g=function(f) f(x)
g(mean)
```
---
```{r}
# Naive doubly-recursive Fibonacci (f(0) = f(1) = 1): deliberately
# exponential in n -- the slide times it to contrast with the memoised
# version that follows.
fibonacci <- function(n){
if(n<2) return(1)
fibonacci(n-2)+fibonacci(n-1)
}
fibonacci(20)
system.time(fibonacci(30))
```
---
```{r}
library(memoise,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
# Memoised variant: memoise() caches results keyed by argument, and the
# recursive calls resolve to the memoised global, so each fibonacci(k)
# is computed only once.
fibonacci <- memoise(function(n){
if(n<2) return(1)
fibonacci(n-2)+fibonacci(n-1)
})
system.time(fibonacci(30))
```
---
```{r}
set.seed(1)
x<-rexp(6)
sum(x)
.Primitive("sum")(x)
```
---
```{r}
library(Rcpp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
cppFunction('double sum_C(NumericVector x) {
int n = x.size();
double total = 0;
for(int i = 0; i < n; ++i) {
total += x[i];
}
return total;
}')
sum_C(x)
```
---
```{r}
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x <- runif(1e6)
microbenchmark(
sum(x),
.Primitive("sum")(x),
sum_C(x))
```
---
```{r}
# Return the names of the ... arguments (NULL when none are named).
names_list <- function(...) names(list(...))
names_list(a=5,b=7)
```
---
```{r, error=TRUE}
# Function factory: returns a closure raising its argument to `exponent`.
power <- function(exponent) {
  function(x) x ^ exponent
}
square <- power(2)
square(4)
cube <- power(3)
cube(4)
```
---
```{r}
cube
```
---
```{r}
total <- 20
pb <- txtProgressBar(min = 0, max = total, style = 3)
for(i in 1:(10*total)){
Sys.sleep(0.01)
if(i %%10 == 0) setTxtProgressBar(pb, i%/%10)
}
close(pb)
```
---
# R objects : Data Frames
```{r}
df <- data.frame(x=1:3,y=letters[1:3])
str(df)
typeof(df)
class(df)
```
---
```{r}
cbind(df,z=9:7)
df$z<-5:3
df
```
---
```{r}
df[1]
df[,1,drop=FALSE]
df[,1,drop=TRUE]
```
---
```{r}
df[[1]]
df$x
df[,"x"]
df[["x"]]
```
---
```{r}
set.seed(1)
df[sample(nrow(df)),]
set.seed(1)
df[sample(nrow(df),nrow(df)*2,replace=TRUE),]
```
---
```{r}
subset(df, x>1)
```
---
```{r}
set.seed(123)
df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50,
replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE)))
tail(df)
```
---
```{r}
reg <- lm(Y~X1+X2,data=df)
model.matrix(reg)[45:50,]
```
---
```{r}
reg <- lm(Y~(X1=="A")+X2,data=df)
model.matrix(reg)[45:50,]
```
---
```{r}
reg <- lm(Y~X1*X2,data=df)
model.matrix(reg)[45:50,]
```
---
```{r}
reg <- lm(Y~X1+X2%in%X1,data=df)
model.matrix(reg)[45:50,]
```
---
```{r}
download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData")
load("superheroes.RData")
superheroes
publishers
```
---
```{r}
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
inner_join(superheroes, publishers)
```
---
```{r}
inner_join(publishers, superheroes)
```
---
```{r}
merge(superheroes, publishers, all = TRUE)
```
---
```{r}
semi_join(superheroes, publishers)
```
---
```{r}
semi_join(publishers, superheroes)
```
---
```{r}
left_join(superheroes, publishers)
```
---
```{r}
left_join(publishers, superheroes)
```
---
```{r}
anti_join(superheroes, publishers)
```
---
```{r}
anti_join(publishers, superheroes)
```
---
```{r}
download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt",
"gapminderDataFiveYear.txt")
gdf <- read.delim("gapminderDataFiveYear.txt")
head(gdf,4)
```
---
```{r}
str(gdf)
```
---
```{r}
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gtbl <- tbl_df(gdf)
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gdfdt <- data.table(gdf)
gdfdt
```
---
```{r,error=TRUE}
subset(gdf, lifeExp < 30)
gdf[lifeExp < 30,]
gdf[gdf$lifeExp < 30,]
```
---
```{r}
filter(gtbl, lifeExp < 30)
subset(gdfdt, lifeExp < 30)
gdfdt[lifeExp < 30,]
```
---
```{r}
gdf[gdf$country == "Italy", c("year", "lifeExp")]
```
---
```{r}
gtbl %>%
filter(country == "Italy") %>%
select(year, lifeExp)
gdfdt[country == "Italy",cbind(year, lifeExp)]
setkey(gdfdt,country)
gdfdt["Italy",cbind(year, lifeExp)]
```
---
# Data frames (and other format)
```{r}
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ldf <- tbl_df(gdf)
glimpse(ldf)
```
---
```{r}
select(ldf,lifeExp)
select(ldf,lifeExp)$lifeExp[1:10]
```
---
```{r}
ldf[4:5,]
slice(ldf,4:5)
```
---
```{r}
filter(ldf,lifeExp<30 | gdpPercap<300)
rename(ldf,population = pop)
```
---
```{r}
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dt <- data.table(gdf)
is.data.frame(dt)
```
---
```{r}
object.size(gdf)
object.size(ldf)
object.size(dt)
```
---
```{r}
df[df$lifeExp<30,]
subset(df, lifeExp < 30)
filter(ldf, lifeExp < 30)
```
---
```{r}
subset(dt, lifeExp < 30)
dt[lifeExp<30]
```
---
```{r}
setkey(dt, country)
dt["Italy"]
```
---
```{r}
dt["Italy",mult="first"]
dt["Italy",max(lifeExp)]
max(df$lifeExp[df$country=="Italy"])
```
---
```{r}
setkey(dt, country, lifeExp)
dt["Italy"]
```
---
```{r}
dt[.("Italy",77.44)]
dt[.("Italy",77)]
dt[.("Italy",77),roll="nearest"]
```
---
```{r}
df[df$country == "Italy", c("year", "lifeExp")]
ldf %>%
filter(country == "Italy") %>%
select(year, lifeExp)
```
---
```{r}
small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")]
```
---
```{r}
library(tidyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
str(small_df)
table_df=spread(small_df,"country","lifeExp")
table_df
```
---
```{r}
data_df=gather(table_df,"country","lifeExp",2:4)
data_df
```
---
```{r}
library(reshape2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dcast(small_df, formula = country~year,
value.var="lifeExp")
table_df= dcast(small_df, formula = year~country,
value.var="lifeExp")
table_df
```
---
```{r}
melt(table_df, id.vars=c("year"),
value.name = "lifeExp",
variable.name="country")
```
---
```{r}
aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country))
tapply(small_df$lifeExp,factor(small_df$country),max)
```
---
```{r}
small_ldf = filter(ldf,country %in% c("France","Italy","Spain"))
group_by(small_ldf,country) %>% summarize(max(lifeExp))
filter(ldf,country %in% c("France","Italy","Spain")) %>%
group_by(country) %>%
summarize(max(lifeExp))
```
---
```{r}
filter(ldf,country %in% c("France","Italy","Spain")) %>%
group_by(country) %>%
summarize(max(lifeExp))
```
---
```{r}
plot(df$gdpPercap,df$lifeExp)
```
---
```{r}
select(ldf,gdpPercap,lifeExp) %>% plot()
```
---
```{r}
setkey(dt, country)
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
```
---
```{r}
head(df[with(df,order(-lifeExp)),])
head(dt[order(-lifeExp)])
head(setorder(dt,-lifeExp))
arrange(ldf,-lifeExp)
```
---
```{r}
df$gdp=df$pop*df$gdpPercap
# BUG FIX: `within(df, gdp = pop*gdpPercap)` is an error -- within()
# takes an (assignment) expression, not a named argument -- and this
# chunk has no error=TRUE, so knitting would abort. The correct form
# uses `<-` inside within(), as below.
df = within(df,gdp <- pop*gdpPercap)
dt[,gdp:=pop*gdpPercap]
head(dt)
```
---
```{r}
mutate(ldf,gdp=pop*gdpPercap)
```
---
```{r}
countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE)
countries = countries[-1,]
head(countries)
df2 = countries[,c("Country","Capital")]
names(df2)=c("countries","capital")
```
---
```{r}
df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country))
names(df1)=c("countries","lifeExp")
dim(df1)
dim(df2)
tail(merge(df1,df2))
```
---
```{r}
tail(merge(df1,df2,all.x=TRUE))
tail(merge(df1,df2,all.y=TRUE))
```
---
```{r}
ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp))
names(ldf1)=c("countries","lifeExp")
ldf2 <- tbl_df(df2)
inner_join(ldf1,ldf2)
left_join(ldf1,ldf2)
ldf1 %>% inner_join(ldf2)
```
---
```{r}
dt1 = dt[,max(lifeExp),by="country"]
setnames(dt1,c("countries","lifeExp"))
dt2 <- data.table(df2)
setkey(dt1,"countries")
setkey(dt2,"countries")
dt1[dt2,nomatch=0]
dt1[dt2]
dt2[dt1]
```
---
```{r}
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(lubridate,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(plyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(LaF,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
```
---
```{r}
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
ssn <- laf_open_csv("temp",sep="\t",
column_types=c("character","integer","numeric","character","numeric","numeric"),
column_names=names(df)[1:6])
object.size(ssn)
str(ssn)
nrow(ssn)
```
---
```{r}
# Boundaries of the per-country row blocks in the LaF file handle ssn:
# rows where the country label changes mark the start of a new block.
go_through <- which(ssn[1:(nrow(ssn)-1),'country'] != ssn[2:nrow(ssn),'country'])+1
# Ensure the last row of the file closes the final block.
if(go_through[length(go_through)] != nrow(ssn)) go_through <- c(go_through, nrow(ssn))
# Pair each block start with its end row -> a 2-column (first, last) matrix.
# NOTE(review): the first block (starting at row 1) is not represented in
# this matrix -- confirm whether the first country is intentionally skipped.
go_through <- cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))]-1, go_through[length(go_through)]))
```
---
```{r}
# Return the row with the lowest life expectancy within the s-th country
# block, reading lazily from the global LaF handle `ssn` via the
# (first, last) row index matrix `go_through` built on the previous slide.
worst_life_exp <- function(s){
data <- ssn[go_through[s,1]:go_through[s,2],c("country","year","lifeExp")]
data[which.min(data$lifeExp),]
}
worst_life_exp(10)
```
---
# Graphs with ggplot2
```{r}
library(gamair,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
data(chicago)
head(chicago)
```
---
```{r}
base=data.frame(death=chicago$death,
temp_F=chicago$tmpd,
o3=chicago$o3median,
date=seq(as.Date("1987-01-01"),
as.Date("2000-12-31"),by=1))
base$temp_C <- (base$temp_F-32)/1.8
base$year <- substring(base$date,1,4)
```
---
```{r}
# Map a single date to its season label, with seasons switching on the
# 21st of March / June / September / December.
date2season <- function(date){
# NOTE(review): the "%d/%m/%Y" format is ignored when `date` is already
# a Date, and sapply() over a Date vector strips the class -- confirm
# that the input actually parses as intended here.
m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m"))
d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d"))
s <- NA
# Scalar month/day, so the elementwise `|` and `&` act as scalar tests.
if(m %in% c(1,2) | ((m==12)&(d>=21)) | ((m==3)&(d<21))) s <- "winter"
if(m %in% c(4,5) | ((m==3)&(d>=21)) | ((m==6)&(d<21))) s <- "spring"
if(m %in% c(7,8) | ((m==6)&(d>=21)) | ((m==9)&(d<21))) s <- "summer"
if(m %in% c(10,11) | ((m==9)&(d>=21)) | ((m==12)&(d<21))) s <- "autumn"
return(s)}
base$season <- sapply(base$date,date2season)
head(base)
```
---
```{r}
plot(base[,c("date", "temp_C")])
```
---
```{r}
library(ggplot2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
g <- ggplot(base, aes(date, temp_C)) +
geom_point()
g
ggsave(file="figure.pdf", scale=3)
```
---
```{r}
g <- g+labs(title='Temperature')
g <- g+ggtitle('Temperature')
g
```
---
```{r}
g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red"))
```
---
```{r}
g<-g+labs(
x="Date",
y=expression(paste("Temperature ( ", degree ~ C, " )")),
title="Temperature in Chicago")
g
```
---
```{r}
g+ theme(
axis.title.x = element_text(color="blue"),
axis.title.y = element_text(color="blue")
)
```
---
```{r}
g + ylim(c(-20,60))
g <- g + xlim(as.Date(c("1995-01-01","1999-01-01")))
g
```
---
```{r}
base2=data.frame(
today=base$temp_C[1:5113],
tomorrow=base$temp_C[2:5114])
ggplot(base2, aes(today, tomorrow))+ geom_point()+ coord_equal()
```
---
```{r}
g <- ggplot(base, aes(date, temp_C, color=factor(season)))+
xlim(as.Date(c("1995-01-01","1999-01-01")))+
geom_point()
g+theme(legend.title=element_blank())+
guides(colour = guide_legend(override.aes = list(size=4)))
```
---
```{r}
g<-ggplot(base, aes(date, temp_C))+
xlim(as.Date(c("1995-01-01","1999-01-01")))
g+geom_line(color="grey")
```
---
```{r}
g+geom_line(aes(color="temperature"))+
scale_colour_manual(name='', values=c('temperature'='grey'))
```
---
```{r}
ggplot(base, aes(date, temp_C, color=factor(season)))+
geom_point() +
scale_color_manual(values=
c("dodgerblue4", "darkolivegreen4",
"darkorchid3", "goldenrod1"))
```
---
```{r}
ggplot(base, aes(date, temp_C, color=factor(season)))+
geom_point() +
scale_color_brewer(palette="Set1")
```
---
```{r}
ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
scale_color_gradient(low="blue", high="red")
```
---
```{r}
ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
scale_color_gradient2(low="blue", mid="white", high="red")
```
---
```{r}
ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+
theme(panel.background = element_rect(fill = 'light yellow'))
```
---
```{r}
ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+
facet_wrap(~year, ncol=2, scales="free")
```
---
```{r, error=TRUE}
library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+
theme_excel()+scale_colour_excel()+
theme(legend.title=element_blank())
```
---
```{r, error=TRUE}
library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like 'The Economist'")+
theme_economist()+
theme(legend.title=element_blank())
```
---
```{r nvd3plot, results = 'asis', comment = NA, message = F, echo = F}
require(rCharts)
haireye = as.data.frame(HairEyeColor)
n1 <- nPlot(Freq ~ Hair,
group = 'Eye',
data = subset(haireye, Sex == 'Male'),
type = 'multiBarChart'
)
n1$print('chart1')
```
---
# OpenStreetMap
```{r}
library(OpenStreetMap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map <- openmap(c(lat= 51.516, lon= -.141), c(lat= 51.511, lon= -.133))
map <- openproj(map, projection = "+init=epsg:27700")
plot(map)
```
---
```{r, error = TRUE}
loc="http://freakonometrics.free.fr/cholera.zip"
download.file(loc,destfile="cholera.zip")
unzip("cholera.zip", exdir="./")
library(maptools,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
deaths<-readShapePoints("Cholera_Deaths")
head(deaths@coords)
```
---
```{r, error = TRUE}
plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )
```
---
```{r, error = TRUE}
X <- deaths@coords
library(KernSmooth)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20)
```
---
```{r, error = TRUE}
plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )
image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs)
contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE)
```
---
# Google Map
```{r, error = TRUE}
library(ggmap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
get_london <- get_map(c(-.137,51.513), zoom=17)
london <- ggmap(get_london)
```
---
# OpenStreetMap
```{r}
london
```
---
```{r, error = TRUE}
df_deaths <- data.frame(X)
library(sp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700")
df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
```
---
# OpenStreetMap
```{r}
london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red")
```
---
```{r, error = TRUE}
london <- london + geom_point(aes(x=coords.x1, y=coords.x2),
data=data.frame(df_deaths@coords),col="red") +
geom_density2d(data = data.frame(df_deaths@coords),
aes(x = coords.x1, y=coords.x2), size = 0.3) +
stat_density2d(data = data.frame(df_deaths@coords),
aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..),
size = 0.01, bins = 16, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE)
```
---
```{r}
london
```
---
```{r, error = TRUE}
require(leaflet)
deaths <- readShapePoints("Cholera_Deaths")
df_deaths <- data.frame(deaths@coords)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700")
df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
df=data.frame(df_deaths@coords)
lng=df$coords.x1
lat=df$coords.x2
m = leaflet()%>% addTiles()
```
```{r, eval=FALSE}
m %>% fitBounds(-.141, 51.511, -.133, 51.516)
```
```{r, echo=FALSE, warning=FALSE, cache=FALSE}
m = m %>% fitBounds(-.141, 51.511, -.133, 51.516)
saveWidget(m, 'leaflet_cholera.html')
```
---
```{r, 'cholera_map_osm', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
cat('')
```
---
```{r, error = TRUE}
rd=.5
op=.8
clr="blue"
m = leaflet() %>% addTiles()
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
```
```{r, echo=FALSE, warning=FALSE, cache=FALSE}
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
saveWidget(m, 'leaflet_cholera_circles.html')
```
---
```{r, 'cholera_map_osm_circle', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
cat('')
```
---
```{r, error = TRUE}
X=cbind(lng,lat)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
x=kde2d$x1
y=kde2d$x2
z=kde2d$fhat
CL=contourLines(x , y , z)
m = leaflet() %>% addTiles()
m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
```
```{r, eval=F}
m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
```
```{r, echo=FALSE, warning=FALSE, cache=FALSE}
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
saveWidget(m, 'leaflet_cholera_kde.html')
```
---
```{r, 'cholera_map_osm_kde', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
cat('')
```
---
```{r, error = TRUE}
m = leaflet() %>% addTiles()
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles()
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)
```
```{r, echo=FALSE, warning=FALSE, cache=FALSE}
m = leaflet() %>% addTiles()
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
saveWidget(m, 'leaflet_cholera_kde2.html')
```
```{r, 'cholera_map_osm_kde2', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
cat('')
```
```{r, echo=FALSE, warning=FALSE, cache=FALSE}
m = leaflet() %>% addTiles()
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)
saveWidget(m, 'leaflet_cholera_kde3.html')
```
```{r, 'cholera_map_osm_kde3', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
cat('')
```
---
```{r, error = TRUE}
m = leaflet() %>% addTiles()
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")
```
```{r, echo=FALSE, warning=FALSE, cache=FALSE}
m = leaflet() %>% addTiles()
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")
saveWidget(m, 'leaflet_cholera_kde4.html')
```
```{r, 'cholera_map_osm_kde4', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
cat('')
```
---
# Map with googleVis
```{r, warning=FALSE, cache=FALSE, error = FALSE, message=FALSE}
library(googleVis)
df=data.frame(state=c("AU-WA","AU-VIC","AU-NT", "AU-NSW", "AU-SA","AU-QLD","AU-TAS","NZ"),
R_users=c(323,425,154,486,201,195,87,123))
Geo <- gvisGeoChart(df,
locationvar="state",colorvar=c("R_users"),
options=list(region="AU",
dataMode="regions",resolution="provinces",
width="80%", height="80%"))
```
---
# Map with googleVis
```{r, 'gvisGeoChart', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'}
print(Geo, tag = "chart")
```
---
```{r, error = TRUE}
library(XML,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
year <- 2012
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc))
tabs[1]
storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" "))
storms
```
---
```{r, error = TRUE}
index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx")
nstorms <- storms[!index]
nstorms
TRACK=NULL
```
---
```{r, error = TRUE}
for(i in length(nstorms):1){
if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN"
if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA"
if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN"
if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL"
if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA"
if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN"
if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE"
if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD"
if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH"
if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"}
```
---
```{r}
for(i in length(nstorms):1){
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
track$LAT=as.numeric(as.character(track$LAT))
track$LON=as.numeric(as.character(track$LON))
track$WIND=as.numeric(as.character(track$WIND))
track$PR=as.numeric(as.character(track$PR))
track$year=year
track$name=nstorms[i]
TRACK=rbind(TRACK,track)}
tail(TRACK,10)
```
---
```{r,error = TRUE}
# Scrape all storm tracks for one Atlantic hurricane season from the
# Unisys archive; returns a single data frame with one row per track
# point. `p = TRUE` prints a progress line per storm.
extract.track <- function(year=2012,p=TRUE){
# The season index page lists one table row per storm.
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc))
storms <- unlist(strsplit(as.character(
tabs[[1]]$Name),split=" "))
# Drop the storm-type words so only the actual storm names remain.
index <- storms%in%c("Tropical","Storm",paste("Hurricane-",1:6,sep=""),"Depression","Subtropical","Extratropical","Low",paste("Storm-",1:6,sep=""),
"Xxx")
nstorms <- storms[!index]
TRACK=NULL
for(i in length(nstorms):1){
# track.dat is fixed-width: advisory, lat, lon, time, wind, pressure, status.
loc=paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
track=read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
# Columns come in as factors; coerce via character to numeric.
track$LAT=as.numeric(as.character(track$LAT))
track$LON=as.numeric(as.character(track$LON))
track$WIND=as.numeric(as.character(track$WIND))
track$PR=as.numeric(as.character(track$PR))
track$year=year
track$name=nstorms[i]
TRACK=rbind(TRACK,track)
if(p==TRUE){ cat(year,i,nstorms[i],nrow(track),"\n")}}
return(TRACK)}
```
---
```{r,error = TRUE}
TOTTRACK=NULL
# for(y in 2012:1851){
# TOTTRACK=rbind(TOTTRACK,extract.track(y))
# }
loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata"
download.file(loc,destfile="TRACK-ATLANTIC.Rdata")
load("TRACK-ATLANTIC.Rdata")
dim(TOTTRACK)
```
---
```{r}
library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(maps,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map("world", col="light yellow", fill=TRUE)
```
---
```{r}
map("world", col="light yellow", fill=TRUE)
for(n in unique(TOTTRACK$name)){
lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))}
```
---
# Shapefiles
```{r}
loc="http://freakonometrics.free.fr/Italy.zip"
download.file(loc,destfile="Italy.zip")
unzip("Italy.zip", exdir="./")
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE)
names(ita1)
```
---
```{r}
plot(ita1,col="light green")
```
---
```{r}
ita1@data[,"NAME_1"][1:6]
pos<-which(ita1@data[,"NAME_1"]=="Sicily")
Poly_Sicily<-ita1[pos,]
```
---
```{r}
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
```
---
```{r}
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
abline(v=5:20,col="light blue")
abline(h=35:50,col="light blue")
axis(1)
axis(2)
```
---
```{r}
# Partition the regions into a southern group (selected by name) and the
# remaining northern group, then plot them in red/blue.
pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily"))
ita_south <- ita1[pos,]
ita_north <- ita1[-pos,]   # complement of the southern selection
plot(ita1)
plot(ita_south,col="red",add=TRUE)
plot(ita_north,col="blue",add=TRUE)
```
---
```{r}
library(xlsx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
loc="http://freakonometrics.free.fr/Italy_codes.xlsx"
download.file(loc,destfile="Italy_codes.xlsx")
data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE)
names(data_codes)[1]="NAME_1"
ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE)
pos<-which(ita2@data[,"GROUP"] == "SOUTH")
```
---
```{r}
library(rgeos,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ita_s<-gUnionCascaded(ita_south)
ita_n<-gUnionCascaded(ita_north)
plot(ita1)
plot(ita_s,col="red",add=TRUE)
plot(ita_n,col="blue",add=TRUE)
```
---
```{r}
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
gCentroid(Poly_Sicily,byid=TRUE)
points(gCentroid(Poly_Sicily,byid=TRUE),pch=19,col="red")
```
---
```{r}
G <- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)
```
---
```{r}
c1 <-G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords
```
---
```{r}
plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)
```
---
## Linear Model
$$
y_i=\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta}+\varepsilon_i=\beta_0 + [\beta_1 x_{1,i}+\cdots+
\beta_k x_{k,i}]+\varepsilon_i
$$
See linear model, where $\varepsilon_i\sim\mathcal{N}(0,\sigma^2)$ i.i.d.
$$
(Y\vert \boldsymbol{X}=\boldsymbol{x})\sim
\mathcal{N}(\boldsymbol{x}^{\text{T}}\boldsymbol{\beta},
\sigma^2)
$$
i.e.
$$
\mathbb{E}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\boldsymbol{x}^{\text{T}}\boldsymbol{\beta}
$$
and homoscedastic model,
$$
\text{Var}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\sigma^2
$$
Least squares (and maximum likelihood) estimator
$$
\widehat{\boldsymbol{\beta}}=\text{argmin}
\left\lbrace
\sum_{i=1}^n (y_i-\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta})^2
\right\rbrace
=(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}\boldsymbol{y}$$
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
```
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
model <- lm(dist ~ speed, data=cars)
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Ordinary least squares "by hand": solve the normal equations
# (X'X) beta = X'y and compare with the coefficients found by lm().
# X and Y stay in the workspace -- later chunks reuse them.
X = cbind(1, cars$speed)   # design matrix: intercept column + speed
Y = cars$dist
XtX <- crossprod(X)        # X'X
XtY <- crossprod(X, Y)     # X'y
solve(XtX, XtY)
model$coefficients
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)$sigma^2*solve(crossprod(X,X))
vcov(model)
n=nrow(cars)
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
confint(model)
model$coefficients[1]+qt(c(.025,.975),n-2)* summary(model)$coefficients[1,2]
```
---
## Linear Model
coefficient of determination
$\displaystyle{R^2 = \frac{\text{explained variance}}{\text{total variance}}}$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)$r.squared
1-deviance(model)/sum((Y-mean(Y))^2)
```
$$
\text{Var}[Y]=
\text{Var}[\mathbb{E}[Y\vert X]]+
\mathbb{E}[\text{Var}[Y\vert X]]
$$
see analysis of variance
---
## Linear Model
$$
\overline{R}^2 =
1-[1-R^2]\cdot \frac{n-1}{n-k-1}
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)$adj.r.squared
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
anova(lm(dist~speed,data=cars),lm(dist~1,data=cars))
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red")
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red")
x=seq(2,26)
y=predict(model, newdata=data.frame(speed=x))
lines(x,y,lwd=2,col="blue")
```
----
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
y=predict(model, newdata=data.frame(speed=x), interval = "confidence")
head(y)
```
$$
\text{Var}[\widehat{Y}(\boldsymbol{x})]=\text{Var}[\boldsymbol{x}^{\text{T}}\widehat{\boldsymbol{\beta}}]=\boldsymbol{x}^{\text{T}}\text{Var}[\widehat{\boldsymbol{\beta}}]\boldsymbol{x}
=\widehat{\sigma}^2 \boldsymbol{x}^{\text{T}}[\boldsymbol{X}^{\text{T}}\boldsymbol{X}]^{-1}\boldsymbol{x}
$$
----
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
polygon(c(x,rev(x)),c(y[,2],rev(y[,3])),col=rgb(0,0,1,.4),border=NA)
lines(x,y[,1],lwd=2,col="blue")
```
----
## Linear Model and Bagging
Method 1
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Bagging, method 1: resample (speed, dist) pairs with replacement, refit
# the linear model on each pseudo-sample, and store its predicted curve
# over the grid x -- one bootstrap replicate per row of Y.
n_boot <- 1000
Y <- matrix(NA, n_boot, length(x))
new_grid <- data.frame(speed = x)
for (b in seq_len(n_boot)) {
  rows <- sample(nrow(cars), replace = TRUE)   # bootstrap row indices
  fit_b <- lm(dist ~ speed, data = cars[rows, ])
  Y[b, ] <- predict(fit_b, newdata = new_grid)
}
```
See bootstrap, based on pseudo-sample
$$
\lbrace(\boldsymbol{x}_{i_1},y_{i_1}),
\cdots,(\boldsymbol{x}_{i_n},y_{i_n})
\rbrace
$$
where $(i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace$.
----
## Linear Model and Bagging
Method 1
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))
```
----
## Linear Model and Bagging
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)
```
----
## Linear Model and Bagging
Method 2
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Bagging, method 2 (residual bootstrap): keep the design fixed and
# resample the residuals; the pseudo-response is fitted value + resampled
# residual. One predicted curve per row of Y.
pred_dist=predict(model)    # fitted values y-hat
epsilon =residuals(model)   # estimated errors
Y=matrix(NA,1000,length(x))
for(b in 1:1000){
idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
carsb=data.frame(speed=cars$speed,
dist=pred_dist+epsilon[idx])   # y-hat_i + epsilon_{i_b}
modelb <- lm(dist ~ speed, data=carsb)
Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}
```
See bootstrap, based on pseudo-sample
$$
\lbrace(\boldsymbol{x}_{1},\widehat{y}_{1}+\widehat{\varepsilon}_{i_1}),
\cdots,(\boldsymbol{x}_{n},\widehat{y}_{n}+\widehat{\varepsilon}_{i_n})
\rbrace
$$
where $(i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace$.
----
## Linear Model and Bagging
Method 2
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))
```
----
## Linear Model and Bagging
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)
```
----
## Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red")
segments(cars$speed,cars$dist,cars$speed,predict(model),col="blue")
```
----
## Least Squares?
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
cars2=cars; cars2[,2]=cars[,2]/10
plot(cars2,ylab="dist/10")
acp=princomp(cars2)
b=acp$loadings[2,1]/acp$loadings[1,1]
a=acp$center[2]-b*acp$center[1]
abline(a,b,col="red")
```
----
## Least Squares?
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars2,ylab="dist/10",xlim=c(0,30),ylim=c(-1,12))
abline(a,b,col="red")
t <- acp$loadings[,1] %*% (t(cars2)-acp$center)
X1 <- acp$center[1] +t * acp$loadings[1,1]
X2 <- acp$center[2] +t * acp$loadings[2,1]
segments(cars2$speed,cars2$dist,X1,X2,col="blue")
```
----
## Least Absolute Deviation
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Least absolute deviation fit: minimize the sum of |residuals| numerically
# with optim() (Nelder-Mead from (0,0)), then compare with OLS (dashed red).
lad_loss <- function(coefs) {
  fitted_vals <- coefs[1] + coefs[2] * cars$speed
  sum(abs(cars$dist - fitted_vals))
}
lad_fit <- optim(c(0, 0), lad_loss)$par
plot(cars)
abline(model, col = "red", lty = 2)
abline(lad_fit[1], lad_fit[2], col = "blue")
```
----
## Quantile Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(quantreg,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
abline(model, col="blue")
abline(rq(dist ~ speed,data=cars, tau=.5),col="red",lty=2)
abline(rq(dist ~ speed,data=cars, tau=.9),col="purple",lty=2)
```
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=1)
```
Scatterplot $(\widehat{Y}_i,\widehat{\varepsilon}_i)$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=2)
```
Scatterplot $\displaystyle{\left(\widehat{\varepsilon}_{i:n},\Phi^{-1}
\left(\frac{i}{n}\right)\right)}$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=3)
```
Scatterplot $(\widehat{Y}_i,\sqrt{\vert\widehat{\varepsilon}_i\vert})$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=4)
```
Cook distance,
$\displaystyle{C_i=\frac{\widehat{\varepsilon}_i^2}{p\cdot\text{MSE}}\cdot\left(\frac{H_{i,i}}{(1-H_{i,i})^2}\right)}$ with $\boldsymbol{H}=\boldsymbol{X}(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}=[H_{i,i}]$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
C=cooks.distance(model)
```
----
## Diagnostic and Errors
$H_{i,i}$ are leverages, and define Studentized residuals as
$$
\widehat{r}_i=\frac{\widehat{\varepsilon}_i}{
\widehat{\sigma} \sqrt{1-H_{i,i}}}
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
rstudent(model)
```
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=5)
```
Scatterplot $(H_{i,i},\widehat{r}_i)$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=6)
```
Scatterplot $(H_{i,i},C_i)$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
hist(residuals(model),probability=TRUE, col="light green")
lines(density(residuals(model)),lwd=2,col="red")
boxplot(residuals(model),horizontal=TRUE,add=TRUE,at=.024,
pars=list(boxwex=.004),col=rgb(0,0,1,.25))
```
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
sigma=summary(model)$sigma
plot(ecdf(residuals(model)/sigma))
lines(seq(-3,3,by=.1),pnorm(seq(-3,3,by=.1)),lwd=2,col="red")
```
----
## Diagnostic and Errors
Kolmogorov-Smirnov $
d=\sup_{x\in\mathbb{R}}\lbrace \vert\Phi(x)-\widehat{F}_n(x)\vert \rbrace
$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
ks.test(residuals(model)/sigma,"pnorm")
```
----
## Diagnostic and Errors
Anderson-Darling, Cramer-von Mises,
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(nortest,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ad.test(residuals(model))
cvm.test(residuals(model))
```
----
## Model Choice
AIC
$AIC = 2k - 2\log(\mathcal{L}) = 2k + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]$
BIC
$BIC = { k \log(n) -2 \log(\mathcal{L}) } = k \log(n) + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
AIC(model)
AIC(model,k=log(n))
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_brk <- lm(dist ~ bs(speed,degree=1,knots=(c(4,15,25))), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
```
see $b$-splines,
$$
y_i=\beta_0+\beta_1 x_i + \beta_2 (x_i-15)_+ + \varepsilon_i
$$
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
lines(x,y,lwd=2,col="blue")
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
positive=function(x,s) ifelse(x>s,x-s,0)
model_brk <- lm(dist ~ speed + positive(speed,15), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
abline(coefficients(model_brk)[1],coefficients(model_brk)[2],col="blue",lty=2)
summary(model_brk)$coefficients
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(strucchange,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(Fstats(dist ~ speed,data=cars,from=7/50))
```
see Chow Test
----
## Testing Linear Assumptions
$$
W_t=\frac{1}{\widehat{\sigma}\sqrt{n}}\sum_{i=1}^{
\lfloor nt \rfloor} \widehat{\varepsilon}_i
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
cusum <- efp(dist ~ speed, type = "OLS-CUSUM",data=cars)
plot(cusum,ylim=c(-2,2))
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cusum, alpha = 0.05, alt.boundary = TRUE,ylim=c(-2,2))
```
see CUSUM test
----
## Model with Categorical Covariates
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
model_cut=lm(dist~ cut(speed, breaks=c(0,10,15,20,25)),data=cars)
y=predict(model_cut, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")
```
----
## Model with Categorical Covariates
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(rpart,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tree=rpart(dist~speed,data=cars)
y=predict(tree,newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")
```
----
## Smoothing : Polynomial Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Cubic polynomial regression of stopping distance on speed (orthogonal
# polynomials), compared with the simple linear fit (dashed red).
# NOTE: the original used poly(speed, df=3); poly() has no 'df' argument --
# it only worked because a lone scalar in '...' is silently taken as the
# degree. Spell the argument out.
model_poly = lm(dist ~ poly(speed, degree = 3), data = cars)
y = predict(model_poly, newdata = data.frame(speed = x))
plot(cars)
abline(model, col = "red", lty = 2)
lines(x, y, lwd = 2, col = "blue")
```
----
## Smoothing : Local Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
bw <- dpill(cars$speed,cars$dist)
lines( locpoly(cars$speed,cars$dist,degree=0, bandwidth=bw), col='red' )
lines( locpoly(cars$speed,cars$dist,degree=1, bandwidth=bw), col='green' )
lines( locpoly(cars$speed,cars$dist,degree=2, bandwidth=bw), col='blue' )
```
----
## Smoothing : $k$-Nearest Neighbors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(FNN,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
knn=knn.reg(train=cars$speed,y=cars$dist,k=5)
plot(cars)
lines(cars$speed,knn$pred,col="red")
```
----
## Smoothing : $b$ Splines
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_bs <- lm(dist ~ bs(speed), data=cars)
y=predict(model_bs, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Smoothing Linear Model
Tuning parameter selection, Silverman's Rule
$$
b^\star=0.9 \cdot \min\left\lbrace \sigma,\frac{F^{-1}(.75)-F^{-1}(.25)}{1.34}\right\rbrace \cdot n^{-1/5}
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
bw.nrd0(cars$speed)
```
See kernel regression
----
## Smoothing Linear Model
Tuning parameter selection, Cross Validation
$$
b^\star=\text{argmin}\left\lbrace
\sum_{i=1}^n \left(y_i - \widehat{m}_{(i)}(\boldsymbol{x}_i)\right)^2\right\rbrace
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
bw.ucv(cars$speed)
```
----
## Smoothing Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
Nadaraya_Watson =with(cars,ksmooth(speed, dist, "normal",bandwidth=2.75))
plot(cars)
abline(model,col="red",lty=2)
lines(Nadaraya_Watson,lwd=2,col="blue")
```
----
## Smoothing Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_loess=loess(dist~ speed, data=cars,degree=1, family="gaussian")
y=predict(model_loess, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Multiple Regression
Life Expectancy (1), Homicide Rate (2), Illiteracy Rate (3)
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
chicago=read.table("http://freakonometrics.free.fr/chicago.txt",header=TRUE,sep=";")
model_c = lm(Fire~X_2+X_3,data=chicago)
y=function(x2,x3) predict(model_c,newdata=data.frame(X_2=x2,X_3=x3))
VX2=seq(0,80,length=26); VX3=seq(5,25,length=26)
VY=outer(VX2,VX3,y)
persp(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy",zlab="Fire",theta=20)
```
----
## Multiple Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
VX2=seq(0,80,length=251); VX3=seq(5,25,length=251)
VY=outer(VX2,VX3,y)
image(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy")
contour(VX2,VX3,VY,add=TRUE)
```
----
## Model selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
model_c = lm(Fire~.,data=chicago)
summary(model_c)$r.squared
summary(model_c)$adj.r.squared
```
----
## Model selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Information criteria for the full Chicago model.
logLik(model_c)                       # log-likelihood of model_c
AIC(model_c, k = 2)                   # AIC (default penalty k = 2)
# BIC: the penalty is log(n) where n is the number of observations the model
# was fitted on -- chicago, not cars (the original used nrow(cars) by mistake).
AIC(model_c, k = log(nrow(chicago)))  # BIC
```
----
## Penalization
$$
\widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace
\sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right]^2 + \color{red}{\lambda} \sum_{j=1}^k \beta_j ^2\right\rbrace
$$
with an explicit solution $\widehat{\boldsymbol{\beta}}=(\boldsymbol{X}^{\text{T}}\boldsymbol{X}+\color{red}{\lambda} \mathbb{I})^{-1} \boldsymbol{X}^{\text{T}}\boldsymbol{Y}$.
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(MASS,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_ridge <- lm.ridge(Fire ~ ., data=chicago, lambda=1)
```
see more generally Tikhonov regularization
----
## Optimal Penalization
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Bootstrap estimate of the MSE of ridge coefficients over a grid of
# penalties kl: for each lambda, refit lm.ridge on n bootstrap resamples
# of 'chicago' and compare against the unpenalized coefficients of model_c.
mse <- NULL
n <- 100
# Reference matrix: 0 for the (centered) intercept slot, then the OLS slopes,
# replicated on every row.  (Spell out nrow/ncol -- the original relied on
# partial matching of nr/nc.)
v <- matrix(c(0, coefficients(model_c)[-1]), nrow = n, ncol = 4, byrow = TRUE)
kl <- c(1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2,
        .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.2, 1.4, 1.6, 1.8, 2)
for (k in kl) {
  r <- matrix(NA, nrow = n, ncol = 4)
  for (i in seq_len(n)) {
    boot_c <- chicago[sample(seq_len(nrow(chicago)), nrow(chicago), replace = TRUE), ]
    # lm.ridge()$coef holds the (scaled) slope estimates, no intercept;
    # the dead intermediate 'model_ridge <-' assignment was removed.
    r[i, 2:4] <- lm.ridge(Fire ~ ., data = boot_c, lambda = k)$coef
    r[i, 1] <- mean(boot_c[, "Fire"])
  }
  # keep only the MSE of the first slope (column 2)
  mse <- append(mse, apply((r - v)^2, 2, mean)[2])
}
```
----
## Optimal Penalization
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot( mse ~ kl, type='l' )
```
----
## Variable Selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
step(lm(Fire ~ .,data=chicago))
```
---
## Variable Selection
LASSO (least absolute shrinkage and selection operator)
$$
\widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace
\sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right]^2 + \color{blue}{\lambda} \sum_{j=1}^k \vert\beta_j\vert\right\rbrace
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(glmnet,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fit = glmnet(x = as.matrix(chicago[,2:4]), y = chicago[,1], family = "gaussian", alpha = 1)
```
----
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(fit, xvar="lambda", label = TRUE )
```
----
## Variable Selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
step(model)
```
----
## Dimension Reduction
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:3],data=chicago)
predict(model_c)[1:5]
predict(model_acp)[1:5]
```
----
## Dimension Reduction
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:2],data=chicago)
predict(model_c)[1:5]
predict(model_acp)[1:5]
```
----
## Logistic Regression
$$
\mathbb{E}[Y\vert \boldsymbol{X}=\boldsymbol{x}]=
p(\boldsymbol{x})=\frac{\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}{1+\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}=H(\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta})
$$
logistic regression
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
myocarde <-read.table("http://freakonometrics.free.fr/myocarde.csv", head=TRUE,sep=";")
model_bernoulli = glm(PRONO~., data=myocarde, family=binomial)
coefficients(summary(model_bernoulli))
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Manual Newton-Raphson (IRLS) fit of the logistic regression PRONO ~ .,
# to be compared with glm()'s estimates.
X=cbind(1,as.matrix(myocarde[,1:7]))   # design matrix with intercept column
Y=myocarde$PRONO=="Survival"           # 0/1 response
# Start from the OLS coefficients of the linear probability model;
# beta accumulates one column per Newton iteration.
beta=as.matrix(lm(Y~0+X)$coefficients,ncol=1)
for(s in 1:9){
# fitted probabilities at the current estimate (logistic link)
pi=exp(X%*%beta[,s])/(1+exp(X%*%beta[,s]))
gradient=t(X)%*%(Y-pi)                 # score vector X'(y - pi)
# W = diag(pi*(1-pi)), the Bernoulli variance weights
omega=matrix(0,nrow(X),nrow(X));diag(omega)=(pi*(1-pi))
Hessian=-t(X)%*%omega%*%X              # Hessian of the log-likelihood
# Newton step beta_{s+1} = beta_s - H^{-1} * gradient, appended as a column
beta=cbind(beta,beta[,s]-solve(Hessian)%*%gradient)}
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Starting values (OLS) vs the estimate after eight Newton updates.
# NOTE(review): the loop produced 10 columns, so beta[,10] is the final
# iterate; column 9 is shown here -- presumably already converged, verify.
beta[,1]
beta[,9]
```
----
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# -solve(Hessian)
sqrt(-diag(solve(Hessian)))
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Logistic model on two covariates, plus a prediction surface evaluated on
# a 251 x 251 grid covering the observed ranges (for image/contour plots).
GLM <- glm(PRONO ~ PVENT + REPUL, data = myocarde, family = binomial)
# vectorized wrapper returning predicted probabilities; the signature
# (p, r) -> vector is what outer() requires
pred_GLM = function(p,r){
return(predict(GLM, newdata =
data.frame(PVENT=p,REPUL=r), type="response"))}
vp=seq(min(myocarde$PVENT),max(myocarde$PVENT),length=251)
vr=seq(min(myocarde$REPUL),max(myocarde$REPUL),length=251)
vd=outer(vp,vr,pred_GLM)   # matrix of predicted probabilities on the grid
library(RColorBrewer,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
CL=colorRampPalette(brewer.pal(name="RdYlBu", 11))(100)   # 100-color diverging palette
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
image(vp,vr,vd,col=CL,xlab="PVENT",ylab="REPUL")
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Confusion matrix for the fitted logistic model at the 0.5 threshold.
probabilities <- predict(GLM, myocarde, type = "response")
# levels() returns NULL if PRONO was read as a character column (the
# read.table default since R 4.0); wrapping in factor() makes the lookup
# work whether PRONO is factor or character, without changing the labels.
predictions <- levels(factor(myocarde$PRONO))[(probabilities > .5) + 1]
table(predictions, myocarde$PRONO)
```
----
## Residuals and GLM
Pearson's residuals
$$
\widehat{\varepsilon}_i=\frac{y_i - \widehat{\pi}_i}{
\sqrt{\widehat{\pi}_i\cdot[1-\widehat{\pi}_i]}}
$$
Deviance residuals
$$
\widehat{\varepsilon}_i=\pm\sqrt{-2\left(
y_i \log[\widehat{\pi}_i]+
(1-y_i) \log[1-\widehat{\pi}_i]\right)}
$$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
E1=residuals(model_bernoulli,type="pearson")
E2=residuals(model_bernoulli,type="deviance")
```
----
## Residuals and GLM
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(1:nrow(myocarde),E1)
```
----
## Multinomial Regression
Binomial case, $Y\in\lbrace 0,1\rbrace$,
$\mathbb{P}[Y=1\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]$
$\mathbb{P}[Y=0\vert \boldsymbol{X}=\boldsymbol{x}]\propto1$
Multinomial case, $Y\in\lbrace A,B,C\rbrace$,
$\mathbb{P}[Y=\color{red}{A}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{red}{\boldsymbol{\beta}_A}]$
$\mathbb{P}[Y=\color{blue}{B}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{blue}{\boldsymbol{\beta}_B}]$
$\mathbb{P}[Y=\color{green}{C}\vert \boldsymbol{X}=\boldsymbol{x}]\propto1$
----
## Multinomial Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(VGAM,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_multi = vglm(PRONO~., data=myocarde, family="multinomial")
model_multi
```
----
## Generalized Linear Models
Poisson and Gamma distributions in the exponential family
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Tiny 5-point example: the same mean structure fitted under six
# combinations of family (gaussian / poisson / Gamma) and link
# (identity / log), to visualize how family and link affect the fit.
base=data.frame(x=c(1,2,3,4,5),y=c(1,2,4,2,6))
regNId <- glm(y~x,family=gaussian(link="identity"),data=base)   # equivalent to lm(y~x)
regNlog <- glm(y~x,family=gaussian(link="log"),data=base)
regPId <- glm(y~x,family=poisson(link="identity"),data=base)
regPlog <- glm(y~x,family=poisson(link="log"),data=base)
regGId <- glm(y~x,family=Gamma(link="identity"),data=base)
regGlog <- glm(y~x,family=Gamma(link="log"),data=base)
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Plot helper: scatter of the toy data 'base', the fitted curve of
# 'regression' over [0,7], and a +/- 2 standard-error band (response scale).
#   regression : a fitted glm on base (x, y)
#   titre      : plot title
visuel=function(regression,titre=""){
plot(base$x,base$y,pch=19,cex=1.5,main=titre,xlab="",ylab="")
abs <- seq(0,7,by=.1)   # prediction grid
yp <- predict(regression,newdata=data.frame(x=abs),se.fit = TRUE, type="response")
# shaded band first, then redraw the points so they sit on top of it
polygon(c(abs,rev(abs)),c(yp$fit+2*yp$se.fit,rev(yp$fit-2*yp$se.fit)), col="light green",border=NA)
points(base$x,base$y,pch=19,cex=1.5)
lines(abs,yp$fit,lwd=2,col="red")
lines(abs,yp$fit+2*yp$se.fit,lty=2)
lines(abs,yp$fit-2*yp$se.fit,lty=2)}
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regNId,"Gaussienne, lien identité")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regPId,"Poisson, lien identité")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regGId,"Gamma, lien identité")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regNlog,"Gaussienne, lien logarithmique")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regPlog,"Poisson, lien logarithme")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regGlog,"Gamma, lien logarithme")
```