---
title: "Data Science for Actuaries"
author: "Arthur Charpentier, @freakonometrics"
highlighter: highlight.js
output: pdf_document
job: 2018
knit: slidify::knit2slides
mode: selfcontained
hitheme: tomorrow
framework: io2012
widgets: [mathjax, quiz, bootstrap, interactive] # {mathjax, quiz, bootstrap}
ext_widgets: {rCharts: [libraries/nvd3, libraries/leaflet, libraries/dygraphs]}
assets: {assets: /assets}
#- shiny
# library(devtools)
# install_github("slidify", "ramnathv")
# install_github("slidifyLibraries", "ramnathv")
---

# R & Packages

```{r set-options, echo=TRUE, cache=FALSE}
getOption("defaultPackages")
(.packages(all.available=TRUE))
rm(list=ls())
options(width = 80)
library(knitr); library(htmlwidgets); library(dplyr); library(leaflet)
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(pryr,verbose=FALSE,quietly=TRUE)
library(wordcloud,verbose=FALSE,quietly=TRUE)
library(tm,verbose=FALSE,quietly=TRUE)
```

---

# Using with RStudio

Download **RStudio** from http://www.rstudio.com
---

```{r, error=TRUE}
?rq
library(quantreg,verbose=FALSE,quietly=TRUE)
require(quantreg)
?rq
rq
detach(package:quantreg, unload=TRUE)
?rq
```

---

```{r}
apropos("norm")
?pnorm
```

---

# S3 vs S4

```{r}
person3 <- function(name,age,weight,height){
  characteristics<-list(name=name,age=age,weight=weight,height=height)
  class(characteristics)<-"person3"
  return(characteristics)}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)
JohnDoe3$age
BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)}
BMI3(JohnDoe3)
```

---

```{r}
JohnDoe3
```

---

```{r}
reg3 <- lm(dist~speed,data=cars)
reg3$coefficients
coef(reg3)
```

---

```{r}
summary(reg3)
```

---

```{r}
setClass("person4", representation(name="character",
  age="numeric", weight="numeric", height="numeric"),where=.GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)
JohnDoe4
```

---

```{r}
JohnDoe4@age
setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")),
  where=.GlobalEnv)
setMethod("BMI4", "person4",where=.GlobalEnv,
  function(object){return(object@weight*1e4/object@height^2)})
BMI4(JohnDoe4)
```

---

```{r}
library(VGAM,verbose=FALSE,quietly=TRUE)
reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients
coef(reg4)
```

---

```{r}
pnorm(3.65102,mean=5,sd=2)
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
X <- Norm(mean=5,sd=2)
mean(X)
p(X)(3.65102)
```

---

```{r}
qnorm(0.25,mean=5,sd=2)
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
distr::q(X)
distr::q(X)(0.25)
```

---

```{r}
X1 <- Norm(mean=5,sd=2)
X2 <- Norm(mean=2,sd=1)
S <- X1+X2
distr::q(S)(.5)
```

---

```{r}
plot(S)
```

---

# R objects : Numbers

```{r}
set.seed(1)
U <- runif(20)
U[1:4]
U[1] == 0.2655087
```

---

```{r}
options(digits = 3)
U[1:4]
options(digits = 22)
U[1:4]
options(digits = 3)
```

---

```{r}
x <- exp(1)
y <- x
x <- 2
y
class(x)
```

---

```{r}
0/0
1/0
.Machine$double.xmax
2e+307
# NOTE(review): the source is corrupted at this point (it reads "2e+3070])"),
# and the chunk that defined the vector `x` used by the subsetting examples
# below was lost. `x <- 1:10` is a plausible reconstruction, consistent with
# the indexing examples that follow — confirm against the original deck.
x <- 1:10
x[-(3:8)] # x[-3:8]
x[c(TRUE,FALSE)]
```

---
```{r}
x[c(TRUE,NA,FALSE,TRUE)]
x[2]
x[[2]]
```

---

```{r}
for(i in 1:2){
  nom_var <- paste0("x", i)
  assign(nom_var, rpois(5,7))
}
x1
x2
```

---

```{r}
x <- runif(4)
x
x
```

---

```{r, error=TRUE}
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
# NOTE(review): the source is corrupted here (it reads "x %.5)"); the pryr
# example and the start of the scalar-condition example were lost. The
# `u <- runif(1); if(u>.5)` opening is reconstructed from the vector variant
# on the next slide — confirm against the original deck.
u <- runif(1)
if(u>.5) {("greater than 50%")} else {("smaller than 50%")}
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
u
```

---

```{r}
u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
u
```

---

# Loops

```{r}
v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))
sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))
```

---

```{r}
sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])
sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))
```

---

```{r}
system.time(Vectorize(sqrt)(v_x))
sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))
```

---

```{r}
library(parallel,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
(allcores <- detectCores(all.tests=TRUE))
sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))
```

---

```{r}
f<-function(x) dlnorm(x)
integrate(f,0,Inf)
integrate(f,0,1e5)
integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value
```

---

# Apply

```{r}
rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)
set.seed(1)
rN.Poisson(5)
rX.Exponential(3)
```

---

```{r}
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for(i in 1:n){
    N <- rN(1)
    if(N>0){V[i] <- sum(rX(N))}
  }
  return(V)}
rcpd1(3)
```

---

```{r}
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for (i in 1:n) V[i] <- sum(rX(rN(1)))
  return(V)}
rcpd1(3)
```

---

```{r}
rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  return(as.numeric(xtabs(X ~ I)))}
rcpd2(3) ``` --- ```{r} rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ N <- rN(n) X <- rX(sum(N)) I <- factor(rep(1:n,N),levels=1:n) V <- tapply(X,I,sum) V[is.na(V)] <- 0 return(as.numeric(V))} rcpd3(3) ``` --- ```{r} rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(sapply(rN(n), function(x) sum(rX(x))))} rcpd4(3) ``` --- ```{r} rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(sapply(Vectorize(rX)(rN(n)),sum))} rcpd5(3) ``` --- ```{r} rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(unlist(lapply(lapply(t(rN(n)),rX),sum)))} rcpd6(3) ``` --- ```{r} n <- 100 library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) options(digits=1) microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n)) options(digits=5) ``` --- # Vectorization ```{r} f <- function(x,m=0,s=1){ H<-function(t) 1-pnorm(t,m,s) integral<-integrate(H,lower=x,upper=Inf)$value res<-H(x)/integral return(res) } f(0) f(0:1) ``` --- ```{r} y <- rep(NA,2) x <- 0:1 for(i in 1:2) y[i] <- f(x[i]) y sapply(x,"f") Vectorize(f)(x) ``` --- ```{r} binorm <- function(x1,x2,r=0){ exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2)) } u <- seq(-2,2) binorm(u,u) outer(u,u,binorm) ``` --- ```{r} (uv<-expand.grid(u,u)) matrix(binorm(uv$Var1,uv$Var2),5,5) ``` --- ```{r} u <- seq(-2,2,length=31) persp(u,u,outer(u,u,binorm),theta=25) ``` --- # Computational Efficiency ```{r} library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) n <- 1000 A<-matrix(seq(1,n^2),n,n) B<-matrix(seq(1,n^2),n,n) C<-1:n microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1) ``` --- ```{r} m <- 1000 n <- 50 X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m) group <- rep(c("A","B"), each = n / 2) system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat) system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat) ``` --- ```{r} t.test_1 <- function(x, grp){ t.test(x[ group == "A"], x[ group == "B"])$stat } system.time(t1 <- apply(X, 1, 
t.test_1, grp = grp)) ``` --- ```{r} t.test_2 <- function(x, grp) { t_stat <- function(x) { m <- mean(x) n <- length(x) var <- sum((x - m) ^ 2) / (n - 1) list(m = m, n = n, var = var) } g1 <- t_stat(x[grp == "A"]) g2 <- t_stat(x[grp == "B"]) se_total <- sqrt(g1$var / g1$n + g2$var / g2$n) (g1$m - g2$m) / se_total } system.time(apply(X, 1, t.test_2, grp = group)) ``` --- ```{r} t.test_3 <- function(X, grp){ t_stat <- function(X) { m <- rowMeans(X) n <- ncol(X) var <- rowSums((X - m) ^ 2) / (n - 1) list(m = m, n = n, var = var) } g1 <- t_stat(X[, grp == "A"]) g2 <- t_stat(X[, grp == "B"]) se_total <- sqrt(g1$var / g1$n + g2$var / g2$n) (g1$m - g2$m) / se_total } system.time( t.test_3(X, group)) ``` --- ```{r} "%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s 100 %pm% 10 'second<-' <- function(x, value) { x[2] <- value return(x) } x <- 1:10 second(x) <- 5 x ``` --- ```{r} library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) x <- 1:10 address(x) second(x) <- 6 address(x) ``` --- ```{r} ls(globalenv()) find("x") find("pi") environment(sd) ``` --- ```{r} f <- function(x) x^2 f formals(f) body(f) ``` --- ```{r} e <- new.env() e$d <- 1 e$f <- 1:5 e$g <- 1:5 ls(e) str(e) identical(globalenv(),e) search() ``` --- ```{r} f1 <- function(x1){ f2 <- function(x2){ f3 <- function(x3){ x1+x2+x3 } f3(3) } f2(2) } f1(1) ``` --- ```{r, error=TRUE} f <- function(x) log(x) f("x") try(f("x")) inherits(try(f("x")), "try-error") inherits(try(f("x"), silent=TRUE), "try-error") ``` --- ```{r} a=0 try(a<-f("x"), silent=TRUE) a try(a<-f(x), silent=TRUE) a ``` --- ```{r, error=TRUE} x=1:10 g=function(f) f(x) g(mean) ``` --- ```{r} fibonacci <- function(n){ if(n<2) return(1) fibonacci(n-2)+fibonacci(n-1) } fibonacci(20) system.time(fibonacci(30)) ``` --- ```{r} library(memoise,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) fibonacci <- memoise(function(n){ if(n<2) return(1) fibonacci(n-2)+fibonacci(n-1) }) system.time(fibonacci(30)) ``` --- ```{r} set.seed(1) x<-rexp(6) sum(x) 
.Primitive("sum")(x) ``` --- ```{r} library(Rcpp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) cppFunction('double sum_C(NumericVector x) { int n = x.size(); double total = 0; for(int i = 0; i < n; ++i) { total += x[i]; } return total; }') sum_C(x) ``` --- ```{r} library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) x <- runif(1e6) microbenchmark( sum(x), .Primitive("sum")(x), sum_C(x)) ``` --- ```{r} names_list <- function(...){ names(list(...)) } names_list(a=5,b=7) ``` --- ```{r, error=TRUE} power <- function(exponent) { function(x) { x ^ exponent } } square <- power(2) square(4) cube <- power(3) cube(4) ``` --- ```{r} cube ``` --- ```{r} total <- 20 pb <- txtProgressBar(min = 0, max = total, style = 3) for(i in 1:(10*total)){ Sys.sleep(0.01) if(i %%10 == 0) setTxtProgressBar(pb, i%/%10) } close(pb) ``` --- # R objects : Data Frames ```{r} df <- data.frame(x=1:3,y=letters[1:3]) str(df) typeof(df) class(df) ``` --- ```{r} cbind(df,z=9:7) df$z<-5:3 df ``` --- ```{r} df[1] df[,1,drop=FALSE] df[,1,drop=TRUE] ``` --- ```{r} df[[1]] df$x df[,"x"] df[["x"]] ``` --- ```{r} set.seed(1) df[sample(nrow(df)),] set.seed(1) df[sample(nrow(df),nrow(df)*2,replace=TRUE),] ``` --- ```{r} subset(df, x>1) ``` --- ```{r} set.seed(123) df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50, replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE))) tail(df) ``` --- ```{r} reg <- lm(Y~X1+X2,data=df) model.matrix(reg)[45:50,] ``` --- ```{r} reg <- lm(Y~(X1=="A")+X2,data=df) model.matrix(reg)[45:50,] ``` --- ```{r} reg <- lm(Y~X1*X2,data=df) model.matrix(reg)[45:50,] ``` --- ```{r} reg <- lm(Y~X1+X2%in%X1,data=df) model.matrix(reg)[45:50,] ``` --- ```{r} download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData") load("superheroes.RData") superheroes publishers ``` --- ```{r} library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) inner_join(superheroes, publishers) ``` --- ```{r} inner_join(publishers, 
superheroes) ``` --- ```{r} merge(superheroes, publishers, all = TRUE) ``` --- ```{r} semi_join(superheroes, publishers) ``` --- ```{r} semi_join(publishers, superheroes) ``` --- ```{r} left_join(superheroes, publishers) ``` --- ```{r} left_join(publishers, superheroes) ``` --- ```{r} anti_join(superheroes, publishers) ``` --- ```{r} anti_join(publishers, superheroes) ``` --- ```{r} download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt", "gapminderDataFiveYear.txt") gdf <- read.delim("gapminderDataFiveYear.txt") head(gdf,4) ``` --- ```{r} str(gdf) ``` --- ```{r} library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) gtbl <- tbl_df(gdf) library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) gdfdt <- data.table(gdf) gdfdt ``` --- ```{r,error=TRUE} subset(gdf, lifeExp < 30) gdf[lifeExp < 30,] gdf[gdf$lifeExp < 30,] ``` --- ```{r} filter(gtbl, lifeExp < 30) subset(gdfdt, lifeExp < 30) gdfdt[lifeExp < 30,] ``` --- ```{r} gdf[gdf$country == "Italy", c("year", "lifeExp")] ``` --- ```{r} gtbl %>% filter(country == "Italy") %>% select(year, lifeExp) gdfdt[country == "Italy",cbind(year, lifeExp)] setkey(gdfdt,country) gdfdt["Italy",cbind(year, lifeExp)] ``` ---- # Data frames (and other format) ```{r} library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) ldf <- tbl_df(gdf) glimpse(ldf) ``` --- ```{r} select(ldf,lifeExp) select(ldf,lifeExp)$lifeExp[1:10] ``` --- ```{r} ldf[4:5,] slice(ldf,4:5) ``` --- ```{r} filter(ldf,lifeExp<30 | gdpPercap<300) rename(ldf,population = pop) ``` --- ```{r} library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) dt <- data.table(gdf) is.data.frame(dt) ``` --- ```{r} object.size(gdf) object.size(ldf) object.size(dt) ``` --- ```{r} df[df$lifeExp<30,] subset(df, lifeExp < 30) filter(ldf, lifeExp < 30) ``` --- ```{r} subset(dt, lifeExp < 30) dt[lifeExp<30] ``` --- ```{r} setkey(dt, country) dt["Italy"] ``` --- ```{r} dt["Italy",mult="first"] dt["Italy",max(lifeExp)] 
max(df$lifeExp[df$country=="Italy"]) ``` --- ```{r} setkey(dt, country, lifeExp) dt["Italy"] ``` --- ```{r} dt[.("Italy",77.44)] dt[.("Italy",77)] dt[.("Italy",77),roll="nearest"] ``` --- ```{r} df[df$country == "Italy", c("year", "lifeExp")] ldf %>% filter(country == "Italy") %>% select(year, lifeExp) ``` --- ```{r} small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")] ``` --- ```{r} library(tidyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) str(small_df) table_df=spread(small_df,"country","lifeExp") table_df ``` --- ```{r} data_df=gather(table_df,"country","lifeExp",2:4) data_df ``` --- ```{r} library(reshape2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) dcast(small_df, formula = country~year, value.var="lifeExp") table_df= dcast(small_df, formula = year~country, value.var="lifeExp") table_df ``` --- ```{r} melt(table_df, id.vars=c("year"), value.name = "lifeExp", variable.name="country") ``` --- ```{r} aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country)) tapply(small_df$lifeExp,factor(small_df$country),max) ``` --- ```{r} small_ldf = filter(ldf,country %in% c("France","Italy","Spain")) group_by(small_ldf,country) %>% summarize(max(lifeExp)) filter(ldf,country %in% c("France","Italy","Spain")) %>% group_by(country) %>% summarize(max(lifeExp)) ``` --- ```{r} filter(ldf,country %in% c("France","Italy","Spain")) %>% group_by(country) %>% summarize(max(lifeExp)) ``` --- ```{r} plot(df$gdpPercap,df$lifeExp) ``` --- ```{r} select(ldf,gdpPercap,lifeExp) %>% plot() ``` --- ```{r} setkey(dt, country) dt[c("France","Italy","Spain"),max(lifeExp),by="country"] dt[c("France","Italy","Spain"),max(lifeExp),by="country"] ``` --- ```{r} head(df[with(df,order(-lifeExp)),]) head(dt[order(-lifeExp)]) head(setorder(dt,-lifeExp)) arrange(ldf,-lifeExp) ``` --- ```{r} df$gdp=df$pop*df$gdpPercap df = within(df,gdp = pop*gdpPercap) df = within(df,gdp <- pop*gdpPercap) dt[,gdp:=pop*gdpPercap] head(dt) ``` --- ```{r} 
mutate(ldf,gdp=pop*gdpPercap) ``` --- ```{r} countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE) countries = countries[-1,] head(countries) df2 = countries[,c("Country","Capital")] names(df2)=c("countries","capital") ``` --- ```{r} df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country)) names(df1)=c("countries","lifeExp") dim(df1) dim(df2) tail(merge(df1,df2)) ``` --- ```{r} tail(merge(df1,df2,all.x=TRUE)) tail(merge(df1,df2,all.y=TRUE)) ``` --- ```{r} ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) names(ldf1)=c("countries","lifeExp") ldf2 <- tbl_df(df2) inner_join(ldf1,ldf2) left_join(ldf1,ldf2) ldf1 %>% inner_join(ldf2) ``` --- ```{r} dt1 = dt[,max(lifeExp),by="country"] setnames(dt1,c("countries","lifeExp")) dt2 <- data.table(df2) setkey(dt1,"countries") setkey(dt2,"countries") dt1[dt2,nomatch=0] dt1[dt2] dt2[dt1] ``` --- ```{r} library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) library(lubridate,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) library(plyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) library(LaF,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) ``` --- ```{r} loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt" download.file(loc,destfile="temp") ssn <- laf_open_csv("temp",sep="\t", column_types=c("character","integer","numeric","character","numeric","numeric"), column_names=names(df)[1:6]) object.size(ssn) str(ssn) nrow(ssn) ``` --- ```{r} go_through <- which(ssn[1:(nrow(ssn)-1),'country'] != ssn[2:nrow(ssn),'country'])+1 if(go_through[length(go_through)] != nrow(ssn)) go_through <- c(go_through, nrow(ssn)) go_through <- cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))]-1, go_through[length(go_through)])) ``` --- ```{r} worst_life_exp <- function(s){ data <- ssn[go_through[s,1]:go_through[s,2],c("country","year","lifeExp")] data[which.min(data$lifeExp),] } worst_life_exp(10) ``` --- # 
Graphs with ggplot2 ```{r} library(gamair,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) data(chicago) head(chicago) ``` --- ```{r} base=data.frame(death=chicago$death, temp_F=chicago$tmpd, o3=chicago$o3median, date=seq(as.Date("1987-01-01"), as.Date("2000-12-31"),by=1)) base$temp_C <- (base$temp_F-32)/1.8 base$year <- substring(base$date,1,4) ``` --- ```{r} date2season <- function(date){ m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m")) d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d")) s <- NA if(m %in% c(1,2) | ((m==12)&(d>=21)) | ((m==3)&(d<21))) s <- "winter" if(m %in% c(4,5) | ((m==3)&(d>=21)) | ((m==6)&(d<21))) s <- "spring" if(m %in% c(7,8) | ((m==6)&(d>=21)) | ((m==9)&(d<21))) s <- "summer" if(m %in% c(10,11) | ((m==9)&(d>=21)) | ((m==12)&(d<21))) s <- "autumn" return(s)} base$season <- sapply(base$date,date2season) head(base) ``` --- ```{r} plot(base[,c("date", "temp_C")]) ``` --- ```{r} library(ggplot2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) g <- ggplot(base, aes(date, temp_C)) + geom_point() g ggsave(file="figure.pdf", scale=3) ``` --- ```{r} g <- g+labs(title='Temperature') g <- g+ggtitle('Temperature') g ``` --- ```{r} g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red")) ``` --- ```{r} g<-g+labs( x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="Temperature in Chicago") g ``` --- ```{r} g+ theme( axis.title.x = element_text(color="blue"), axis.title.y = element_text(color="blue") ) ``` --- ```{r} g + ylim(c(-20,60)) g <- g + xlim(as.Date(c("1995-01-01","1999-01-01"))) g ``` --- ```{r} base2=data.frame( today=base$temp_C[1:5113], tomorrow=base$temp_C[2:5114]) ggplot(base2, aes(today, tomorrow))+ geom_point()+ coord_equal() ``` --- ```{r} g <- ggplot(base, aes(date, temp_C, color=factor(season)))+ xlim(as.Date(c("1995-01-01","1999-01-01")))+ geom_point() g+theme(legend.title=element_blank())+ guides(colour = guide_legend(override.aes = list(size=4))) ``` --- 
```{r} g<-ggplot(base, aes(date, temp_C))+ xlim(as.Date(c("1995-01-01","1999-01-01"))) g+geom_line(color="grey") ``` --- ```{r} g+geom_line(aes(color="temperature"))+ scale_colour_manual(name='', values=c('temperature'='grey')) ``` --- ```{r} ggplot(base, aes(date, temp_C, color=factor(season)))+ geom_point() + scale_color_manual(values= c("dodgerblue4", "darkolivegreen4", "darkorchid3", "goldenrod1")) ``` --- ```{r} ggplot(base, aes(date, temp_C, color=factor(season)))+ geom_point() + scale_color_brewer(palette="Set1") ``` --- ```{r} ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+ scale_color_gradient(low="blue", high="red") ``` --- ```{r} ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+ scale_color_gradient2(low="blue", mid="white", high="red") ``` --- ```{r} ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+ theme(panel.background = element_rect(fill = 'light yellow')) ``` --- ```{r} ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+ facet_wrap(~year, ncol=2, scales="free") ``` --- ```{r, error=TRUE} library(ggthemes) ggplot(base, aes(date, temp_C, color=factor(season)))+ geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+ theme_excel()+scale_colour_excel()+ theme(legend.title=element_blank()) ``` --- ```{r, error=TRUE} library(ggthemes) ggplot(base, aes(date, temp_C, color=factor(season)))+ geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like 'The Economist'")+ theme_economist()+ theme(legend.title=element_blank()) ``` --- ```{r nvd3plot, results = 'asis', commment = NA, message = F, echo = F} require(rCharts) haireye = as.data.frame(HairEyeColor) n1 <- nPlot(Freq ~ Hair, group = 'Eye', data = subset(haireye, Sex == 'Male'), type = 'multiBarChart' ) n1$print('chart1') ``` --- # OpenStreetMap ```{r} library(OpenStreetMap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) map <- 
openmap(c(lat= 51.516, lon= -.141), c(lat= 51.511, lon= -.133)) map <- openproj(map, projection = "+init=epsg:27700") plot(map) ``` --- ```{r, error = TRUE} loc="http://freakonometrics.free.fr/cholera.zip" download.file(loc,destfile="cholera.zip") unzip("cholera.zip", exdir="./") library(maptools,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) deaths<-readShapePoints("Cholera_Deaths") head(deaths@coords) ``` --- ```{r, error = TRUE} plot(map) points(deaths@coords,col="red", pch=19, cex=.7 ) ``` --- ```{r, error = TRUE} X <- deaths@coords library(KernSmooth) kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2]))) library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20) ``` --- ```{r, error = TRUE} plot(map) points(deaths@coords,col="red", pch=19, cex=.7 ) image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs) contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE) ``` --- # Google Map ```{r, error = TRUE} library(ggmap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) get_london <- get_map(c(-.137,51.513), zoom=17) london <- ggmap(get_london) ``` --- # OpenStreetMap ```{r} london ``` --- ```{r, error = TRUE} df_deaths <- data.frame(X) library(sp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) coordinates(df_deaths)=~coords.x1+coords.x2 proj4string(df_deaths)=CRS("+init=epsg:27700") df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84")) ``` --- # OpenStreetMap ```{r} london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red") ``` --- ```{r, error = TRUE} london <- london + geom_point(aes(x=coords.x1, y=coords.x2), data=data.frame(df_deaths@coords),col="red") + geom_density2d(data = data.frame(df_deaths@coords), aes(x = coords.x1, y=coords.x2), size = 0.3) + stat_density2d(data = data.frame(df_deaths@coords), aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = 
..level..), size = 0.01, bins = 16, geom = "polygon") + scale_fill_gradient(low = "green", high = "red", guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE) ``` --- ```{r} london ``` --- ```{r, error = TRUE} require(leaflet) deaths <- readShapePoints("Cholera_Deaths") df_deaths <- data.frame(deaths@coords) coordinates(df_deaths)=~coords.x1+coords.x2 proj4string(df_deaths)=CRS("+init=epsg:27700") df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84")) df=data.frame(df_deaths@coords) lng=df$coords.x1 lat=df$coords.x2 m = leaflet()%>% addTiles() ``` ```{r, eval=FALSE} m %>% fitBounds(-.141, 51.511, -.133, 51.516) ``` ```{r, echo=FALSE, warning=FALSE, cache=FALSE} m = m %>% fitBounds(-.141, 51.511, -.133, 51.516) saveWidget(m, 'leaflet_cholera.html') ``` --- ```{r, 'cholera_map_osm', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} cat('') ``` --- ```{r, error = TRUE} rd=.5 op=.8 clr="blue" m = leaflet() %>% addTiles() m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) ``` ```{r, echo=FALSE, warning=FALSE, cache=FALSE} m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) saveWidget(m, 'leaflet_cholera_circles.html') ``` --- ```{r, 'cholera_map_osm_circle', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} cat('') ``` --- ```{r, error = TRUE} X=cbind(lng,lat) kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2]))) x=kde2d$x1 y=kde2d$x2 z=kde2d$fhat CL=contourLines(x , y , z) m = leaflet() %>% addTiles() m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) ``` ```{r, eval=F} m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) ``` ```{r, echo=FALSE, warning=FALSE, cache=FALSE} m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) saveWidget(m, 'leaflet_cholera_kde.html') ``` --- ```{r, 'cholera_map_osm_kde', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} cat('') ``` --- ```{r, error = TRUE} m = leaflet() %>% addTiles() m %>% addCircles(lng,lat, radius = 
rd,opacity=op,col=clr) %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) m = leaflet() %>% addTiles() m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE) ``` ```{r, echo=FALSE, warning=FALSE, cache=FALSE} m = leaflet() %>% addTiles() m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) saveWidget(m, 'leaflet_cholera_kde2.html') ``` ```{r, 'cholera_map_osm_kde2', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} cat('') ``` ```{r, echo=FALSE, warning=FALSE, cache=FALSE} m = leaflet() %>% addTiles() m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE) saveWidget(m, 'leaflet_cholera_kde3.html') ``` ```{r, 'cholera_map_osm_kde3', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} cat('') ``` --- ```{r, error = TRUE} m = leaflet() %>% addTiles() m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>% addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>% addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red") ``` ```{r, echo=FALSE, warning=FALSE, cache=FALSE} m = leaflet() %>% addTiles() m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>% 
addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>% addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red") saveWidget(m, 'leaflet_cholera_kde4.html') ``` ```{r, 'cholera_map_osm_kde4', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} cat('') ``` --- # Map with googleVis ```{r, warning=FALSE, cache=FALSE, error = FALSE, message=FALSE} library(googleVis) df=data.frame(state=c("AU-WA","AU-VIC","AU-NT", "AU-NSW", "AU-SA","AU-QLD","AU-TAS","NZ"), R_users=c(323,425,154,486,201,195,87,123)) Geo <- gvisGeoChart(df, locationvar="state",colorvar=c("R_users"), options=list(region="AU", dataMode="regions",resolution="provinces", width="80%", height="80%")) ``` --- # Map with googleVis ```{r, 'gvisGeoChart', echo=FALSE, warning=FALSE,cache=FALSE,results='asis'} print(Geo, tag = "chart") ``` --- ```{r, error = TRUE} library(XML,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) year <- 2012 loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="") tabs <- readHTMLTable(htmlParse(loc)) tabs[1] storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" ")) storms ``` --- ```{r, error = TRUE} index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx") nstorms <- storms[!index] nstorms TRACK=NULL ``` --- ```{r, error = TRUE} for(i in length(nstorms):1){ if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN" if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA" if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN" if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL" if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA" if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN" if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE" if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD" if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH" if((nstorms[i]=="CHARLI")&(year==1972)) 
nstorms[i] <- "CHARLIE"} ``` --- ```{r} for(i in length(nstorms):1){ loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="") track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20)) names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT") track$LAT=as.numeric(as.character(track$LAT)) track$LON=as.numeric(as.character(track$LON)) track$WIND=as.numeric(as.character(track$WIND)) track$PR=as.numeric(as.character(track$PR)) track$year=year track$name=nstorms[i] TRACK=rbind(TRACK,track)} tail(TRACK,10) ``` --- ```{r,error = TRUE} extract.track <- function(year=2012,p=TRUE){ loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="") tabs <- readHTMLTable(htmlParse(loc)) storms <- unlist(strsplit(as.character( tabs[[1]]$Name),split=" ")) index <- storms%in%c("Tropical","Storm",paste("Hurricane-",1:6,sep=""),"Depression","Subtropical","Extratropical","Low",paste("Storm-",1:6,sep=""), "Xxx") nstorms <- storms[!index] TRACK=NULL for(i in length(nstorms):1){ loc=paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="") track=read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20)) names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT") track$LAT=as.numeric(as.character(track$LAT)) track$LON=as.numeric(as.character(track$LON)) track$WIND=as.numeric(as.character(track$WIND)) track$PR=as.numeric(as.character(track$PR)) track$year=year track$name=nstorms[i] TRACK=rbind(TRACK,track) if(p==TRUE){ cat(year,i,nstorms[i],nrow(track),"\n")}} return(TRACK)} ``` --- ```{r,error = TRUE} TOTTRACK=NULL # for(y in 2012:1851){ # TOTTRACK=rbind(TOTTRACK,extract.track(y)) # } loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata" download.file(loc,destfile="TRACK-ATLANTIC.Rdata") load("TRACK-ATLANTIC.Rdata") dim(TOTTRACK) ``` --- ```{r} library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) library(maps,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) map("world", 
col="light yellow", fill=TRUE) ``` --- ```{r} map("world", col="light yellow", fill=TRUE) for(n in unique(TOTTRACK$name)){ lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))} ``` --- # Shapefiles ```{r} loc="http://freakonometrics.free.fr/Italy.zip" download.file(loc,destfile="Italy.zip") unzip("Italy.zip", exdir="./") library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE) names(ita1) ``` --- ```{r} plot(ita1,col="light green") ``` --- ```{r} ita1@data[,"NAME_1"][1:6] pos<-which(ita1@data[,"NAME_1"]=="Sicily") Poly_Sicily<-ita1[pos,] ``` --- ```{r} plot(ita1,col="light green") plot(Poly_Sicily,col="yellow",add=TRUE) ``` --- ```{r} plot(ita1,col="light green") plot(Poly_Sicily,col="yellow",add=TRUE) abline(v=5:20,col="light blue") abline(h=35:50,col="light blue") axis(1) axis(2) ``` --- ```{r} pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily")) ita_south <- ita1[pos,] ita_north <- ita1[-pos,] plot(ita1) plot(ita_south,col="red",add=TRUE) plot(ita_north,col="blue",add=TRUE) ``` --- ```{r} library(xlsx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) loc="http://freakonometrics.free.fr/Italy_codes.xlsx" download.file(loc,destfile="Italy_codes.xlsx") data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE) names(data_codes)[1]="NAME_1" ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE) pos<-which(ita2@data[,"GROUP"] == "SOUTH") ``` --- ```{r} library(rgeos,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE) ita_s<-gUnionCascaded(ita_south) ita_n<-gUnionCascaded(ita_north) plot(ita1) plot(ita_s,col="red",add=TRUE) plot(ita_n,col="blue",add=TRUE) ``` --- ```{r} plot(ita1,col="light green") plot(Poly_Sicily,col="yellow",add=TRUE) gCentroid(Poly_Sicily,byid=TRUE) points(gCentroid(Poly_Sicily,byid=TRUE),pch=19,col="red") ``` --- ```{r} G 
<- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)
```
---
```{r}
# Two polylines through selected region centroids; their crossing point
# is computed with rgeos::gIntersection.
c1 <-G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
# Give both lines the same CRS as the shapefile before intersecting.
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords
```
---
```{r}
plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)
```
---
## Linear Model
$$ y_i=\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta}+\varepsilon_i=\beta_0 + [\beta_1 x_{1,i}+\cdots+ \beta_k x_{k,i}]+\varepsilon_i $$
See linear model, where $\varepsilon_i\sim\mathcal{N}(0,\sigma^2)$ i.id.
$$ (Y\vert \boldsymbol{X}=\boldsymbol{x})\sim \mathcal{N}(\boldsymbol{x}^{\text{T}}\boldsymbol{\beta}, \sigma^2) $$
i.e.
$$ \mathbb{E}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\boldsymbol{x}^{\text{T}}\boldsymbol{\beta} $$
and homoscedastic model,
$$ \text{Var}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\sigma^2 $$
Least squares (and maximum likelihood) estimator
$$ \widehat{\boldsymbol{\beta}}=\text{argmin} \left\lbrace \sum_{i=1}^n (y_i-\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta})^2 \right\rbrace =(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}\boldsymbol{y}$$
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
```
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Reference model used throughout the deck: stopping distance ~ speed.
model <- lm(dist ~ speed, data=cars)
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Design matrix with an explicit intercept column.
X=cbind(1,cars$speed)
Y=cars$dist
solve(crossprod(X,X),crossprod(X,Y))
model$coefficients
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Covariance of the OLS estimator: sigma^2 (X'X)^{-1}, checked against vcov().
summary(model)$sigma^2*solve(crossprod(X,X))
vcov(model)
n=nrow(cars)
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Confidence interval for the intercept, reproduced by hand with the
# Student quantile on n-2 degrees of freedom.
confint(model)
model$coefficients[1]+qt(c(.025,.975),n-2)*
summary(model)$coefficients[1,2]
```
---
## Linear Model
coefficient of determination $\displaystyle{R^2 = \frac{\text{explained variance}}{\text{total variance}}}$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)$r.squared
1-deviance(model)/sum((Y-mean(Y))^2)
```
$$ \text{Var}[Y]= \text{Var}[\mathbb{E}[Y\vert X]]+ \mathbb{E}[\text{Var}[Y\vert X]] $$
see analysis of variance
---
## Linear Model
$$ \overline{R}^2 = 1-[1-R^2]\cdot \frac{n-1}{n-(k-1)-1} $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
summary(model)$adj.r.squared
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# F-test of the slope: full model against the intercept-only model.
anova(lm(dist~speed,data=cars),lm(dist~1,data=cars))
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red")
```
---
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red")
# Prediction grid reused by the later smoothing slides.
x=seq(2,26)
y=predict(model, newdata=data.frame(speed=x))
lines(x,y,lwd=2,col="blue")
```
----
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# interval="confidence" returns fit, lwr, upr columns.
y=predict(model, newdata=data.frame(speed=x), interval = "confidence")
head(y)
```
$$
\text{Var}[\widehat{Y}(\boldsymbol{x})]=\text{Var}[\boldsymbol{x}^{\text{T}}\widehat{\boldsymbol{\beta}}]=\boldsymbol{x}^{\text{T}}\text{Var}[\widehat{\boldsymbol{\beta}}]\boldsymbol{x} =\widehat{\sigma}^2 \boldsymbol{x}^{\text{T}}[\boldsymbol{X}^{\text{T}}\boldsymbol{X}]^{-1}\boldsymbol{x} $$
----
## Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Shaded confidence band between the lwr (y[,2]) and upr (y[,3]) columns.
plot(cars)
polygon(c(x,rev(x)),c(y[,2],rev(y[,3])),col=rgb(0,0,1,.4),border=NA)
lines(x,y[,1],lwd=2,col="blue")
```
----
## Linear Model and Bagging
Method 1
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Pair bootstrap: resample rows of `cars`, refit, and store the fitted
# curve on the grid x; one row of Y per bootstrap replicate.
Y=matrix(NA,1000,length(x))
for(b in 1:1000){
idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
modelb <- lm(dist ~ speed, data=cars[idx,])
Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}
```
See bootstrap, based on pseudo-sample
$$ \lbrace(\boldsymbol{x}_{i_1},y_{i_1}), \cdots,(\boldsymbol{x}_{i_n},y_{i_n}) \rbrace $$
where $(i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace$.
----
## Linear Model and Bagging
Method 1
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))
```
----
## Linear Model and Bagging
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Bagged prediction (column means) with pointwise 95% bootstrap bands.
plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)
```
----
## Linear Model and Bagging
Method 2
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Residual bootstrap: keep the design fixed and resample the fitted
# residuals, so each pseudo-response is yhat_i + eps_{i_b}.
pred_dist=predict(model)
epsilon =residuals(model)
Y=matrix(NA,1000,length(x))
for(b in 1:1000){
idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
carsb=data.frame(speed=cars$speed,
dist=pred_dist+epsilon[idx])
modelb <- lm(dist ~ speed, data=carsb)
Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}
```
See bootstrap, based on pseudo-sample
$$ \lbrace(\boldsymbol{x}_{1},\widehat{y}_{1}+\widehat{\varepsilon}_{i_1}), \cdots,(\boldsymbol{x}_{n},\widehat{y}_{n}+\widehat{\varepsilon}_{i_n}) \rbrace $$
where $(i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace$.
----
## Linear Model and Bagging
Method 2
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))
```
----
## Linear Model and Bagging
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)
```
----
## Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Vertical segments show the OLS residuals (observed minus fitted).
plot(cars)
abline(model,col="red")
segments(cars$speed,cars$dist,cars$speed,predict(model),col="blue")
```
----
## Least Squares?
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Total least squares via PCA: the first principal axis of the (rescaled)
# point cloud; slope and intercept recovered from loadings and center.
cars2=cars; cars2[,2]=cars[,2]/10
plot(cars2,ylab="dist/10")
acp=princomp(cars2)
b=acp$loadings[2,1]/acp$loadings[1,1]
a=acp$center[2]-b*acp$center[1]
abline(a,b,col="red")
```
----
## Least Squares?
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Orthogonal projections of each point onto the first principal axis.
# NOTE: `t` here shadows base::t after the transpose call on the same line.
plot(cars2,ylab="dist/10",xlim=c(0,30),ylim=c(-1,12))
abline(a,b,col="red")
t <- acp$loadings[,1] %*% (t(cars2)-acp$center)
X1 <- acp$center[1] +t * acp$loadings[1,1]
X2 <- acp$center[2] +t * acp$loadings[2,1]
segments(cars2$speed,cars2$dist,X1,X2,col="blue")
```
----
## Least Absolute Deviation
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# L1 regression: minimize the sum of absolute residuals numerically.
f <- function(a) sum(abs(cars$dist-(a[1]+a[2]*cars$speed)))
opt <- optim( c(0,0), f )$par
plot(cars)
abline(model, col='red', lty=2)
abline(opt[1], opt[2],col="blue")
```
----
## Quantile Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(quantreg,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
abline(model, col="blue")
abline(rq(dist ~ speed,data=cars, tau=.5),col="red",lty=2)
abline(rq(dist ~ speed,data=cars, tau=.9),col="purple",lty=2)
```
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=1)
```
Scatterplot $(\widehat{Y}_i,\widehat{\varepsilon}_i)$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=2)
```
Scatterplot $\displaystyle{\left(\widehat{\varepsilon}_{i:n},\Phi^{-1} \left(\frac{i}{n}\right)\right)}$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=3)
```
Scatterplot $(\widehat{Y}_i,\sqrt{\vert\widehat{\varepsilon}_i\vert})$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=4)
```
Cook distance,
$\displaystyle{C_i=\frac{\widehat{\varepsilon}_i^2}{p\cdot\text{MSE}}\cdot\left(\frac{H_{i,i}}{(1-H_{i,i})^2}\right)}$ with $\boldsymbol{H}=\boldsymbol{X}(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}=[H_{i,i}]$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
C=cooks.distance(model)
```
----
## Diagnostic and Errors
$H_{i,i}$ are leverages, and define Studentized residuals as
$$ \widehat{r}_i=\frac{\widehat{\varepsilon}_i}{ \widehat{\sigma} \sqrt{1-H_{i,i}}} $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
rstudent(model)
```
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=5)
```
Scatterplot $(H_{i,i},\widehat{r}_i)$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(model,which=6)
```
Scatterplot $(H_{i,i},C_i)$
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Residual histogram with a kernel density estimate and a boxplot overlay.
hist(residuals(model),probability=TRUE, col="light green")
lines(density(residuals(model)),lwd=2,col="red")
boxplot(residuals(model),horizontal=TRUE,add=TRUE,at=.024,
pars=list(boxwex=.004),col=rgb(0,0,1,.25))
```
----
## Diagnostic and Errors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Empirical CDF of the standardized residuals against the N(0,1) CDF.
sigma=summary(model)$sigma
plot(ecdf(residuals(model)/sigma))
lines(seq(-3,3,by=.1),pnorm(seq(-3,3,by=.1)),lwd=2,col="red")
```
----
## Diagnostic and Errors
Kolmogorov-Smirnov $ d=\sup_{x\in\mathbb{R}}\lbrace \vert\Phi(x)-\widehat{F}_n(x)\vert \rbrace $
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
ks.test(residuals(model)/sigma,"pnorm")
```
----
##
Diagnostic and Errors
Anderson-Darling, Cramer-von Mises,
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(nortest,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ad.test(residuals(model))
cvm.test(residuals(model))
```
----
## Model Choice
AIC $AIC = 2k - 2\log(\mathcal{L}) = 2k + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]$
BIC $BIC = { k \log(n) -2 \log(\mathcal{L}) } = k \log(n) + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# AIC(model, k=log(n)) yields the BIC (n = nrow(cars), set earlier).
AIC(model)
AIC(model,k=log(n))
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Piecewise-linear fit (degree-1 B-splines with interior knots).
library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_brk <- lm(dist ~ bs(speed,degree=1,knots=(c(4,15,25))), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
```
see $b$-splines,
$$ y_i=\beta_0+\beta_1 x_i + \beta_2 (x_i-15)_+ + \varepsilon_i $$
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
lines(x,y,lwd=2,col="blue")
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Same broken-line model written with an explicit hinge (x-15)_+ term.
positive=function(x,s) ifelse(x>s,x-s,0)
model_brk <- lm(dist ~ speed + positive(speed,15), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
abline(coefficients(model_brk)[1],coefficients(model_brk)[2],col="blue",lty=2)
summary(model_brk)$coefficients
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Chow-type F statistics over candidate break points in dist ~ speed.
library(strucchange,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(Fstats(dist ~ speed,data=cars,from=7/50))
```
see Chow Test
----
## Testing Linear Assumptions
$$ W_t=\frac{1}{\widehat{\sigma}\sqrt{n}}\sum_{i=1}^{ \lfloor nt \rfloor} \widehat{\varepsilon}_i $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# OLS-CUSUM empirical fluctuation process of cumulated residuals.
cusum <- efp(dist ~ speed, type = "OLS-CUSUM",data=cars)
plot(cusum,ylim=c(-2,2))
```
----
## Testing Linear Assumptions
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(cusum, alpha = 0.05, alt.boundary = TRUE,ylim=c(-2,2))
```
see CUSUM test
----
## Model with Categorical Covariates
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Piecewise-constant regression: speed binned into intervals via cut().
model_cut=lm(dist~ cut(speed, breaks=c(0,10,15,20,25)),data=cars)
y=predict(model_cut, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")
```
----
## Model with Categorical Covariates
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# A regression tree is also piecewise constant in speed.
library(rpart,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tree=rpart(dist~speed,data=cars)
y=predict(tree,newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")
```
----
## Smoothing : Polynomial Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# FIX: stats::poly() has no `df` argument ("df" does not partially match
# "degree"), so poly(speed, df=3) passed 3 into `...` as an extra variable
# instead of requesting a cubic.  Use degree=3 for the intended
# degree-3 orthogonal polynomial.
model_poly=lm(dist~ poly(speed, degree=3),data=cars)
y=predict(model_poly, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Smoothing : Local Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
# Plug-in bandwidth, then local polynomial fits of degree 0
# (local constant), 1 (local linear) and 2 (local quadratic).
bw <- dpill(cars$speed,cars$dist)
lines( locpoly(cars$speed,cars$dist,degree=0, bandwidth=bw), col='red' )
lines( locpoly(cars$speed,cars$dist,degree=1, bandwidth=bw), col='green' )
lines( locpoly(cars$speed,cars$dist,degree=2, bandwidth=bw), col='blue' )
```
----
## Smoothing : $k$-Nearest Neighbors
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(FNN,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
knn=knn.reg(train=cars$speed,y=cars$dist,k=5)
plot(cars)
lines(cars$speed,knn$pred,col="red")
```
----
## Smoothing : $b$ Splines
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_bs <- lm(dist ~ bs(speed), data=cars)
y=predict(model_bs, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Smoothing Linear Model
Tuning parameter selection, Silverman's Rule
$$ b^\star=0.9 \cdot\frac{ \min\lbrace \sigma,[F^{-1}(.75)-F^{-1}(.25)]/1.34\rbrace}{ n^{1/5}} $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# bw.nrd0 implements exactly this rule: 0.9 * min(sd, IQR/1.34) * n^{-1/5}.
bw.nrd0(cars$speed)
```
See kernel regression
----
## Smoothing Linear Model
Tuning parameter selection, Cross Validation
$$ b^\star=\text{argmin}\left\lbrace \sum_{i=1}^n \left(y_i - \widehat{m}_{(i)}(\boldsymbol{x}_i)\right)^2\right\rbrace $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
bw.ucv(cars$speed)
```
----
## Smoothing Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70),
message = FALSE}
library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
# Nadaraya-Watson estimator = local-constant kernel regression.
Nadaraya_Watson =with(cars,ksmooth(speed, dist, "normal",bandwidth=2.75))
plot(cars)
abline(model,col="red",lty=2)
lines(Nadaraya_Watson,lwd=2,col="blue")
```
----
## Smoothing Linear Model
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_loess=loess(dist~ speed, data=cars,degree=1, family="gaussian")
y=predict(model_loess, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
```
----
## Multiple Regression
Life Expectancy (1), Homicide Rate (2), Illiteracy Rate (3)
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
chicago=read.table("http://freakonometrics.free.fr/chicago.txt",header=TRUE,sep=";")
model_c = lm(Fire~X_2+X_3,data=chicago)
# Prediction surface of the fitted plane over a grid of (X_2, X_3).
y=function(x2,x3) predict(model_c,newdata=data.frame(X_2=x2,X_3=x3))
VX2=seq(0,80,length=26); VX3=seq(5,25,length=26)
VY=outer(VX2,VX3,y)
persp(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy",zlab="Fire",theta=20)
```
----
## Multiple Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
VX2=seq(0,80,length=251); VX3=seq(5,25,length=251)
VY=outer(VX2,VX3,y)
image(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy")
contour(VX2,VX3,VY,add=TRUE)
```
----
## Model selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
model_c = lm(Fire~.,data=chicago)
summary(model_c)$r.squared
summary(model_c)$adj.r.squared
```
----
## Model selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
logLik(model_c)
AIC(model_c, k=2) # AIC
# FIX: model_c is fitted on `chicago`, not `cars`; the BIC penalty must
# use log of the actual sample size, log(nrow(chicago)).
AIC(model_c, k=log(nrow(chicago))) # BIC
```
----
## Penalization
$$ \widehat{\boldsymbol{\beta}} =
\text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right]^2 + \color{red}{\lambda} \sum_{j=1}^k \beta_j ^2\right\rbrace $$
with an explicit solution $\widehat{\boldsymbol{\beta}}=(\boldsymbol{X}^{\text{T}}\boldsymbol{X}+\color{red}{\lambda} \mathbb{I})^{-1} \boldsymbol{X}^{\text{T}}\boldsymbol{Y}$.
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(MASS,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_ridge <- lm.ridge(Fire ~ ., data=chicago, lambda=1)
```
see more generally Tikhonov regularization
----
## Optimal Penalization
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Bootstrap estimate of the MSE of the ridge coefficients over a grid of
# penalties kl.  NOTE(review): lm.ridge()$coef returns coefficients on the
# scaled predictors — confirm comparing them with coefficients(model_c)
# is intended.
mse <- NULL
n=100
v <- matrix(c(0,coefficients(model_c)[-1]), nrow=n, ncol=4, byrow=TRUE)
kl <- c(1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.2, 1.4, 1.6, 1.8, 2)
for (k in kl) {
r <- matrix(NA, nrow=n, ncol=4)
for (i in 1:n) {
boot_c <- chicago[sample(1:nrow(chicago),nrow(chicago),replace=TRUE),]
r[i,2:4] <- model_ridge <- lm.ridge(Fire ~ ., data=boot_c, lambda=k)$coef
r[i,1] <- mean(boot_c[,"Fire"])
}
mse <- append(mse, apply( (r-v)^2, 2, mean )[2])
}
```
----
## Optimal Penalization
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot( mse ~ kl, type='l' )
```
----
## Variable Selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
step(lm(Fire ~ .,data=chicago))
```
---
## Variable Selection
LASSO (least absolute shrinkage and selection operator)
$$ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right]^2 + \color{blue}{\lambda} \sum_{j=1}^k \vert\beta_j\vert\right\rbrace $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center",
tidy.opts=list(width.cutoff=70), message = FALSE}
# alpha = 1 is the lasso penalty (alpha = 0 would be ridge).
library(glmnet,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fit = glmnet(x = as.matrix(chicago[,2:4]), y = chicago[,1], family = "gaussian", alpha = 1)
```
----
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(fit, xvar="lambda", label = TRUE )
```
----
## Variable Selection
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
step(model)
```
----
## Dimension Reduction
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Principal component regression: with all 3 components the fitted values
# match the full OLS fit.
model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:3],data=chicago)
predict(model_c)[1:5]
predict(model_acp)[1:5]
```
----
## Dimension Reduction
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Keeping only 2 components changes the predictions (dimension reduction).
model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:2],data=chicago)
predict(model_c)[1:5]
predict(model_acp)[1:5]
```
----
## Logistic Regression
$$ \mathbb{E}[Y\vert \boldsymbol{X}=\boldsymbol{x}]= p(\boldsymbol{x})=\frac{\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}{1+\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}=H(\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}) $$
logistic regression
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
myocarde <-read.table("http://freakonometrics.free.fr/myocarde.csv", head=TRUE,sep=";")
model_bernoulli = glm(PRONO~., data=myocarde, family=binomial)
coefficients(summary(model_bernoulli))
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Hand-rolled IRLS / Newton-Raphson for the logistic MLE: beta is grown
# column by column, starting from the linear-probability OLS fit.
X=cbind(1,as.matrix(myocarde[,1:7]))
Y=myocarde$PRONO=="Survival"
beta=as.matrix(lm(Y~0+X)$coefficients,ncol=1)
for(s in 1:9){
pi=exp(X%*%beta[,s])/(1+exp(X%*%beta[,s]))
gradient=t(X)%*%(Y-pi)
# W = diag(pi(1-pi)); the Hessian of the log-likelihood is -X'WX.
omega=matrix(0,nrow(X),nrow(X));diag(omega)=(pi*(1-pi))
Hessian=-t(X)%*%omega%*%X
# Newton-Raphson step, appended as a new column of beta.
beta=cbind(beta,beta[,s]-solve(Hessian)%*%gradient)}
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
beta[,1]
beta[,9]
```
----
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# -solve(Hessian)
# Asymptotic standard errors from the inverse Fisher information.
sqrt(-diag(solve(Hessian)))
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Predicted survival probability over a (PVENT, REPUL) grid.
GLM <- glm(PRONO ~ PVENT + REPUL, data = myocarde, family = binomial)
pred_GLM = function(p,r){
return(predict(GLM, newdata = data.frame(PVENT=p,REPUL=r), type="response"))}
vp=seq(min(myocarde$PVENT),max(myocarde$PVENT),length=251)
vr=seq(min(myocarde$REPUL),max(myocarde$REPUL),length=251)
vd=outer(vp,vr,pred_GLM)
library(RColorBrewer,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
CL=colorRampPalette(brewer.pal(name="RdYlBu", 11))(100)
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
image(vp,vr,vd,col=CL,xlab="PVENT",ylab="REPUL")
```
----
## Logistic Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
probabilities <- predict(GLM, myocarde, type="response")
# NOTE(review): assumes PRONO was read as a factor; under R >= 4.0
# (stringsAsFactors = FALSE) levels() on a character column is NULL —
# confirm, or wrap PRONO in factor().
predictions <- levels(myocarde$PRONO)[(probabilities>.5)+1]
table(predictions, myocarde$PRONO)
```
----
## Residuals and GLM
Pearson's residuals
$$ \widehat{\varepsilon}_i=\frac{y_i - \widehat{\pi}_i}{ \sqrt{\widehat{\pi}_i\cdot[1-\widehat{\pi}_i]}} $$
Deviance residuals
$$ \widehat{\varepsilon}_i=\pm \sqrt{-2\left( y_i \log[\widehat{\pi}_i]+ (1-y_i) \log[1-\widehat{\pi}_i]\right)} $$
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
E1=residuals(model_bernoulli,type="pearson")
E2=residuals(model_bernoulli,type="deviance")
```
----
## Residuals and GLM
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
plot(1:nrow(myocarde),E1)
```
----
## Multinomial Regression
Binomial case, $Y\in\lbrace 0,1\rbrace$,
$\mathbb{P}[Y=1\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]$
$\mathbb{P}[Y=0\vert \boldsymbol{X}=\boldsymbol{x}]\propto1$
Multinomial case, $Y\in\lbrace A,B,C\rbrace$,
$\mathbb{P}[Y=\color{red}{A}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{red}{\boldsymbol{\beta}_A}]$
$\mathbb{P}[Y=\color{blue}{B}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{blue}{\boldsymbol{\beta}_B}]$
$\mathbb{P}[Y=\color{green}{C}\vert \boldsymbol{X}=\boldsymbol{x}]\propto1$
----
## Multinomial Regression
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
library(VGAM,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_multi = vglm(PRONO~., data=myocarde, family="multinomial")
model_multi
```
----
## Generalized Linear Models
Poisson and Gamma distributions in the exponential family
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Six GLM fits on a toy data set: each distribution family crossed with
# the identity and log links.
base=data.frame(x=c(1,2,3,4,5),y=c(1,2,4,2,6))
regNId <- glm(y~x,family=gaussian(link="identity"),data=base)
regNlog <- glm(y~x,family=gaussian(link="log"),data=base)
regPId <- glm(y~x,family=poisson(link="identity"),data=base)
regPlog <- glm(y~x,family=poisson(link="log"),data=base)
regGId <- glm(y~x,family=Gamma(link="identity"),data=base)
regGlog <- glm(y~x,family=Gamma(link="log"),data=base)
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
# Helper: plot a fitted GLM on the response scale with a +/- 2 s.e. band.
# NOTE: local variable `abs` (abscissa grid) shadows base::abs.
visuel=function(regression,titre=""){
plot(base$x,base$y,pch=19,cex=1.5,main=titre,xlab="",ylab="")
abs
<- seq(0,7,by=.1)
# se.fit=TRUE returns standard errors on the response scale.
yp <- predict(regression,newdata=data.frame(x=abs),se.fit = TRUE,
type="response")
polygon(c(abs,rev(abs)),c(yp$fit+2*yp$se.fit,rev(yp$fit-2*yp$se.fit)),
col="light green",border=NA)
points(base$x,base$y,pch=19,cex=1.5)
lines(abs,yp$fit,lwd=2,col="red")
lines(abs,yp$fit+2*yp$se.fit,lty=2)
lines(abs,yp$fit-2*yp$se.fit,lty=2)}
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regNId,"Gaussienne, lien identité")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regPId,"Poisson, lien identité")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regGId,"Gamma, lien identité")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regNlog,"Gaussienne, lien logarithmique")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regPlog,"Poisson, lien logarithme")
```
----
## Generalized Linear Models
```{r, fig.width = 10, fig.height = 5, fig.align = "center", tidy.opts=list(width.cutoff=70), message = FALSE}
visuel(regGlog,"Gamma, lien logarithme")
```