output: html_document

DSA 2017

Arthur Charpentier, @freakonometrics

R Crash Course

Slides available at http://freakonometrics.free.fr/DSA2017/DSAslides.html

Markdown available at http://freakonometrics.free.fr/DSA2017M/DSAmark.html

Additional information at http://freakonometrics.hypotheses.org

Download RStudio from http://www.rstudio.com

?rq
## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'
library(quantreg,verbose=FALSE,quietly=TRUE)
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
require(quantreg)
?rq
rq
## function (formula, tau = 0.5, data, subset, weights, na.action, 
##     method = "br", model = TRUE, contrasts = NULL, ...) 
## {
##     call <- match.call()
##     mf <- match.call(expand.dots = FALSE)
##     m <- match(c("formula", "data", "subset", "weights", "na.action"), 
##         names(mf), 0)
##     mf <- mf[c(1, m)]
##     mf$drop.unused.levels <- TRUE
##     mf[[1]] <- as.name("model.frame")
##     mf <- eval.parent(mf)
##     if (method == "model.frame") 
##         return(mf)
##     mt <- attr(mf, "terms")
##     weights <- as.vector(model.weights(mf))
##     Y <- model.response(mf)
##     if (method == "sfn") {
##         if (requireNamespace("MatrixModels") && requireNamespace("Matrix")) {
##             X <- MatrixModels::model.Matrix(mt, data, sparse = TRUE)
##             vnames <- dimnames(X)[[2]]
##             X <- as(X, "matrix.csr")
##             mf$x <- X
##         }
##     }
##     else X <- model.matrix(mt, mf, contrasts)
##     eps <- .Machine$double.eps^(2/3)
##     Rho <- function(u, tau) u * (tau - (u < 0))
##     if (length(tau) > 1) {
##         if (any(tau < 0) || any(tau > 1)) 
##             stop("invalid tau:  taus should be >= 0 and <= 1")
##         if (any(tau == 0)) 
##             tau[tau == 0] <- eps
##         if (any(tau == 1)) 
##             tau[tau == 1] <- 1 - eps
##         coef <- matrix(0, ncol(X), length(tau))
##         rho <- rep(0, length(tau))
##         fitted <- resid <- matrix(0, nrow(X), length(tau))
##         for (i in 1:length(tau)) {
##             z <- {
##                 if (length(weights)) 
##                   rq.wfit(X, Y, tau = tau[i], weights, method, 
##                     ...)
##                 else rq.fit(X, Y, tau = tau[i], method, ...)
##             }
##             coef[, i] <- z$coefficients
##             resid[, i] <- z$residuals
##             rho[i] <- sum(Rho(z$residuals, tau[i]))
##             fitted[, i] <- Y - z$residuals
##         }
##         taulabs <- paste("tau=", format(round(tau, 3)))
##         dimnames(coef) <- list(dimnames(X)[[2]], taulabs)
##         dimnames(resid) <- list(dimnames(X)[[1]], taulabs)
##         fit <- z
##         fit$coefficients <- coef
##         fit$residuals <- resid
##         fit$fitted.values <- fitted
##         if (method == "lasso") 
##             class(fit) <- c("lassorqs", "rqs")
##         else if (method == "scad") 
##             class(fit) <- c("scadrqs", "rqs")
##         else class(fit) <- "rqs"
##     }
##     else {
##         process <- (tau < 0 || tau > 1)
##         if (tau == 0) 
##             tau <- eps
##         if (tau == 1) 
##             tau <- 1 - eps
##         fit <- {
##             if (length(weights)) 
##                 rq.wfit(X, Y, tau = tau, weights, method, ...)
##             else rq.fit(X, Y, tau = tau, method, ...)
##         }
##         if (process) 
##             rho <- list(x = fit$sol[1, ], y = fit$sol[3, ])
##         else {
##             if (length(dim(fit$residuals))) 
##                 dimnames(fit$residuals) <- list(dimnames(X)[[1]], 
##                   NULL)
##             rho <- sum(Rho(fit$residuals, tau))
##         }
##         if (method == "lasso") 
##             class(fit) <- c("lassorq", "rq")
##         else if (method == "scad") 
##             class(fit) <- c("scadrq", "rq")
##         else class(fit) <- ifelse(process, "rq.process", "rq")
##     }
##     fit$na.action <- attr(mf, "na.action")
##     fit$formula <- formula
##     fit$terms <- mt
##     fit$xlevels <- .getXlevels(mt, mf)
##     fit$call <- call
##     fit$tau <- tau
##     fit$weights <- weights
##     fit$residuals <- drop(fit$residuals)
##     fit$rho <- rho
##     fit$method <- method
##     fit$fitted.values <- drop(fit$fitted.values)
##     attr(fit, "na.message") <- attr(m, "na.message")
##     if (model) 
##         fit$model <- mf
##     fit
## }
## <environment: namespace:quantreg>
detach(package:quantreg, unload=TRUE)
apropos("norm")
##  [1] "addNormals"             "amlnormal"              "binormal"              
##  [4] "binormalcop"            "cennormal"              "cens.normal"           
##  [7] ".__C__Lnorm"            ".__C__LnormParameter"   ".__C__Norm"            
## [10] ".__C__NormParameter"    ".__C__UniNormParameter" "dbetanorm"             
## [13] "dbinorm"                "dbinormcop"             "denorm"                
## [16] "dfoldnorm"              "dlnorm"                 "dnorm"                 
## [19] "dnorm2"                 "double.cens.normal"     "dposnorm"              
## [22] "dskewnorm"              "foldnormal"             "Lnorm"                 
## [25] "lognormal"              "lqnorm"                 "mix2normal"            
## [28] "mix2normal.control"     "norm"                   "norm"                  
## [31] "Norm"                   "normalize.mesh3d"       "normalizePath"         
## [34] "normal_print"           "normal.vcm"             "pbetanorm"             
## [37] "pbinorm"                "pbinormcop"             "p.dnorm"               
## [40] "penorm"                 "pfoldnorm"              "plnorm"                
## [43] "pnorm"                  "pnorm2"                 "posnormal"             
## [46] "pposnorm"               "qbetanorm"              "qenorm"                
## [49] "qfoldnorm"              "qlnorm"                 "qnorm"                 
## [52] "qposnorm"               "qqnorm"                 "rbetanorm"             
## [55] "rbinorm"                "rbinormcop"             "rec.normal"            
## [58] "rec.normal.control"     "renorm"                 "rfoldnorm"             
## [61] "rlnorm"                 "rnorm"                  "rposnorm"              
## [64] "rskewnorm"              "skewnormal"             ".__T__norm:base"       
## [67] "uninormal"
?pnorm

S3 vs S4

person3 <- function(name,age,weight,height){
  characteristics<-list(name=name,age=age,weight=weight,height=height)
  class(characteristics)<-"person3"
  return(characteristics)}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)
JohnDoe3$age
## [1] 28
BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)}
BMI3(JohnDoe3)
## [1] 22.94409

Class S3

JohnDoe3
## $name
## [1] "John"
## 
## $age
## [1] 28
## 
## $weight
## [1] 76
## 
## $height
## [1] 182
## 
## attr(,"class")
## [1] "person3"
reg3 <- lm(dist~speed,data=cars)
reg3$coefficients
## (Intercept)       speed 
##  -17.579095    3.932409
coef(reg3)
## (Intercept)       speed 
##  -17.579095    3.932409
summary(reg3)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
setClass("person4", representation(name="character",
age="numeric", weight="numeric", height="numeric"),where=.GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)
JohnDoe4
## An object of class "person4"
## Slot "name":
## [1] "John"
## 
## Slot "age":
## [1] 28
## 
## Slot "weight":
## [1] 76
## 
## Slot "height":
## [1] 182

Class S4

JohnDoe4@age
## [1] 28
setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")),
           where=.GlobalEnv)
## [1] "BMI4"
setMethod("BMI4", "person4",where=.GlobalEnv,
          function(object){return(object@weight*1e4/object@height^2)})
## [1] "BMI4"
BMI4(JohnDoe4)
## [1] 22.94409
library(VGAM,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients
## (Intercept)       speed 
##  -17.579095    3.932409
coef(reg4)
## (Intercept)       speed 
##  -17.579095    3.932409
pnorm(3.65102,mean=5,sd=2)
## [1] 0.2499999
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
X <- Norm(mean=5,sd=2)
distr::mean(X)
## [1] 5
p(X)(3.65102)
## [1] 0.2499999

ditribution class

qnorm(0.25,mean=5,sd=2)
## [1] 3.65102
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
distr::q(X)
## function (p, lower.tail = TRUE, log.p = FALSE) 
## {
##     qnorm(p, mean = 5, sd = 2, lower.tail = lower.tail, log.p = log.p)
## }
## <environment: 0x96ac9d0>
distr::q(X)(0.25)
## [1] 3.65102
X1 <- Norm(mean=5,sd=2)
X2 <- Norm(mean=2,sd=1)
S <- X1+X2
distr::q(S)(.5)
## [1] 7
plot(S)

R objects : Numbers

set.seed(1)
U <- runif(20)
U[1:4]
## [1] 0.2655087 0.3721239 0.5728534 0.9082078
U[1] == 0.2655087 
## [1] FALSE
options(digits = 3)
U[1:4]
## [1] 0.266 0.372 0.573 0.908
options(digits = 22)
U[1:4]
## [1] 0.2655086631420999765396 0.3721238996367901563644 0.5728533633518964052200
## [4] 0.9082077899947762489319
options(digits = 3)
x <- exp(1)
y <- x
x <- 2
y
## [1] 2.72
class(x)
## [1] "numeric"
0/0
## [1] NaN
1/0
## [1] Inf
.Machine$double.xmax
## [1] 1.8e+308
2e+308<Inf
## [1] FALSE
(3/10-1/10)
## [1] 0.2
(3/10-1/10)==(7/10-5/10)
## [1] FALSE
(3/10-1/10)-(7/10-5/10)
## [1] 2.78e-17
all.equal((3/10-1/10),(7/10-5/10))
## [1] TRUE
sqrt(2)^2 == 2
## [1] FALSE
.Machine$double.eps
## [1] 2.22e-16
.Machine$double.xmin
## [1] 2.23e-308
x <- rnorm(8)
names(x) <- letters[1:8]
x
##       a       b       c       d       e       f       g       h 
##  1.5118  0.3898 -0.6212 -2.2147  1.1249 -0.0449 -0.0162  0.9438
x[2:4]
##      b      c      d 
##  0.390 -0.621 -2.215
x[c("b","c","d")]
##      b      c      d 
##  0.390 -0.621 -2.215
min(x[x>0])
## [1] 0.39
x[-(3:8)]
##    a    b 
## 1.51 0.39
# x[-3:8]
x[c(TRUE,FALSE)]
##       a       c       e       g 
##  1.5118 -0.6212  1.1249 -0.0162
x[c(TRUE,NA,FALSE,TRUE)]
##      a   <NA>      d      e   <NA>      h 
##  1.512     NA -2.215  1.125     NA  0.944
x[2]
##    b 
## 0.39
x[[2]]
## [1] 0.39
for(i in 1:2){
  nom_var <- paste0("x", i)
  assign(nom_var, rpois(5,7))
}
x1
## [1] 9 4 8 6 9
x2
## [1] 8 9 7 7 9
x <- runif(4)
x
## [1] 0.0233 0.4772 0.7323 0.6927
x
## [1] 0.0233 0.4772 0.7323 0.6927
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x %<a-% runif(1)
x
## [1] 0.478
x
## [1] 0.861
x <- 1:2
x
## [1] 0.245
rm("x")
x
## Error in eval(expr, envir, enclos): objet 'x' introuvable

R objects : Integers

(x_num=c(1,6,10))
## [1]  1  6 10
(x_int=c(1L,6L,10L))
## [1]  1  6 10
object.size(x_num)
## 72 bytes
object.size(x_int)
## 56 bytes
typeof(x_num)
## [1] "double"
typeof(x_int)
## [1] "integer"
is.integer(x_num)
## [1] FALSE
is.integer(x_int)
## [1] TRUE
str(x_num)
##  num [1:3] 1 6 10
str(x_int)
##  int [1:3] 1 6 10
c(1,c(2,c(3,c(4,5))))
## [1] 1 2 3 4 5

R objects : Matrices

M<-1:24
dim(M)=c(6,4)
M
##      [,1] [,2] [,3] [,4]
## [1,]    1    7   13   19
## [2,]    2    8   14   20
## [3,]    3    9   15   21
## [4,]    4   10   16   22
## [5,]    5   11   17   23
## [6,]    6   12   18   24
str(M)
##  int [1:6, 1:4] 1 2 3 4 5 6 7 8 9 10 ...
colnames(M)=letters[1:4]
rownames(M)=LETTERS[10:15]
M
##   a  b  c  d
## J 1  7 13 19
## K 2  8 14 20
## L 3  9 15 21
## M 4 10 16 22
## N 5 11 17 23
## O 6 12 18 24
M["K",]
##  a  b  c  d 
##  2  8 14 20
M[c("K","N"),]
##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23
M[c(2,5),]
##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23
M[c("K",5),]
## Error in M[c("K", 5), ]: indice hors limites
M[c(1,6),c(1,4)]
##   a  d
## J 1 19
## O 6 24
diag(M)
## [1]  1  8 15 22
t(M)%*%M
##     a    b    c    d
## a  91  217  343  469
## b 217  559  901 1243
## c 343  901 1459 2017
## d 469 1243 2017 2791

On Windows machines

rm(list=ls())
memory.size()
## Warning: 'memory.size()' is Windows-specific
## [1] Inf
memory.limit()
## Warning: 'memory.limit()' is Windows-specific
## [1] Inf
memory.limit(size=8000)
## Warning: 'memory.limit()' is Windows-specific
## [1] Inf
memory.limit(size=4000)
## Warning: 'memory.limit()' is Windows-specific
## [1] Inf
x3 <- 1:1e3
x4 <- 1:1e4
x5 <- 1:1e5
x6 <- 1:1e6
object.size(x3)
## 4040 bytes
object.size(x4)
## 40040 bytes
object.size(x6)
## 4000040 bytes
z1 <- matrix(0,1e+06,3) 
z2 <- matrix(as.integer(0),1e+06,3) 
object.size(z1)
## 24000200 bytes
print(object.size(z1),units="Mb")
## 22.9 Mb
object.size(z2)
## 12000200 bytes
print(object.size(z2),units="Mb")
## 11.4 Mb
mydat_r <- matrix(rnorm(3*5e6,0,2),5e6,3)
colnames(mydat_r) <- c("first","second","third")
mydat_rdf <- as.data.frame(mydat_r)
print(object.size(mydat_rdf ),units="Mb")
## 114.4 Mb
library(biglm)
## Loading required package: DBI
z1 <- matrix(as.integer(5), 300, 400)
object.size(z1)
## 480200 bytes
library(bigmemory)
## Loading required package: bigmemory.sri
z2 <- big.matrix(300, 400, type="integer", init=5)
## Error in big.matrix(300, 400, type = "integer", init = 5): Error: memory could not be allocated for instance of type big.matrix
object.size(z2)
## 12000200 bytes
rm(list=ls())

R objects : Factors

x <- c("A", "A", "B", "B", "C")
x
## [1] "A" "A" "B" "B" "C"
factor(x)
## [1] A A B B C
## Levels: A B C
relevel(factor(x),"B")
## [1] A A B B C
## Levels: B A C
model.matrix(~0+x)
##   xA xB xC
## 1  1  0  0
## 2  1  0  0
## 3  0  1  0
## 4  0  1  0
## 5  0  0  1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$x
## [1] "contr.treatment"
n <- 10; nn <- 100
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
tapply(x,group,mean)
##    0    1    2    3    4    5    6    7    8    9   10 
## 1.08 1.35 1.73 2.11 2.28 2.50 2.79 2.69 3.04 3.13 3.23
aggregate(x,by=list(group),mean)
##    Group.1    x
## 1        0 1.08
## 2        1 1.35
## 3        2 1.73
## 4        3 2.11
## 5        4 2.28
## 6        5 2.50
## 7        6 2.79
## 8        7 2.69
## 9        8 3.04
## 10       9 3.13
## 11      10 3.23
xg <- split(x, group)
sapply(xg, length)
##   0   1   2   3   4   5   6   7   8   9  10 
##  54 101  93 105 103 101  99 100  93 107  44
sapply(xg, mean)
##    0    1    2    3    4    5    6    7    8    9   10 
## 1.08 1.35 1.73 2.11 2.28 2.50 2.79 2.69 3.04 3.13 3.23
library(pbapply,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
n <- 10; nn <- 1e6
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
xg <- split(x, group)
pbsapply(xg, mean)
##    0    1    2    3    4    5    6    7    8    9   10 
## 1.00 1.41 1.73 2.00 2.24 2.45 2.65 2.83 3.00 3.16 3.32
x <- factor(c("b","a","b"))
levels(x)
## [1] "a" "b"
x[3]="c"
## Warning in `[<-.factor`(`*tmp*`, 3, value = "c"): invalid factor level, NA
## generated
x
## [1] b    a    <NA>
## Levels: a b
factor(x,levels=c("b","a"))
## [1] b    a    <NA>
## Levels: b a
x[1]
## [1] b
## Levels: a b
x[1,drop=TRUE]
## [1] b
## Levels: b
U <- runif(20)
cut(U,breaks=2)
##  [1] (0.515,0.966]  (0.515,0.966]  (0.0631,0.515] (0.0631,0.515] (0.515,0.966] 
##  [6] (0.0631,0.515] (0.0631,0.515] (0.0631,0.515] (0.515,0.966]  (0.515,0.966] 
## [11] (0.0631,0.515] (0.515,0.966]  (0.0631,0.515] (0.515,0.966]  (0.0631,0.515]
## [16] (0.515,0.966]  (0.515,0.966]  (0.515,0.966]  (0.0631,0.515] (0.0631,0.515]
## Levels: (0.0631,0.515] (0.515,0.966]
cut(U,breaks=2,labels=c("small","large"))
##  [1] large large small small large small small small large large small large
## [13] small large small large large large small small
## Levels: small large
cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large"))
##  [1] large  large  small  medium large  medium small  small  medium medium
## [11] medium medium small  large  medium medium medium medium small  small 
## Levels: small medium large
table(cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large")))
## 
##  small medium  large 
##      6     10      4

R objects : Strings and Characters

"Be carefull of 'quotes'"
'Be carefull of "quotes"'
cities <- c("New York, NY", "Los Angeles, CA", "Boston, MA")
substr(cities, nchar(cities)-1, nchar(cities))
## [1] "NY" "CA" "MA"
unlist(strsplit(cities, ", "))[seq(2,6,by=2)]
## [1] "NY" "CA" "MA"
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tweet = "Emerging #climate change, environment and sustainability concerns for #actuaries Apr 9 #Toronto. Register TODAY http://bit.ly/CIAClimateForum"
hash="#[a-zA-Z]{1,}"
str_extract(tweet,hash)
## [1] "#climate"
str_extract_all(tweet,hash)
## [[1]]
## [1] "#climate"   "#actuaries" "#Toronto"
str_locate_all(tweet,hash)
## [[1]]
##      start end
## [1,]    10  17
## [2,]    71  80
## [3,]    88  95
urls="http://([\\da-z\\.-]+)\\.([a-z\\.]{2,6})/[a-zA-Z0-9]{1,}"
str_extract_all(tweet,urls)
## [[1]]
## [1] "http://bit.ly/CIAClimateForum"
email="^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$"
grep(pattern = email, x = "charpentier.arthur@uqam.ca")
## [1] 1
grepl(pattern = email, x = "charpentier.arthur@uqam.ca")
## [1] TRUE
str_detect(pattern = email, string=c("charpentier.arthur@uqam.ca","@freakonometrics"))
## [1]  TRUE FALSE
ex_sentence = "This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult"
library(tm,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ex_corpus <- Corpus(VectorSource(ex_sentence))
ex_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 101
grep("hard", ex_sentence)
## integer(0)
grep("difficult", ex_sentence)
## [1] 1
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
word(ex_sentence,4)
## [1] "simple"
word(ex_sentence,1:20)
##  [1] "This"      "is"        "1"         "simple"    "sentence," "just"     
##  [7] "to"        "play"      "with,"     "then"      "we'll"     "play"     
## [13] "with"      "4,"        "and"       "that"      "will"      "be"       
## [19] "more"      "difficult"
ex_words <- strsplit(ex_sentence, split=" ")[[1]]
ex_words
##  [1] "This"      "is"        "1"         "simple"    "sentence," "just"     
##  [7] "to"        "play"      "with,"     "then"      "we'll"     "play"     
## [13] "with"      "4,"        "and"       "that"      "will"      "be"       
## [19] "more"      "difficult"
grep(pattern="w",ex_words,value=TRUE)
## [1] "with," "we'll" "with"  "will"
str_count(ex_words,"w")
##  [1] 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0
grep(pattern="[ai]",ex_words,value=TRUE)
##  [1] "This"      "is"        "simple"    "play"      "with,"     "play"     
##  [7] "with"      "and"       "that"      "will"      "difficult"
grep(pattern="[[:punct:]]",ex_words,value=TRUE)
## [1] "sentence," "with,"     "we'll"     "4,"
require(wordcloud)
wordcloud(ex_corpus)

cols<-colorRampPalette(c("lightgrey","blue"))(40)
wordcloud(words = ex_corpus, max.words = 40, random.order=FALSE, scale = c(5, 0.5), colors=cols)

tdm <- TermDocumentMatrix(ex_corpus)
tdm
## <<TermDocumentMatrix (terms: 14, documents: 1)>>
## Non-/sparse entries: 14/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 14, documents: 1)>>
## Non-/sparse entries: 14/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##            Docs
## Terms       1
##   and       1
##   difficult 1
##   just      1
##   more      1
##   play      2
##   sentence, 1
##   simple    1
##   that      1
##   then      1
##   this      1
##   we'll     1
##   will      1
##   with      1
##   with,     1
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 101
fix_contractions <- function(doc) {
  doc <- gsub("won't", "will not", doc)
  doc <- gsub("n't", " not", doc)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'s", "", doc) 
  return(doc)
}
ex_corpus <- tm_map(ex_corpus, fix_contractions)
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] This is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult
ex_corpus <- tm_map(ex_corpus, tolower)
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult
ex_corpus <- tm_map(ex_corpus, removeNumbers)
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence, just to play with, then we will play with , and that will be more difficult
ex_corpus <- tm_map(ex_corpus,gsub, pattern= "[[:punct:]]", replacement = " ")
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult
stopwords("en")[sample(1:length(stopwords("en")),size=10)]
##  [1] "that's"    "as"        "too"       "shouldn't" "doesn't"   "does"     
##  [7] "few"       "what's"    "yourself"  "we're"
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult
library(SnowballC,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ex_corpus <- tm_map(ex_corpus, stemDocument)
inspect(ex_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult
wordcloud(ex_corpus[[1]])

wordcloud(words = ex_corpus[[1]], max.words = 40, random.order=FALSE, scale = c(5, 0.5),colors=cols)

R objects : Dates

some.dates <- as.Date(c("16/10/12","19/11/12"),format="%d/%m/%y")
diff(some.dates)
## Time difference of 34 days
sequence.date <- seq(from=some.dates[1],to=some.dates[2],by=7)
format(sequence.date,"%b")
## [1] "oct." "oct." "oct." "nov." "nov."
months(sequence.date)
## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"
weekdays(some.dates)
## [1] "mardi" "lundi"
substr(as.POSIXct(some.dates), 1, 4)
## [1] "2012" "2012"
strftime(some.dates,"%Y")
## [1] "2012" "2012"
Sys.setlocale("LC_TIME", "fr_FR")
## Warning in Sys.setlocale("LC_TIME", "fr_FR"): la requête OS pour spécifier la
## localisation à "fr_FR" n'a pas pu être honorée
## [1] ""
months(sequence.date)
## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"

Dates with R

R objects : Lists

x <- list(1:5,c(1,2,3,4,5),a="test",
b=c(TRUE,FALSE),rpois(5,8))
x
## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[5]]
## [1] 7 7 8 7 8
str(x)
## List of 5
##  $  : int [1:5] 1 2 3 4 5
##  $  : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 7 7 8 7 8
x <- list(
  list(1:5,c(1,2,3,4,5)),a="test",
  b=c(TRUE,FALSE),rpois(5,8))
x
## [[1]]
## [[1]][[1]]
## [1] 1 2 3 4 5
## 
## [[1]][[2]]
## [1] 1 2 3 4 5
## 
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[4]]
## [1] 8 9 6 5 3
names(x)
## [1] ""  "a" "b" ""
str(x)
## List of 4
##  $  :List of 2
##   ..$ : int [1:5] 1 2 3 4 5
##   ..$ : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 8 9 6 5 3
is.list(x)
## [1] TRUE
is.recursive(x)
## [1] TRUE
x[[3]]
## [1]  TRUE FALSE
x[["b"]]
## [1]  TRUE FALSE
x[[1]]
## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5
x[[1]][[1]]
## [1] 1 2 3 4 5
f <- function(x) { return(x*(1-x)) }
optim.f <- optimize(f, interval=c(0, 1), maximum=TRUE)
str(optim.f)
## List of 2
##  $ maximum  : num 0.5
##  $ objective: num 0.25
list(abc=1)$a
## [1] 1
list(abc=1,a=2)$a
## [1] 2
list(bac=1)$a
## NULL
list(abc=1,b=2)$a
## [1] 1

R objects : Functions

factorial
## function (x) 
## gamma(x + 1)
## <bytecode: 0x1623d1e0>
## <environment: namespace:base>
gamma
## function (x)  .Primitive("gamma")
?gamma

If Condition

set.seed(1)
u <- runif(1)
if(u>.5) {("greater than 50%")} else {("smaller than 50%")}
## [1] "smaller than 50%"
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
## [1] "smaller than 50%"
u
## [1] 0.266
u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}
## Warning in if (u > 0.5) {: la condition a une longueur > 1 et seul le premier
## élément est utilisé
## [1] "smaller than 50%"
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
## [1] "smaller than 50%" "greater than 50%" "greater than 50%"
u                
## [1] 0.372 0.573 0.908

Loops

v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))

sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))
##    user  system elapsed 
##  14.200   0.068  14.270
sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])

sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))
##    user  system elapsed 
##   0.132   0.000   0.132
system.time(Vectorize(sqrt)(v_x))
##    user  system elapsed 
##   0.000   0.000   0.002
sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))
##    user  system elapsed 
##   0.036   0.000   0.038
library(parallel,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
(allcores <- detectCores(all.tests=TRUE))
## [1] 4
sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))
##    user  system elapsed 
##   0.140   0.132   0.101
f<-function(x) dlnorm(x)
integrate(f,0,Inf)
## 1 with absolute error < 2.5e-07
integrate(f,0,1e5)
## 1.82e-05 with absolute error < 3.6e-05
integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value
## [1] 1

Apply

see data camp on apply

rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)
set.seed(1)
rN.Poisson(5)
## [1] 4 4 5 8 3
rX.Exponential(3)
## [1] 0.229 0.129 1.447
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for(i in 1:n){
    N <- rN(1)
    if(N>0){V[i] <- sum(rX(N))}
  }
  return(V)}
rcpd1(3)
## [1] 1.52 3.88 0.79
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for (i in 1:n) V[i] <- sum(rX(rN(1)))
  return(V)}
rcpd1(3)
## [1] 0.893 3.368 3.023
rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  return(as.numeric(xtabs(X ~ I)))}
rcpd2(3)
## [1] 2.29 2.06 5.46
rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  V <- tapply(X,I,sum)
  V[is.na(V)] <- 0
  return(as.numeric(V))}
rcpd3(3)
## [1] 2.22 6.03 4.33
rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(rN(n), function(x) sum(rX(x))))}
rcpd4(3)
## [1] 2.83 4.09 2.99
rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(Vectorize(rX)(rN(n)),sum))}
rcpd5(3)
## [1] 1.81 3.69 3.60
rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(unlist(lapply(lapply(t(rN(n)),rX),sum)))}
rcpd6(3)
## [1] 0.291 1.344 2.948
n <- 100
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
options(digits=1)
microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n))
## Unit: microseconds
##      expr  min   lq mean median   uq   max neval cld
##  rcpd1(n)  476  545  647    657  725  1020   100  b 
##  rcpd2(n) 1018 1169 1287   1235 1351  2195   100   c
##  rcpd3(n)  485  540  609    589  633  1407   100  b 
##  rcpd4(n)  313  369  421    407  443  1518   100 a  
##  rcpd5(n)  387  468  633    540  585 10251   100  b 
##  rcpd6(n)  309  363  411    406  449   600   100 a
options(digits=5)

Vectorization

f <- function(x,m=0,s=1){
  H<-function(t) 1-pnorm(t,m,s)
  integral<-integrate(H,lower=x,upper=Inf)$value
  res<-H(x)/integral
  return(res)
}
f(0)
## [1] 1.2533
f(0:1) 
## Warning in if (is.finite(lower)) {: la condition a une longueur > 1 et seul le
## premier élément est utilisé
## [1] 1.25331 0.39769
y <- rep(NA,2)
x <- 0:1
for(i in 1:2) y[i] <- f(x[i])
y
## [1] 1.2533 1.9043
sapply(x,"f")
## [1] 1.2533 1.9043
Vectorize(f)(x)
## [1] 1.2533 1.9043
binorm <- function(x1,x2,r=0){
  exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2))
}
u <- seq(-2,2)
binorm(u,u)
## [1] 0.002915 0.058550 0.159155 0.058550 0.002915
outer(u,u,binorm)
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.002915 0.013064 0.021539 0.013064 0.002915
## [2,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [3,] 0.021539 0.096532 0.159155 0.096532 0.021539
## [4,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [5,] 0.002915 0.013064 0.021539 0.013064 0.002915
(uv<-expand.grid(u,u))
##    Var1 Var2
## 1    -2   -2
## 2    -1   -2
## 3     0   -2
## 4     1   -2
## 5     2   -2
## 6    -2   -1
## 7    -1   -1
## 8     0   -1
## 9     1   -1
## 10    2   -1
## 11   -2    0
## 12   -1    0
## 13    0    0
## 14    1    0
## 15    2    0
## 16   -2    1
## 17   -1    1
## 18    0    1
## 19    1    1
## 20    2    1
## 21   -2    2
## 22   -1    2
## 23    0    2
## 24    1    2
## 25    2    2
matrix(binorm(uv$Var1,uv$Var2),5,5)
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.002915 0.013064 0.021539 0.013064 0.002915
## [2,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [3,] 0.021539 0.096532 0.159155 0.096532 0.021539
## [4,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [5,] 0.002915 0.013064 0.021539 0.013064 0.002915
u <- seq(-2,2,length=31)
persp(u,u,outer(u,u,binorm),theta=25)


library(rgl)
library(rglwidget)
## The functions in the rglwidget package have been moved to rgl.
knit_hooks$set(webgl = hook_webgl)
persp3d(u,u,outer(u,u,binorm),zlab="",col="green")
## Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
## font family "sans" not found, using "bitmap"
rgl::rglwidget()

Computational Efficiency

library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
n <- 1000
A<-matrix(seq(1,n^2),n,n)
B<-matrix(seq(1,n^2),n,n)
C<-1:n
microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1)
## Unit: nanoseconds
##             expr       min        lq       mean     median         uq
##  (A %*% B) %*% C 865485989 878007768 9.0447e+08 8.8428e+08 8.9980e+08
##  A %*% (B %*% C)   9174756   9265742 1.0243e+07 9.3492e+06 1.0948e+07
##     replications         5        14 2.8926e+02 2.9550e+02 4.5150e+02
##         max neval cld
##  1142594904   100   b
##    18312757   100  a 
##        1530   100  a
m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
group <- rep(c("A","B"), each = n / 2)

system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat)
##    user  system elapsed 
##   0.952   0.000   0.951
system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat)
##    user  system elapsed 
##   0.196   0.000   0.193
t.test_1 <- function(x, grp){
  t.test(x[ group == "A"], x[ group == "B"])$stat
}
system.time(t1 <- apply(X, 1, t.test_1, grp = grp))
##    user  system elapsed 
##   0.204   0.000   0.206
t.test_2 <- function(x, grp) {
  t_stat <- function(x) {
    m <- mean(x)
    n <- length(x)
    var <- sum((x - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(x[grp == "A"])
  g2 <- t_stat(x[grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time(apply(X, 1, t.test_2, grp = group))
##    user  system elapsed 
##   0.040   0.000   0.041
t.test_3 <- function(X, grp){
  t_stat <- function(X) {
    m <- rowMeans(X)
    n <- ncol(X)
    var <- rowSums((X - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(X[, grp == "A"])
  g2 <- t_stat(X[, grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time( t.test_3(X, group))
##    user  system elapsed 
##   0.000   0.000   0.002
"%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s
100 %pm% 10
## [1]  83.551 116.449
'second<-' <- function(x, value) {
  x[2] <- value
  return(x)
}
x <- 1:10
second(x) <- 5
x
##  [1]  1  5  3  4  5  6  7  8  9 10
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x <- 1:10
address(x)
## [1] "0x172b0938"
second(x) <- 6
address(x)
## [1] "0x1b41c868"
ls(globalenv())
##  [1] "A"                "allcores"         "B"                "binorm"          
##  [5] "C"                "cities"           "cols"             "email"           
##  [9] "ex_corpus"        "ex_sentence"      "ex_words"         "f"               
## [13] "fix_contractions" "group"            "hash"             "i"               
## [17] "m"                "n"                "nn"               "optim.f"         
## [21] "%pm%"             "rcpd1"            "rcpd2"            "rcpd3"           
## [25] "rcpd4"            "rcpd5"            "rcpd6"            "rN.Poisson"      
## [29] "rX.Exponential"   "second<-"         "sequence.date"    "some.dates"      
## [33] "sqrt_x"           "t1"               "tdm"              "t.test_1"        
## [37] "t.test_2"         "t.test_3"         "tweet"            "u"               
## [41] "U"                "urls"             "uv"               "v_x"             
## [45] "x"                "X"                "xg"               "y"
utils::find("x")
## [1] ".GlobalEnv"
utils::find("pi")
## [1] "package:base"
environment(sd)
## <environment: 0x51fb5c0>
f <- function(x) x^2
f
## function(x) x^2
formals(f)
## $x
body(f)
## x^2
e <- new.env()
e$d <- 1
e$f <- 1:5
e$g <- 1:5
ls(e)
## [1] "d" "f" "g"
str(e)
## <environment: 0xe6e4da0>
identical(globalenv(),e)
## [1] FALSE
search()
##  [1] ".GlobalEnv"                 "package:rglwidget"         
##  [3] "package:microbenchmark"     "package:parallel"          
##  [5] "package:SnowballC"          "package:stringr"           
##  [7] "package:pbapply"            "package:bigmemory"         
##  [9] "package:bigmemory.sri"      "package:biglm"             
## [11] "package:DBI"                "package:SparseM"           
## [13] "package:rgl"                "package:osmar"             
## [15] "package:geosphere"          "package:sp"                
## [17] "package:RCurl"              "package:bitops"            
## [19] "package:XML"                "package:data.table"        
## [21] "package:VGAM"               "package:splines"           
## [23] "package:stats4"             "package:tm"                
## [25] "package:NLP"                "package:wordcloud"         
## [27] "package:RColorBrewer"       "package:plyr"              
## [29] "package:pryr"               "package:distrEx"           
## [31] "package:distr"              "package:SweaveListingUtils"
## [33] "package:sfsmisc"            "package:startupmsg"        
## [35] "package:leaflet"            "package:dplyr"             
## [37] "package:htmlwidgets"        "package:knitr"             
## [39] "package:stats"              "package:graphics"          
## [41] "package:grDevices"          "package:utils"             
## [43] "package:datasets"           "package:methods"           
## [45] "Autoloads"                  "package:base"
f1 <- function(x1){
  f2 <- function(x2){
    f3 <- function(x3){
      x1+x2+x3
    }
    f3(3)
  } 
  f2(2)
}
f1(1)
## [1] 6
f <- function(x) log(x)
f("x")
## Error in log(x): argument non numérique pour une fonction mathématique
try(f("x"))
inherits(try(f("x")), "try-error")
## [1] TRUE
inherits(try(f("x"), silent=TRUE), "try-error")
## [1] TRUE
a=0
try(a<-f("x"), silent=TRUE)
a
## [1] 0
try(a<-f(x), silent=TRUE)
a
##  [1] 0.0000 1.7918 1.0986 1.3863 1.6094 1.7918 1.9459 2.0794 2.1972 2.3026
x=1:10
g=function(f) f(x)
g(mean)
## [1] 5.5
fibonacci <- function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
}
fibonacci(20)
## [1] 10946
system.time(fibonacci(30))
##    user  system elapsed 
##   1.964   0.000   1.965
library(memoise,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fibonacci <- memoise(function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
})
system.time(fibonacci(30))
##    user  system elapsed 
##   0.008   0.000   0.006
set.seed(1)
x<-rexp(6)
sum(x)
## [1] 5.5534
.Primitive("sum")(x)
## [1] 5.5534
library(Rcpp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
cppFunction('double sum_C(NumericVector x) {
            int n = x.size();
            double total = 0;
            for(int i = 0; i < n; ++i) {
            total += x[i];
            }
            return total;
            }')
sum_C(x)
## [1] 5.5534

Rcpp package.

library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x <- runif(1e6)
microbenchmark(
  sum(x),
  .Primitive("sum")(x),
  sum_C(x))
## Unit: milliseconds
##                  expr    min     lq   mean median     uq    max neval cld
##                sum(x) 1.1608 1.1717 1.1851 1.1817 1.1933 1.2794   100  a 
##  .Primitive("sum")(x) 1.1670 1.1777 1.2251 1.1875 1.2081 2.0554   100   b
##              sum_C(x) 1.1607 1.1675 1.1844 1.1730 1.1959 1.3061   100  a
names_list <- function(...){
  names(list(...))
}
names_list(a=5,b=7)
## [1] "a" "b"
power <- function(exponent) {
  function(x) {
    x ^ exponent
  }
}
square <- power(2)
square(4)
## [1] 16
cube <- power(3)
cube(4)
## [1] 64
cube
## function(x) {
##     x ^ exponent
##   }
## <environment: 0x14555b00>
total <- 20
pb <- txtProgressBar(min = 0, max = total, style = 3)
## 
  |                                                                            
  |                                                                      |   0%
for(i in 1:(10*total)){
  Sys.sleep(0.01)
  if(i %%10 == 0) setTxtProgressBar(pb, i%/%10)
}
## 
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |======================================================================| 100%
close(pb)

R objects : Data Frames

df <- data.frame(x=1:3,y=letters[1:3])
str(df)
## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: Factor w/ 3 levels "a","b","c": 1 2 3
typeof(df)
## [1] "list"
class(df)
## [1] "data.frame"
cbind(df,z=9:7)
##   x y z
## 1 1 a 9
## 2 2 b 8
## 3 3 c 7
df$z<-5:3
df
##   x y z
## 1 1 a 5
## 2 2 b 4
## 3 3 c 3
df[1]
##   x
## 1 1
## 2 2
## 3 3
df[,1,drop=FALSE]
##   x
## 1 1
## 2 2
## 3 3
df[,1,drop=TRUE]
## [1] 1 2 3
df[[1]]
## [1] 1 2 3
df$x
## [1] 1 2 3
df[,"x"]
## [1] 1 2 3
df[["x"]]
## [1] 1 2 3
set.seed(1)
df[sample(nrow(df)),]
##   x y z
## 1 1 a 5
## 3 3 c 3
## 2 2 b 4
set.seed(1)
df[sample(nrow(df),nrow(df)*2,replace=TRUE),]
##     x y z
## 1   1 a 5
## 2   2 b 4
## 2.1 2 b 4
## 3   3 c 3
## 1.1 1 a 5
## 3.1 3 c 3
subset(df, x>1)
##   x y z
## 2 2 b 4
## 3 3 c 3
set.seed(123)
df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50,
replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE)))
tail(df)
##            Y X1 X2
## 45  1.207962  D  3
## 46 -1.123109  A  2
## 47 -0.402885  A  2
## 48 -0.466655  A  2
## 49  0.779965  A  1
## 50 -0.083369  C  2
reg <- lm(Y~X1+X2,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1B X1C X1D X22 X23
## 45           1   0   0   1   0   1
## 46           1   0   0   0   1   0
## 47           1   0   0   0   1   0
## 48           1   0   0   0   1   0
## 49           1   0   0   0   0   0
## 50           1   0   1   0   1   0
reg <- lm(Y~(X1=="A")+X2,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1 == "A"TRUE X22 X23
## 45           1             0   0   1
## 46           1             1   1   0
## 47           1             1   1   0
## 48           1             1   1   0
## 49           1             1   0   0
## 50           1             0   1   0
reg <- lm(Y~X1*X2,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1B X1C X1D X22 X23 X1B:X22 X1C:X22 X1D:X22 X1B:X23 X1C:X23
## 45           1   0   0   1   0   1       0       0       0       0       0
## 46           1   0   0   0   1   0       0       0       0       0       0
## 47           1   0   0   0   1   0       0       0       0       0       0
## 48           1   0   0   0   1   0       0       0       0       0       0
## 49           1   0   0   0   0   0       0       0       0       0       0
## 50           1   0   1   0   1   0       0       1       0       0       0
##    X1D:X23
## 45       1
## 46       0
## 47       0
## 48       0
## 49       0
## 50       0
reg <- lm(Y~X1+X2%in%X1,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1B X1C X1D X1A:X22 X1B:X22 X1C:X22 X1D:X22 X1A:X23 X1B:X23
## 45           1   0   0   1       0       0       0       0       0       0
## 46           1   0   0   0       1       0       0       0       0       0
## 47           1   0   0   0       1       0       0       0       0       0
## 48           1   0   0   0       1       0       0       0       0       0
## 49           1   0   0   0       0       0       0       0       0       0
## 50           1   0   1   0       0       0       1       0       0       0
##    X1C:X23 X1D:X23
## 45       0       1
## 46       0       0
## 47       0       0
## 48       0       0
## 49       0       0
## 50       0       0
download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData")
load("superheroes.RData")
superheroes
##       name alignment gender         publisher
## 1  Magneto       bad   male            Marvel
## 2    Storm      good female            Marvel
## 3 Mystique       bad female            Marvel
## 4   Batman      good   male                DC
## 5    Joker       bad   male                DC
## 6 Catwoman       bad female                DC
## 7  Hellboy      good   male Dark Horse Comics
publishers
##   publisher yr_founded
## 1        DC       1934
## 2    Marvel       1939
## 3     Image       1992
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
inner_join(superheroes, publishers)
## Joining, by = "publisher"
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
##       name alignment gender publisher yr_founded
## 1  Magneto       bad   male    Marvel       1939
## 2    Storm      good female    Marvel       1939
## 3 Mystique       bad female    Marvel       1939
## 4   Batman      good   male        DC       1934
## 5    Joker       bad   male        DC       1934
## 6 Catwoman       bad female        DC       1934
inner_join(publishers, superheroes)
## Joining, by = "publisher"
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female

dplyr package.

merge(superheroes, publishers, all = TRUE)
##           publisher     name alignment gender yr_founded
## 1 Dark Horse Comics  Hellboy      good   male         NA
## 2                DC   Batman      good   male       1934
## 3                DC    Joker       bad   male       1934
## 4                DC Catwoman       bad female       1934
## 5            Marvel  Magneto       bad   male       1939
## 6            Marvel    Storm      good female       1939
## 7            Marvel Mystique       bad female       1939
## 8             Image     <NA>      <NA>   <NA>       1992
semi_join(superheroes, publishers)
## Joining, by = "publisher"
##       name alignment gender publisher
## 1   Batman      good   male        DC
## 2    Joker       bad   male        DC
## 3 Catwoman       bad female        DC
## 4  Magneto       bad   male    Marvel
## 5    Storm      good female    Marvel
## 6 Mystique       bad female    Marvel
semi_join(publishers, superheroes)
## Joining, by = "publisher"
##   publisher yr_founded
## 1    Marvel       1939
## 2        DC       1934
left_join(superheroes, publishers)
## Joining, by = "publisher"
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factors
## with different levels, coercing to character vector
##       name alignment gender         publisher yr_founded
## 1  Magneto       bad   male            Marvel       1939
## 2    Storm      good female            Marvel       1939
## 3 Mystique       bad female            Marvel       1939
## 4   Batman      good   male                DC       1934
## 5    Joker       bad   male                DC       1934
## 6 Catwoman       bad female                DC       1934
## 7  Hellboy      good   male Dark Horse Comics         NA
left_join(publishers, superheroes)
## Joining, by = "publisher"
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factors
## with different levels, coercing to character vector
##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female
## 7     Image       1992     <NA>      <NA>   <NA>
anti_join(superheroes, publishers)
## Joining, by = "publisher"
##      name alignment gender         publisher
## 1 Hellboy      good   male Dark Horse Comics
anti_join(publishers, superheroes)
## Joining, by = "publisher"
##   publisher yr_founded
## 1     Image       1992
download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt",
              "gapminderDataFiveYear.txt")
gdf <- read.delim("gapminderDataFiveYear.txt")
head(gdf,4)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801    779.45
## 2 Afghanistan 1957  9240934      Asia  30.332    820.85
## 3 Afghanistan 1962 10267083      Asia  31.997    853.10
## 4 Afghanistan 1967 11537966      Asia  34.020    836.20
str(gdf)
## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gtbl <- tbl_df(gdf)
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gdfdt <- data.table(gdf)
gdfdt
##           country year      pop continent lifeExp gdpPercap
##    1: Afghanistan 1952  8425333      Asia  28.801    779.45
##    2: Afghanistan 1957  9240934      Asia  30.332    820.85
##    3: Afghanistan 1962 10267083      Asia  31.997    853.10
##    4: Afghanistan 1967 11537966      Asia  34.020    836.20
##    5: Afghanistan 1972 13079460      Asia  36.088    739.98
##   ---                                                      
## 1700:    Zimbabwe 1987  9216418    Africa  62.351    706.16
## 1701:    Zimbabwe 1992 10704340    Africa  60.377    693.42
## 1702:    Zimbabwe 1997 11404948    Africa  46.809    792.45
## 1703:    Zimbabwe 2002 11926563    Africa  39.989    672.04
## 1704:    Zimbabwe 2007 12311143    Africa  43.487    469.71
subset(gdf, lifeExp < 30)
##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia  28.801    779.45
## 1293      Rwanda 1992 7290203    Africa  23.599    737.07
gdf[lifeExp < 30,]
## Error in `[.data.frame`(gdf, lifeExp < 30, ): objet 'lifeExp' introuvable
gdf[gdf$lifeExp < 30,]
##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia  28.801    779.45
## 1293      Rwanda 1992 7290203    Africa  23.599    737.07
filter(gtbl, lifeExp < 30)
## # A tibble: 2 × 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07
subset(gdfdt, lifeExp < 30)
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
gdfdt[lifeExp < 30,]
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
gdf[gdf$country == "Italy", c("year", "lifeExp")]
##     year lifeExp
## 769 1952  65.940
## 770 1957  67.810
## 771 1962  69.240
## 772 1967  71.060
## 773 1972  72.190
## 774 1977  73.480
## 775 1982  74.980
## 776 1987  76.420
## 777 1992  77.440
## 778 1997  78.820
## 779 2002  80.240
## 780 2007  80.546
gdfdt[country == "Italy",cbind(year, lifeExp)]
##       year lifeExp
##  [1,] 1952  65.940
##  [2,] 1957  67.810
##  [3,] 1962  69.240
##  [4,] 1967  71.060
##  [5,] 1972  72.190
##  [6,] 1977  73.480
##  [7,] 1982  74.980
##  [8,] 1987  76.420
##  [9,] 1992  77.440
## [10,] 1997  78.820
## [11,] 2002  80.240
## [12,] 2007  80.546
setkey(gdfdt,country)
gdfdt["Italy",cbind(year, lifeExp)]
##       year lifeExp
##  [1,] 1952  65.940
##  [2,] 1957  67.810
##  [3,] 1962  69.240
##  [4,] 1967  71.060
##  [5,] 1972  72.190
##  [6,] 1977  73.480
##  [7,] 1982  74.980
##  [8,] 1987  76.420
##  [9,] 1992  77.440
## [10,] 1997  78.820
## [11,] 2002  80.240
## [12,] 2007  80.546

data table presentations

Data frames (and other format)

library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ldf <- tbl_df(gdf)
glimpse(ldf)
## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afgh...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 199...
## $ pop       <dbl> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, As...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 4...
## $ gdpPercap <dbl> 779.45, 820.85, 853.10, 836.20, 739.98, 786.11, 978.01, 8...
ldf[4:5,]
## # A tibble: 2 × 6
##       country  year      pop continent lifeExp gdpPercap
##        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1967 11537966      Asia  34.020    836.20
## 2 Afghanistan  1972 13079460      Asia  36.088    739.98
slice(ldf,4:5)
## # A tibble: 2 × 6
##       country  year      pop continent lifeExp gdpPercap
##        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1967 11537966      Asia  34.020    836.20
## 2 Afghanistan  1972 13079460      Asia  36.088    739.98
filter(ldf,lifeExp<30 | gdpPercap<300)
## # A tibble: 6 × 6
##            country  year      pop continent lifeExp gdpPercap
##             <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1      Afghanistan  1952  8425333      Asia  28.801    779.45
## 2 Congo, Dem. Rep.  2002 55379852    Africa  44.966    241.17
## 3 Congo, Dem. Rep.  2007 64606759    Africa  46.462    277.55
## 4    Guinea-Bissau  1952   580653    Africa  32.500    299.85
## 5          Lesotho  1952   748747    Africa  42.138    298.85
## 6           Rwanda  1992  7290203    Africa  23.599    737.07
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dt <- data.table(gdf)
is.data.frame(dt)
## [1] TRUE
object.size(gdf)
## 72184 bytes
object.size(ldf)
## 72304 bytes
object.size(dt)
## 72880 bytes
df[ldf$lifeExp<30,]
##           Y   X1   X2
## 1  -0.56048    C    3
## NA       NA <NA> <NA>
subset(ldf, lifeExp < 30)
## # A tibble: 2 × 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07
filter(ldf, lifeExp < 30)
## # A tibble: 2 × 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07
subset(dt, lifeExp < 30)
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
dt[lifeExp<30]
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
setkey(dt, country)
dt["Italy"]
##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe  65.940    4931.4
##  2:   Italy 1957 49182000    Europe  67.810    6248.7
##  3:   Italy 1962 50843200    Europe  69.240    8243.6
##  4:   Italy 1967 52667100    Europe  71.060   10022.4
##  5:   Italy 1972 54365564    Europe  72.190   12269.3
##  6:   Italy 1977 56059245    Europe  73.480   14256.0
##  7:   Italy 1982 56535636    Europe  74.980   16537.5
##  8:   Italy 1987 56729703    Europe  76.420   19207.2
##  9:   Italy 1992 56840847    Europe  77.440   22013.6
## 10:   Italy 1997 57479469    Europe  78.820   24675.0
## 11:   Italy 2002 57926999    Europe  80.240   27968.1
## 12:   Italy 2007 58147733    Europe  80.546   28569.7
dt["Italy",mult="first"]
##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1952 47666000    Europe   65.94    4931.4
dt["Italy",max(lifeExp)]
## [1] 80.546
max(ldf$lifeExp[ldf$country=="Italy"])
## [1] 80.546
setkey(dt, country, lifeExp) 
dt["Italy"]
##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe  65.940    4931.4
##  2:   Italy 1957 49182000    Europe  67.810    6248.7
##  3:   Italy 1962 50843200    Europe  69.240    8243.6
##  4:   Italy 1967 52667100    Europe  71.060   10022.4
##  5:   Italy 1972 54365564    Europe  72.190   12269.3
##  6:   Italy 1977 56059245    Europe  73.480   14256.0
##  7:   Italy 1982 56535636    Europe  74.980   16537.5
##  8:   Italy 1987 56729703    Europe  76.420   19207.2
##  9:   Italy 1992 56840847    Europe  77.440   22013.6
## 10:   Italy 1997 57479469    Europe  78.820   24675.0
## 11:   Italy 2002 57926999    Europe  80.240   27968.1
## 12:   Italy 2007 58147733    Europe  80.546   28569.7
dt[.("Italy",77.44)]
##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe   77.44     22014
dt[.("Italy",77)]
##    country year pop continent lifeExp gdpPercap
## 1:   Italy   NA  NA        NA      77        NA
dt[.("Italy",77),roll="nearest"]
##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe      77     22014
ldf[ldf$country == "Italy", c("year", "lifeExp")]
## # A tibble: 12 × 2
##     year lifeExp
##    <int>   <dbl>
## 1   1952  65.940
## 2   1957  67.810
## 3   1962  69.240
## 4   1967  71.060
## 5   1972  72.190
## 6   1977  73.480
## 7   1982  74.980
## 8   1987  76.420
## 9   1992  77.440
## 10  1997  78.820
## 11  2002  80.240
## 12  2007  80.546
ldf %>%
  filter(country == "Italy") %>%
  select(year, lifeExp)
## # A tibble: 12 × 2
##     year lifeExp
##    <int>   <dbl>
## 1   1952  65.940
## 2   1957  67.810
## 3   1962  69.240
## 4   1967  71.060
## 5   1972  72.190
## 6   1977  73.480
## 7   1982  74.980
## 8   1987  76.420
## 9   1992  77.440
## 10  1997  78.820
## 11  2002  80.240
## 12  2007  80.546
small_df <- ldf[ldf$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")]
library(tidyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
str(small_df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    36 obs. of  3 variables:
##  $ country: Factor w/ 142 levels "Afghanistan",..: 45 45 45 45 45 45 45 45 45 45 ...
##  $ year   : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp: num  67.4 68.9 70.5 71.5 72.4 ...
table_df=spread(small_df,"country","lifeExp")
table_df
## # A tibble: 12 × 4
##     year France  Italy  Spain
## *  <int>  <dbl>  <dbl>  <dbl>
## 1   1952 67.410 65.940 64.940
## 2   1957 68.930 67.810 66.660
## 3   1962 70.510 69.240 69.690
## 4   1967 71.550 71.060 71.440
## 5   1972 72.380 72.190 73.060
## 6   1977 73.830 73.480 74.390
## 7   1982 74.890 74.980 76.300
## 8   1987 76.340 76.420 76.900
## 9   1992 77.460 77.440 77.570
## 10  1997 78.640 78.820 78.770
## 11  2002 79.590 80.240 79.780
## 12  2007 80.657 80.546 80.941
data_df=gather(table_df,"country","lifeExp",2:4)
data_df
## # A tibble: 36 × 3
##     year country lifeExp
##    <int>   <chr>   <dbl>
## 1   1952  France   67.41
## 2   1957  France   68.93
## 3   1962  France   70.51
## 4   1967  France   71.55
## 5   1972  France   72.38
## 6   1977  France   73.83
## 7   1982  France   74.89
## 8   1987  France   76.34
## 9   1992  France   77.46
## 10  1997  France   78.64
## # ... with 26 more rows
library(reshape2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dcast(small_df, formula = country~year,
      value.var="lifeExp")
##   country  1952  1957  1962  1967  1972  1977  1982  1987  1992  1997  2002
## 1  France 67.41 68.93 70.51 71.55 72.38 73.83 74.89 76.34 77.46 78.64 79.59
## 2   Italy 65.94 67.81 69.24 71.06 72.19 73.48 74.98 76.42 77.44 78.82 80.24
## 3   Spain 64.94 66.66 69.69 71.44 73.06 74.39 76.30 76.90 77.57 78.77 79.78
##     2007
## 1 80.657
## 2 80.546
## 3 80.941
table_df= dcast(small_df, formula = year~country,
                value.var="lifeExp")
table_df
##    year France  Italy  Spain
## 1  1952 67.410 65.940 64.940
## 2  1957 68.930 67.810 66.660
## 3  1962 70.510 69.240 69.690
## 4  1967 71.550 71.060 71.440
## 5  1972 72.380 72.190 73.060
## 6  1977 73.830 73.480 74.390
## 7  1982 74.890 74.980 76.300
## 8  1987 76.340 76.420 76.900
## 9  1992 77.460 77.440 77.570
## 10 1997 78.640 78.820 78.770
## 11 2002 79.590 80.240 79.780
## 12 2007 80.657 80.546 80.941
melt(table_df, id.vars=c("year"),
     value.name = "lifeExp",
     variable.name="country")
##    year country lifeExp
## 1  1952  France  67.410
## 2  1957  France  68.930
## 3  1962  France  70.510
## 4  1967  France  71.550
## 5  1972  France  72.380
## 6  1977  France  73.830
## 7  1982  France  74.890
## 8  1987  France  76.340
## 9  1992  France  77.460
## 10 1997  France  78.640
## 11 2002  France  79.590
## 12 2007  France  80.657
## 13 1952   Italy  65.940
## 14 1957   Italy  67.810
## 15 1962   Italy  69.240
## 16 1967   Italy  71.060
## 17 1972   Italy  72.190
## 18 1977   Italy  73.480
## 19 1982   Italy  74.980
## 20 1987   Italy  76.420
## 21 1992   Italy  77.440
## 22 1997   Italy  78.820
## 23 2002   Italy  80.240
## 24 2007   Italy  80.546
## 25 1952   Spain  64.940
## 26 1957   Spain  66.660
## 27 1962   Spain  69.690
## 28 1967   Spain  71.440
## 29 1972   Spain  73.060
## 30 1977   Spain  74.390
## 31 1982   Spain  76.300
## 32 1987   Spain  76.900
## 33 1992   Spain  77.570
## 34 1997   Spain  78.770
## 35 2002   Spain  79.780
## 36 2007   Spain  80.941
aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country))
##   Group.1      x
## 1  France 80.657
## 2   Italy 80.546
## 3   Spain 80.941
tapply(small_df$lifeExp,factor(small_df$country),max)
## France  Italy  Spain 
## 80.657 80.546 80.941
small_ldf = filter(ldf,country %in% c("France","Italy","Spain"))
group_by(small_ldf,country) %>% summarize(max(lifeExp))
##   max(lifeExp)
## 1       80.941
filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))
##   max(lifeExp)
## 1       80.941
filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp)) 
##   max(lifeExp)
## 1       80.941
plot(ldf[,"gdpPercap"][[1]],as.numeric(ldf[,"lifeExp"][[1]]))

select(ldf,gdpPercap,lifeExp) %>% plot()

setkey(dt, country)
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
##    country     V1
## 1:  France 80.657
## 2:   Italy 80.546
## 3:   Spain 80.941
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
##    country     V1
## 1:  France 80.657
## 2:   Italy 80.546
## 3:   Spain 80.941
head(ldf[with(ldf,order(-lifeExp)),])
## # A tibble: 6 × 6
##            country  year       pop continent lifeExp gdpPercap
##             <fctr> <int>     <dbl>    <fctr>   <dbl>     <dbl>
## 1            Japan  2007 127467972      Asia  82.603     31656
## 2 Hong Kong, China  2007   6980412      Asia  82.208     39725
## 3            Japan  2002 127065841      Asia  82.000     28605
## 4          Iceland  2007    301931    Europe  81.757     36181
## 5      Switzerland  2007   7554661    Europe  81.701     37506
## 6 Hong Kong, China  2002   6762476      Asia  81.495     30209
head(dt[order(-lifeExp)])
##             country year       pop continent lifeExp gdpPercap
## 1:            Japan 2007 127467972      Asia  82.603     31656
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725
## 3:            Japan 2002 127065841      Asia  82.000     28605
## 4:          Iceland 2007    301931    Europe  81.757     36181
## 5:      Switzerland 2007   7554661    Europe  81.701     37506
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209
head(setorder(dt,-lifeExp))
##             country year       pop continent lifeExp gdpPercap
## 1:            Japan 2007 127467972      Asia  82.603     31656
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725
## 3:            Japan 2002 127065841      Asia  82.000     28605
## 4:          Iceland 2007    301931    Europe  81.757     36181
## 5:      Switzerland 2007   7554661    Europe  81.701     37506
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209
arrange(ldf,-lifeExp)
## # A tibble: 1,704 × 6
##             country  year       pop continent lifeExp gdpPercap
##              <fctr> <int>     <dbl>    <fctr>   <dbl>     <dbl>
## 1             Japan  2007 127467972      Asia  82.603     31656
## 2  Hong Kong, China  2007   6980412      Asia  82.208     39725
## 3             Japan  2002 127065841      Asia  82.000     28605
## 4           Iceland  2007    301931    Europe  81.757     36181
## 5       Switzerland  2007   7554661    Europe  81.701     37506
## 6  Hong Kong, China  2002   6762476      Asia  81.495     30209
## 7         Australia  2007  20434176   Oceania  81.235     34435
## 8             Spain  2007  40448191    Europe  80.941     28821
## 9            Sweden  2007   9031088    Europe  80.884     33860
## 10           Israel  2007   6426679      Asia  80.745     25523
## # ... with 1,694 more rows
ldf$gdp=ldf$pop*ldf$gdpPercap
ldf = within(ldf,gdp = pop*gdpPercap)
## Error in eval(expr, envir, enclos): argument manquant, sans valeur associée par défaut
ldf = within(ldf,gdp <- pop*gdpPercap)
dt[,gdp:=pop*gdpPercap]
##                country year       pop continent lifeExp gdpPercap        gdp
##    1:            Japan 2007 127467972      Asia  82.603  31656.07 4.0351e+12
##    2: Hong Kong, China 2007   6980412      Asia  82.208  39724.98 2.7730e+11
##    3:            Japan 2002 127065841      Asia  82.000  28604.59 3.6347e+12
##    4:          Iceland 2007    301931    Europe  81.757  36180.79 1.0924e+10
##    5:      Switzerland 2007   7554661    Europe  81.701  37506.42 2.8335e+11
##   ---                                                                       
## 1700:     Sierra Leone 1952   2143249    Africa  30.331    879.79 1.8856e+09
## 1701:           Angola 1952   4232095    Africa  30.015   3520.61 1.4900e+10
## 1702:           Gambia 1952    284320    Africa  30.000    485.23 1.3796e+08
## 1703:      Afghanistan 1952   8425333      Asia  28.801    779.45 6.5671e+09
## 1704:           Rwanda 1992   7290203    Africa  23.599    737.07 5.3734e+09
head(dt)
##             country year       pop continent lifeExp gdpPercap        gdp
## 1:            Japan 2007 127467972      Asia  82.603     31656 4.0351e+12
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725 2.7730e+11
## 3:            Japan 2002 127065841      Asia  82.000     28605 3.6347e+12
## 4:          Iceland 2007    301931    Europe  81.757     36181 1.0924e+10
## 5:      Switzerland 2007   7554661    Europe  81.701     37506 2.8335e+11
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209 2.0429e+11
mutate(ldf,gdp=pop*gdpPercap)
## # A tibble: 1,704 × 7
##        country  year      pop continent lifeExp gdpPercap        gdp
##         <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>      <dbl>
## 1  Afghanistan  1952  8425333      Asia  28.801    779.45 6.5671e+09
## 2  Afghanistan  1957  9240934      Asia  30.332    820.85 7.5854e+09
## 3  Afghanistan  1962 10267083      Asia  31.997    853.10 8.7589e+09
## 4  Afghanistan  1967 11537966      Asia  34.020    836.20 9.6480e+09
## 5  Afghanistan  1972 13079460      Asia  36.088    739.98 9.6786e+09
## 6  Afghanistan  1977 14880372      Asia  38.438    786.11 1.1698e+10
## 7  Afghanistan  1982 12881816      Asia  39.854    978.01 1.2599e+10
## 8  Afghanistan  1987 13867957      Asia  40.822    852.40 1.1821e+10
## 9  Afghanistan  1992 16317921      Asia  41.674    649.34 1.0596e+10
## 10 Afghanistan  1997 22227415      Asia  41.763    635.34 1.4122e+10
## # ... with 1,694 more rows
countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE)
countries = countries[-1,]
head(countries)
##       Country Interational.Direct.Dial..IDD..Code ISO.3166 Car.Code
## 2 Afghanistan                                  93       AF         
## 3     Albania                                 355       AL         
## 4     Algeria                                 213       DZ         
## 5     Andorra                              33 628       AD         
## 6      Angola                                 244       AO         
## 7    Anguilla                               1 809       AI         
##         Capital              Currency ISO.Currency Digraph  ICAO   Language
## 2         Kabul               Afghani          AFA      AF   YA-     Pashto
## 3        Tirana                   Lek          ALL      AL   ZA-   Albanian
## 4       Algiers                 Dinar          DZD      DZ   7T-     Arabic
## 5 And. La Vella          French Franc          FRF      AD   C3-    Catalan
## 6        Luanda           New Irwanza          AON      AO   D2- Portuguese
## 7               East Caribbean Dollar          XCD      AI VP-LA           
##   ISO.639
## 2      PS
## 3      SQ
## 4      AR
## 5      CA
## 6      PT
## 7
df2 = countries[,c("Country","Capital")]
names(df2)=c("countries","capital")
df1 = aggregate(ldf$lifeExp,FUN=max,by=list(ldf$country))
names(df1)=c("countries","lifeExp")
dim(df1)
## [1] 142   2
dim(df2)
## [1] 263   2
tail(merge(df1,df2))
##          countries lifeExp    capital
## 122 United Kingdom  79.425     London
## 123        Uruguay  76.384 Montevideo
## 124      Venezuela  73.747    Caracas
## 125        Vietnam  74.249      Hanoi
## 126         Zambia  51.821     Lusaka
## 127       Zimbabwe  62.351     Harare
tail(merge(df1,df2,all.x=TRUE))
##              countries lifeExp capital
## 137          Venezuela  73.747 Caracas
## 138            Vietnam  74.249   Hanoi
## 139 West Bank and Gaza  73.422    <NA>
## 140        Yemen, Rep.  62.698    <NA>
## 141             Zambia  51.821  Lusaka
## 142           Zimbabwe  62.351  Harare
tail(merge(df1,df2,all.y=TRUE))
##                     countries lifeExp  capital
## 258 Wallis and Futuna Islands      NA         
## 259                 West Bank      NA         
## 260                     World      NA         
## 261                     Yemen      NA   Sana'a
## 262                Yugoslavia      NA Belgrade
## 263                     Zaire      NA Kinshasa
ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) 
names(ldf1)=c("countries","lifeExp")
## Error in names(ldf1) = c("countries", "lifeExp"): attribut 'names' [2] doit être de même longueur que le vecteur [1]
ldf2 <- tbl_df(df2)
inner_join(ldf1,ldf2)
## Error: No common variables. Please specify `by` param.
left_join(ldf1,ldf2)
## Error: No common variables. Please specify `by` param.
ldf1 %>% inner_join(ldf2)
## Error: No common variables. Please specify `by` param.
dt1 = dt[,max(lifeExp),by="country"]
setnames(dt1,c("countries","lifeExp"))
dt2 <- data.table(df2)
setkey(dt1,"countries")
setkey(dt2,"countries")
dt1[dt2,nomatch=0]
##        countries lifeExp      capital
##   1: Afghanistan  43.828        Kabul
##   2:     Albania  76.423       Tirana
##   3:     Algeria  72.301      Algiers
##   4:      Angola  42.731       Luanda
##   5:   Argentina  75.320 Buenos Aires
##  ---                                 
## 123:     Uruguay  76.384   Montevideo
## 124:   Venezuela  73.747      Caracas
## 125:     Vietnam  74.249        Hanoi
## 126:      Zambia  51.821       Lusaka
## 127:    Zimbabwe  62.351       Harare
dt1[dt2]
##        countries lifeExp       capital
##   1: Afghanistan  43.828         Kabul
##   2:     Albania  76.423        Tirana
##   3:     Algeria  72.301       Algiers
##   4:     Andorra      NA And. La Vella
##   5:      Angola  42.731        Luanda
##  ---                                  
## 259:       Yemen      NA        Sana'a
## 260:  Yugoslavia      NA      Belgrade
## 261:       Zaire      NA      Kinshasa
## 262:      Zambia  51.821        Lusaka
## 263:    Zimbabwe  62.351        Harare
dt2[dt1]
##               countries      capital lifeExp
##   1:        Afghanistan        Kabul  43.828
##   2:            Albania       Tirana  76.423
##   3:            Algeria      Algiers  72.301
##   4:             Angola       Luanda  42.731
##   5:          Argentina Buenos Aires  75.320
##  ---                                        
## 138:            Vietnam        Hanoi  74.249
## 139: West Bank and Gaza           NA  73.422
## 140:        Yemen, Rep.           NA  62.698
## 141:             Zambia       Lusaka  51.821
## 142:           Zimbabwe       Harare  62.351
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(lubridate,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(LaF,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
ssn <- laf_open_csv("temp",sep="\t", 
                    column_types=c("character","integer","numeric","character","numeric","numeric"),
                    column_names=names(ldf)[1:6])
object.size(ssn)
## 2888 bytes

LaF (large ASCII files)

str(ssn)
## Formal class 'laf' [package "LaF"] with 8 slots
##   ..@ file_id      : int 0
##   ..@ filename     : chr "temp"
##   ..@ file_type    : chr "csv"
##   ..@ column_names : chr [1:6] "country" "year" "pop" "continent" ...
##   ..@ column_types : int [1:6] 3 1 0 3 0 0
##   ..@ column_widths: int(0) 
##   ..@ levels       : list()
##   ..@ options      :List of 4
##   .. ..$ sep : chr "\t"
##   .. ..$ dec : chr "."
##   .. ..$ skip: int 0
##   .. ..$ trim: logi FALSE
nrow(ssn)
## [1] 1705
go_through <- which(ssn[1:(nrow(ssn)-1),'country'] != ssn[2:nrow(ssn),'country'])+1
if(go_through[length(go_through)] != nrow(ssn)) go_through <- c(go_through, nrow(ssn))
go_through <- cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))]-1, go_through[length(go_through)]))
worst_life_exp <- function(s){
  data <- ssn[go_through[s,1]:go_through[s,2],c("country","year","lifeExp")]
  data[which.min(data$lifeExp),]
}
worst_life_exp(10)
##   country year lifeExp
## 1 Belgium 1952      68
s <- 10
ldf[go_through[s,1]:go_through[s,2]-1,]
## # A tibble: 12 × 7
##    country  year      pop continent lifeExp gdpPercap        gdp
##     <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>      <dbl>
## 1  Belgium  1952  8730405    Europe  68.000    8343.1 7.2839e+10
## 2  Belgium  1957  8989111    Europe  69.240    9715.0 8.7329e+10
## 3  Belgium  1962  9218400    Europe  70.250   10991.2 1.0132e+11
## 4  Belgium  1967  9556500    Europe  70.940   13149.0 1.2566e+11
## 5  Belgium  1972  9709100    Europe  71.440   16672.1 1.6187e+11
## 6  Belgium  1977  9821800    Europe  72.800   19118.0 1.8777e+11
## 7  Belgium  1982  9856303    Europe  73.930   20979.8 2.0678e+11
## 8  Belgium  1987  9870200    Europe  75.350   22525.6 2.2233e+11
## 9  Belgium  1992 10045622    Europe  76.460   25575.6 2.5692e+11
## 10 Belgium  1997 10199787    Europe  77.530   27561.2 2.8112e+11
## 11 Belgium  2002 10311970    Europe  78.320   30485.9 3.1437e+11
## 12 Belgium  2007 10392226    Europe  79.441   33692.6 3.5014e+11

Graphs with ggplot2

library(gamair,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
data(chicago)
head(chicago)
##   death pm10median pm25median o3median so2median    time tmpd
## 1   130   -7.43354         NA  -19.592   1.92804 -2556.5 31.5
## 2   150         NA         NA  -19.039  -0.98556 -2555.5 33.0
## 3   101   -0.82653         NA  -20.217  -1.89142 -2554.5 33.0
## 4   135    5.56646         NA  -19.676   6.13934 -2553.5 29.0
## 5   126         NA         NA  -19.217   2.27846 -2552.5 32.0
## 6   130    6.56646         NA  -17.634   9.85858 -2551.5 40.0

ggplot2 documentation

base=data.frame(death=chicago$death,
                temp_F=chicago$tmpd,
                o3=chicago$o3median,
                date=seq(as.Date("1987-01-01"),
                         as.Date("2000-12-31"),by=1))
base$temp_C <- (base$temp_F-32)/1.8
base$year <- substring(base$date,1,4)
date2season <- function(date){
  m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m"))
  d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d"))
  s <- NA
  if(m %in% c(1,2) | ((m==12)&(d>=21)) |  ((m==3)&(d<21))) s <- "winter"
  if(m %in% c(4,5) | ((m==3)&(d>=21)) |  ((m==6)&(d<21))) s <- "spring"
  if(m %in% c(7,8) | ((m==6)&(d>=21)) |  ((m==9)&(d<21))) s <- "summer"
  if(m %in% c(10,11) | ((m==9)&(d>=21)) |  ((m==12)&(d<21))) s <- "autumn"
  return(s)}
base$season <- sapply(base$date,date2season)
head(base)
##   death temp_F      o3       date   temp_C year season
## 1   130   31.5 -19.592 1987-01-01 -0.27778 1987 winter
## 2   150   33.0 -19.039 1987-01-02  0.55556 1987 winter
## 3   101   33.0 -20.217 1987-01-03  0.55556 1987 winter
## 4   135   29.0 -19.676 1987-01-04 -1.66667 1987 winter
## 5   126   32.0 -19.217 1987-01-05  0.00000 1987 winter
## 6   130   40.0 -17.634 1987-01-06  4.44444 1987 winter
plot(base[,c("date", "temp_C")])

library(ggplot2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
g <- ggplot(base, aes(date, temp_C)) +
  geom_point()
g

ggsave(file="figure.pdf", scale=3)
## Saving 30 x 15 in image
g <- g+labs(title='Temperature')
g <- g+ggtitle('Temperature')
g

g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red"))

g<-g+labs(
  x="Date", 
  y=expression(paste("Temperature ( ", degree ~ C, " )")), 
  title="Temperature in Chicago")
g

g+ theme(
  axis.title.x = element_text(color="blue"),
  axis.title.y = element_text(color="blue")   
)

g + ylim(c(-20,60))
## Warning: Removed 10 rows containing missing values (geom_point).

g <- g + xlim(as.Date(c("1995-01-01","1999-01-01")))
g
## Warning: Removed 3652 rows containing missing values (geom_point).

base2=data.frame(
  today=base$temp_C[1:5113],tomorrow=base$temp_C[2:5114])
ggplot(base2, aes(today, tomorrow))+geom_point()+ coord_equal()

g <- ggplot(base, aes(date, temp_C, color=factor(season)))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))+
  geom_point()
g+theme(legend.title=element_blank())+
  guides(colour = guide_legend(override.aes = list(size=4)))
## Warning: Removed 3652 rows containing missing values (geom_point).

g<-ggplot(base, aes(date, temp_C))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))
g+geom_line(color="grey")
## Warning: Removed 3652 rows containing missing values (geom_path).

g+geom_line(aes(color="temperature"))+
  scale_colour_manual(name='', values=c('temperature'='grey'))
## Warning: Removed 3652 rows containing missing values (geom_path).

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + 
  scale_color_manual(values=
                       c("dodgerblue4", "darkolivegreen4",
                         "darkorchid3", "goldenrod1"))

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient(low="blue", high="red")

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient2(low="blue", mid="white", high="red")

ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+
  theme(panel.background = element_rect(fill = 'light yellow'))

ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+
  facet_wrap(~year, ncol=2, scales="free")

library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+
  theme_excel()+scale_colour_excel()+
  theme(legend.title=element_blank())

library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like 'The Economist'")+
  theme_economist()+theme(legend.title=element_blank())

OpenStreetMap

library(OpenStreetMap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map <- openmap(c(lat= 51.516, lon= -.141), c(lat= 51.511, lon= -.133))
map <- openproj(map, projection = "+init=epsg:27700") 
plot(map)

loc="http://freakonometrics.free.fr/cholera.zip"
download.file(loc,destfile="cholera.zip")
unzip("cholera.zip", exdir="./")
library(maptools,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Checking rgeos availability: TRUE
deaths<-readShapePoints("Cholera_Deaths")
head(deaths@coords)
##   coords.x1 coords.x2
## 0    529309    181031
## 1    529312    181025
## 2    529314    181020
## 3    529317    181014
## 4    529321    181008
## 5    529337    181006

documentation on shapefiles

plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )

X <- deaths@coords
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range
library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20)
plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )
image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs)
contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE)

Google Map

library(ggmap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
get_london <- get_map(c(-.137,51.513), zoom=17)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513,-0.137&zoom=17&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
london <- ggmap(get_london)

map visualization with Google Maps

OpenStreetMap

london

df_deaths <- data.frame(X)
library(sp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## rgdal: version: 1.2-4, (SVN revision 643)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.11.3, released 2015/09/16
##  Path to GDAL shared files: /usr/share/gdal/1.11
##  Loaded PROJ.4 runtime: Rel. 4.9.2, 08 September 2015, [PJ_VERSION: 492]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-3
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))

OpenStreetMap

london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red")
## Warning: Removed 9 rows containing missing values (geom_point).

london <- london + geom_point(aes(x=coords.x1, y=coords.x2), 
data=data.frame(df_deaths@coords),col="red") +
  geom_density2d(data = data.frame(df_deaths@coords), 
aes(x = coords.x1, y=coords.x2), size = 0.3) + 
  stat_density2d(data = data.frame(df_deaths@coords), 
 aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..),
 size = 0.01, bins = 16, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
 guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE)
london
## Warning: Removed 9 rows containing non-finite values (stat_density2d).

## Warning: Removed 9 rows containing non-finite values (stat_density2d).
## Warning: Removed 9 rows containing missing values (geom_point).

require(leaflet)
deaths <- readShapePoints("Cholera_Deaths")
df_deaths <- data.frame(deaths@coords)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
df=data.frame(df_deaths@coords)
lng=df$coords.x1
lat=df$coords.x2
m = leaflet()%>% addTiles() 
m = m %>% fitBounds(-.141,  51.511, -.133, 51.516)
rd=.5; op=.8; clr="blue"
m = leaflet() %>% addTiles()
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
X=cbind(lng,lat)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range
x=kde2d$x1; y=kde2d$x2; z=kde2d$fhat
CL=contourLines(x , y , z)
m = leaflet() %>% addTiles()
m = m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles() 
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles() 
m = m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
  addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
  addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")
 library(osmar)
 s=osmsource_api()
 city=c((51.511+51.516)/2, -(.133+.141)/2)
 bb=center_bbox(city[2],city[1], 800, 800)
 ua=get_osm(bb, source = s)
 bg_ids=find(ua, way(tags(k=="building")))
 bg_ids=find_down(ua, way(bg_ids))
 bg=subset(ua, ids = bg_ids)
 bg_poly=as_sp(bg, "polygons")
 plot(bg_poly, col = gray.colors(12)[11],border="gray")

 cw_ids=find(ua, way(tags(v %in% c("residential","pedestrian"))))
 cw_ids=find_down(ua, way(cw_ids))
 cw=subset(ua, ids = cw_ids)
 cw_line=as_sp(cw, "lines")
 plot(bg_poly, col = gray.colors(12)[11],border="gray")
 plot(cw_line, add = TRUE, col = gray.colors(12)[5],lwd=3)

Map with googleVis

library(googleVis)
df=data.frame(state=c("AU-WA","AU-VIC","AU-NT", "AU-NSW", "AU-SA","AU-QLD","AU-TAS","NZ"),
R_users=c(323,425,154,486,201,195,87,123))
Geo <- gvisGeoChart(df,
locationvar="state",colorvar=c("R_users"),
options=list(region="AU",
dataMode="regions",resolution="provinces",
width="80%", height="80%"))

Map with googleVis

library(XML,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
year <- 2012
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
tabs[1]
## $`NULL`
##     #                    Name          Date Wind Pres Cat  
## 1   1  Tropical Storm ALBERTO     19-23 MAY   50  995   -  
## 2   2    Tropical Storm BERYL 25 MAY- 2 JUN   60  992   -  
## 3   3       Hurricane-1 CHRIS     17-24 JUN   75  974   1  
## 4   4    Tropical Storm DEBBY     23-27 JUN   55  990   -  
## 5   5     Hurricane-2 ERNESTO      1-10 AUG   85  973   2  
## 6   6 Tropical Storm FLORENCE      3- 8 AUG   50 1002   -  
## 7   7   Tropical Storm HELENE      9-19 AUG   40 1004   -  
## 8   8      Hurricane-2 GORDON     15-21 AUG   95  965   2  
## 9   9       Hurricane-1 ISAAC 20 AUG- 1 SEP   70  965   1  
## 10 10    Tropical Storm JOYCE     21-24 AUG   35 1006   -  
## 11 11        Hurricane-2 KIRK 28 AUG- 3 SEP   90  970   2  
## 12 12      Hurricane-1 LESLIE 28 AUG-12 SEP   70  968   1  
## 13 13     Hurricane-3 MICHAEL      2-12 SEP  100  964   3  
## 14 14      Hurricane-1 NADINE 10 SEP- 4 OCT   80  978   1  
## 15 15    Tropical Storm OSCAR      2- 5 OCT   45  994   -  
## 16 16    Tropical Storm PATTY     10-13 OCT   40 1005   -  
## 17 17      Hurricane-1 RAFAEL     12-26 OCT   80  966   1  
## 18 18       Hurricane-3 SANDY     21-31 OCT  100  940   3  
## 19 19     Tropical Storm TONY     21-26 OCT   45 1000   -  
storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" "))
storms
##  [1] "Tropical"    "Storm"       "ALBERTO"     "Tropical"    "Storm"      
##  [6] "BERYL"       "Hurricane-1" "CHRIS"       "Tropical"    "Storm"      
## [11] "DEBBY"       "Hurricane-2" "ERNESTO"     "Tropical"    "Storm"      
## [16] "FLORENCE"    "Tropical"    "Storm"       "HELENE"      "Hurricane-2"
## [21] "GORDON"      "Hurricane-1" "ISAAC"       "Tropical"    "Storm"      
## [26] "JOYCE"       "Hurricane-2" "KIRK"        "Hurricane-1" "LESLIE"     
## [31] "Hurricane-3" "MICHAEL"     "Hurricane-1" "NADINE"      "Tropical"   
## [36] "Storm"       "OSCAR"       "Tropical"    "Storm"       "PATTY"      
## [41] "Hurricane-1" "RAFAEL"      "Hurricane-3" "SANDY"       "Tropical"   
## [46] "Storm"       "TONY"
index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx")
nstorms  <- storms[!index]
nstorms
##  [1] "ALBERTO"  "BERYL"    "CHRIS"    "DEBBY"    "ERNESTO"  "FLORENCE"
##  [7] "HELENE"   "GORDON"   "ISAAC"    "JOYCE"    "KIRK"     "LESLIE"  
## [13] "MICHAEL"  "NADINE"   "OSCAR"    "PATTY"    "RAFAEL"   "SANDY"   
## [19] "TONY"
TRACK=NULL
for(i in length(nstorms):1){
  if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN"
  if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA"
  if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN"
  if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL"
  if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA"
  if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN"
  if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE"
  if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD"
  if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH"
  if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"}
for(i in length(nstorms):1){
  loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
  track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
  
  names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
  track$LAT=as.numeric(as.character(track$LAT))
  track$LON=as.numeric(as.character(track$LON))
  track$WIND=as.numeric(as.character(track$WIND))
  track$PR=as.numeric(as.character(track$PR))
  track$year=year
  track$name=nstorms[i]
  TRACK=rbind(TRACK,track)}

tail(TRACK,10)
##     ADV  LAT   LON         TIME WIND   PR           STAT year    name
## 645  11 30.4 -79.1  05/21/12Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 646  12 30.5 -78.3  05/21/18Z     35 1006 TROPICAL STORM 2012 ALBERTO
## 647  13 30.7 -77.1  05/22/00Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 648  14 31.5 -76.1  05/22/06Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 649  15 32.5 -74.7  05/22/12Z     30 1008            LOW 2012 ALBERTO
## 650  16 33.4 -73.4  05/22/18Z     30 1008            LOW 2012 ALBERTO
## 651  17 34.1 -71.9  05/23/00Z     25 1010            LOW 2012 ALBERTO
## 652  18 34.9 -70.1  05/23/06Z     25 1011            LOW 2012 ALBERTO
## 653  19 35.5 -67.9  05/23/12Z     25 1012            LOW 2012 ALBERTO
## 654  20 35.9 -66.0  05/23/18Z     25 1012            LOW 2012 ALBERTO
extract.track <- function(year=2012,p=TRUE){
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
storms <- unlist(strsplit(as.character(
tabs[[1]]$Name),split=" "))
index <- storms%in%c("Tropical","Storm",paste("Hurricane-",1:6,sep=""),"Depression","Subtropical","Extratropical","Low",paste("Storm-",1:6,sep=""),
"Xxx")
nstorms  <- storms[!index]
TRACK=NULL
for(i in length(nstorms):1){
loc=paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
track=read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
track$LAT=as.numeric(as.character(track$LAT))
track$LON=as.numeric(as.character(track$LON))
track$WIND=as.numeric(as.character(track$WIND))
track$PR=as.numeric(as.character(track$PR))
track$year=year
track$name=nstorms[i]
TRACK=rbind(TRACK,track)
if(p==TRUE){  cat(year,i,nstorms[i],nrow(track),"\n")}}
return(TRACK)}
TOTTRACK=NULL
#  for(y in 2012:1851){
#    TOTTRACK=rbind(TOTTRACK,extract.track(y))
#  }

loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata"
download.file(loc,destfile="TRACK-ATLANTIC.Rdata")
load("TRACK-ATLANTIC.Rdata")
dim(TOTTRACK)
## [1] 26712     9
library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(maps,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map("world", col="light yellow", fill=TRUE)

map("world", col="light yellow", fill=TRUE)
for(n in unique(TOTTRACK$name)){
  lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))}

Shapefiles

loc="http://freakonometrics.free.fr/Italy.zip"
download.file(loc,destfile="Italy.zip")
unzip("Italy.zip", exdir="./")
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE)
names(ita1)
## [1] "ID_0"      "ISO"       "NAME_0"    "ID_1"      "NAME_1"    "NL_NAME_1"
## [7] "VARNAME_1" "TYPE_1"    "ENGTYPE_1"
plot(ita1,col="light green")

ita1@data[,"NAME_1"][1:6]
## [1] Abruzzo        Apulia         Basilicata     Calabria       Campania      
## [6] Emilia-Romagna
## 20 Levels: Abruzzo Apulia Basilicata Calabria Campania ... Veneto
pos<-which(ita1@data[,"NAME_1"]=="Sicily")
Poly_Sicily<-ita1[pos,]
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
abline(v=5:20,col="light blue")
abline(h=35:50,col="light blue")
axis(1)
axis(2)

pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily"))
ita_south <- ita1[pos,]
ita_north <- ita1[-pos,]
plot(ita1)
plot(ita_south,col="red",add=TRUE)
plot(ita_north,col="blue",add=TRUE)

library(xlsx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## 
## Attaching package: 'rJava'
## The following object is masked from 'package:RCurl':
## 
##     clone
loc="http://freakonometrics.free.fr/Italy_codes.xlsx"
download.file(loc,destfile="Italy_codes.xlsx")
data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE)
names(data_codes)[1]="NAME_1"
ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE)
library(rgeos,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## rgeos version: 0.3-21, (SVN revision 540)
##  GEOS runtime version: 3.5.0-CAPI-1.9.0 r4084 
##  Linking to sp version: 1.2-3 
##  Polygon checking: TRUE
ita_s<-gUnionCascaded(ita_south)
ita_n<-gUnionCascaded(ita_north)
plot(ita1)
plot(ita_s,col="red",add=TRUE)
plot(ita_n,col="blue",add=TRUE)

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
gCentroid(Poly_Sicily,byid=TRUE)
## class       : SpatialPoints 
## features    : 1 
## extent      : 14.147, 14.147, 37.588, 37.588  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
points(gCentroid(Poly_Sicily,byid=TRUE),pch=19,col="red")

G <- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)

c1 <-G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords
##        x      y
## 1 11.069 44.033
## 1 14.200 41.749
plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)

Linear Model

\[ y_i=\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta}+\varepsilon_i=\beta_0 + [\beta_1 x_{1,i}+\cdots+ \beta_k x_{k,i}]+\varepsilon_i \]

See linear model, where \(\varepsilon_i\sim\mathcal{N}(0,\sigma^2)\) i.id.

\[ (Y\vert \boldsymbol{X}=\boldsymbol{x})\sim \mathcal{N}(\boldsymbol{x}^{\text{T}}\boldsymbol{\beta}, \sigma^2) \] i.e. \[ \mathbb{E}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\boldsymbol{x}^{\text{T}}\boldsymbol{\beta} \] and homoscedastic model, \[ \text{Var}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\sigma^2 \]

Least squares (and maximum likelihood) estimator \[ \widehat{\boldsymbol{\beta}}=\text{argmin} \left\lbrace \sum_{i=1}^n (y_i-\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta})^2 \right\rbrace =(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}\boldsymbol{y}\]

Linear Model

plot(cars)

model <- lm(dist ~ speed, data=cars)

Linear Model

summary(model)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29.07  -9.53  -2.27   9.21  43.20 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.579      6.758   -2.60    0.012 *  
## speed          3.932      0.416    9.46  1.5e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.4 on 48 degrees of freedom
## Multiple R-squared:  0.651,  Adjusted R-squared:  0.644 
## F-statistic: 89.6 on 1 and 48 DF,  p-value: 1.49e-12

Linear Model

X=cbind(1,cars$speed)
Y=cars$dist
solve(crossprod(X,X),crossprod(X,Y))
##          [,1]
## [1,] -17.5791
## [2,]   3.9324
model$coefficients
## (Intercept)       speed 
##    -17.5791      3.9324

Linear Model

summary(model)$sigma^2*solve(crossprod(X,X))
##         [,1]     [,2]
## [1,] 45.6765 -2.65882
## [2,] -2.6588  0.17265
vcov(model)
##             (Intercept)    speed
## (Intercept)     45.6765 -2.65882
## speed           -2.6588  0.17265
n=nrow(cars)

Linear Model

confint(model)
##               2.5 %  97.5 %
## (Intercept) -31.168 -3.9903
## speed         3.097  4.7679
model$coefficients[1]+qt(c(.025,.975),n-2)* summary(model)$coefficients[1,2]
## [1] -31.1678  -3.9903

Linear Model

coefficient of determination \(\displaystyle{R^2 = \frac{\text{explained variance}}{\text{total variance}}}\)

summary(model)$r.squared
## [1] 0.65108
1-deviance(model)/sum((Y-mean(Y))^2)
## [1] 0.65108

\[ \text{Var}[Y]= \text{Var}[\mathbb{E}[Y\vert X]]+ \mathbb{E}[\text{Var}[Y\vert X]] \]

see analysis of variance

Linear Model

\[ \overline{r, error=TRUE}^2 = 1-[1-R^2]\cdot \frac{n-1}{n-(k-1)-1} \]

summary(model)$adj.r.squared
## [1] 0.64381

Linear Model

anova(lm(dist~speed,data=cars),lm(dist~1,data=cars))
## Analysis of Variance Table
## 
## Model 1: dist ~ speed
## Model 2: dist ~ 1
##   Res.Df   RSS Df Sum of Sq    F  Pr(>F)    
## 1     48 11354                              
## 2     49 32539 -1    -21185 89.6 1.5e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear Model

plot(cars)
abline(model,col="red")

Linear Model

plot(cars)
abline(model,col="red")
x=seq(2,26)
y=predict(model, newdata=data.frame(speed=x))
lines(x,y,lwd=2,col="blue")

Linear Model

y=predict(model, newdata=data.frame(speed=x), interval = "confidence")
head(y)
##       fit      lwr     upr
## 1 -9.7143 -21.7331  2.3045
## 2 -5.7819 -17.0266  5.4629
## 3 -1.8495 -12.3295  8.6306
## 4  2.0829  -7.6442 11.8100
## 5  6.0154  -2.9733 15.0041
## 6  9.9478   1.6790 18.2166

\[ \text{Var}[\widehat{Y}(\boldsymbol{x})]=\text{Var}[\boldsymbol{x}^{\text{T}}\widehat{\boldsymbol{\beta}}]=\boldsymbol{x}^{\text{T}}\text{Var}[\widehat{\boldsymbol{\beta}}]\boldsymbol{x} =\widehat{\sigma}^2 \boldsymbol{x}^{\text{T}}[\boldsymbol{X}^{\text{T}}\boldsymbol{X}]^{-1}\boldsymbol{x} \]

Linear Model

plot(cars)
polygon(c(x,rev(x)),c(y[,2],rev(y[,3])),col=rgb(0,0,1,.4),border=NA)
lines(x,y[,1],lwd=2,col="blue")

Linear Model and Bagging

Method 1

Y=matrix(NA,1000,length(x))
for(b in 1:1000){
  idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
  modelb <- lm(dist ~ speed, data=cars[idx,])
  Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}

See bootstrap, based on pseudo-sample \[ \lbrace(\boldsymbol{x}_{i_1},y_{i_1}), \cdots,(\boldsymbol{x}_{i_n},y_{i_n}) \rbrace \] where \((i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace\).

Linear Model and Bagging

Method 1

plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))

Linear Model and Bagging

plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)

Linear Model and Bagging

Method 2

pred_dist=predict(model)
epsilon  =residuals(model)
Y=matrix(NA,1000,length(x))
for(b in 1:1000){
  idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
  carsb=data.frame(speed=cars$speed,
                   dist=pred_dist+epsilon[idx])
  modelb <- lm(dist ~ speed, data=carsb)
  Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}

See bootstrap, based on pseudo-sample \[ \lbrace(\boldsymbol{x}_{1},\widehat{y}_{1}+\widehat{\varepsilon}_{i_1}), \cdots,(\boldsymbol{x}_{n},\widehat{y}_{n}+\widehat{\varepsilon}_{i_n}) \rbrace \] where \((i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace\).

Linear Model and Bagging

Method 2

plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))

Linear Model and Bagging

plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)

Errors

plot(cars)
abline(model,col="red")
segments(cars$speed,cars$dist,cars$speed,predict(model),col="blue")

Least Squares?

cars2=cars; cars2[,2]=cars[,2]/10
plot(cars2,ylab="dist/10")
acp=princomp(cars2)
b=acp$loadings[2,1]/acp$loadings[1,1]
a=acp$center[2]-b*acp$center[1]
abline(a,b,col="red")

Least Squares?

plot(cars2,ylab="dist/10",xlim=c(0,30),ylim=c(-1,12))
abline(a,b,col="red")
t <- acp$loadings[,1] %*% (t(cars2)-acp$center)
X1 <- acp$center[1] +t * acp$loadings[1,1]
X2 <- acp$center[2] +t * acp$loadings[2,1]
segments(cars2$speed,cars2$dist,X1,X2,col="blue")

Least Absolute Deviation

  f <- function(a) sum(abs(cars$dist-(a[1]+a[2]*cars$speed))) 
  opt <- optim( c(0,0), f )$par
  plot(cars)
  abline(model, col='red', lty=2)
  abline(opt[1], opt[2],col="blue")

Quantile Regression

library(quantreg,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
abline(model, col="blue")
abline(rq(dist ~ speed,data=cars, tau=.5),col="red",lty=2)
abline(rq(dist ~ speed,data=cars, tau=.9),col="purple",lty=2)

Diagnostic and Errors

plot(model,which=1)

Scatterplot \((\widehat{Y}_i,\widehat{\varepsilon}_i)\)

Diagnostic and Errors

plot(model,which=2)

Scatterplot \(\displaystyle{\left(\widehat{\varepsilon}_{i:n},\Phi^{-1} \left(\frac{i}{n}\right)\right)}\)

Diagnostic and Errors

plot(model,which=3)

Scatterplot \((\widehat{Y}_i,\sqrt{\vert\widehat{\varepsilon}_i\vert})\)

Diagnostic and Errors

plot(model,which=4)

Cook distance, \(\displaystyle{C_i=\frac{\widehat{\varepsilon}_i^2}{p\cdot\text{MSE}}\cdot\left(\frac{H_{i,i}}{(1-H_{i,i})^2}\right)}\) with \(\boldsymbol{H}=\boldsymbol{X}(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}=[H_{i,i}]\)

C=cooks.distance(model)

Diagnostic and Errors

\(H_{i,i}\) are leverages, and define Studentized residuals as \[ \widehat{r, error=TRUE}_i=\frac{\widehat{\varepsilon}_i}{ \widehat{\sigma} \sqrt{1-H_{i,i}}} \]

rstudent(model)
##         1         2         3         4         5         6         7         8 
##  0.263450  0.816078 -0.397812  0.810353  0.140703 -0.517161 -0.246246  0.279834 
##         9        10        11        12        13        14        15        16 
##  0.810904 -0.570047  0.152092 -1.030378 -0.629925 -0.366705 -0.105093 -0.492517 
##        17        18        19        20        21        22        23        24 
##  0.029817  0.029817  0.817162 -0.750784 -0.095921  1.499720  3.022829 -1.420977 
##        25        26        27        28        29        30        31        32 
## -1.012276  0.824408 -0.874115 -0.347522 -1.139035 -0.605535  0.047371 -0.734220 
##        33        34        35        36        37        38        39        40 
##  0.182229  1.521459  2.098482 -1.409292 -0.731459  0.713309 -1.982389 -0.862936 
##        41        42        43        44        45        46        47        48 
## -0.596376 -0.332475  0.192085 -0.193933 -1.274939 -0.455573  1.027735  1.097019 
##        49        50 
##  3.184993  0.287745

Diagnostic and Errors

plot(model,which=5)

Scatterplot \((H_{i,i},\widehat{r, error=TRUE}_i)\)

Diagnostic and Errors

plot(model,which=6)

Scatterplot \((H_{i,i},C_i)\)

Diagnostic and Errors

hist(residuals(model),probability=TRUE, col="light green")
lines(density(residuals(model)),lwd=2,col="red")
boxplot(residuals(model),horizontal=TRUE,add=TRUE,at=.024, 
  pars=list(boxwex=.004),col=rgb(0,0,1,.25))

Diagnostic and Errors

sigma=summary(model)$sigma
plot(ecdf(residuals(model)/sigma))
lines(seq(-3,3,by=.1),pnorm(seq(-3,3,by=.1)),lwd=2,col="red")

Diagnostic and Errors

Kolmogorov-Smirnov $ d=_{x}(x)-_n(x)$

ks.test(residuals(model)/sigma,"pnorm")
## Warning in ks.test(residuals(model)/sigma, "pnorm"): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model)/sigma
## D = 0.131, p-value = 0.36
## alternative hypothesis: two-sided

Diagnostic and Errors

Anderson-Darling, Cramer-von Mises,

library(nortest,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ad.test(residuals(model))
## 
##  Anderson-Darling normality test
## 
## data:  residuals(model)
## A = 0.794, p-value = 0.037
cvm.test(residuals(model))
## 
##  Cramer-von Mises normality test
## 
## data:  residuals(model)
## W = 0.126, p-value = 0.048

Model Choice

AIC \(AIC = 2k - 2\log(\mathcal{L}) = 2k + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]\)

BIC \(BIC = { k \log(n) -2 \log(\mathcal{L}) } = k \log(n) + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]\)

AIC(model)
## [1] 419.16
AIC(model,k=log(n))
## [1] 424.89

Testing Linear Assumptions

library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_brk <- lm(dist ~ bs(speed,degree=1,knots=(c(4,15,25))), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
## Warning in predict.lm(model_brk, newdata = data.frame(speed = x)): prediction
## from a rank-deficient fit may be misleading

see \(b\)-splines, \[ y_i=\beta_0+\beta_1 x_i + \beta_2 (x_i-15)_+ + \varepsilon_i \]

Testing Linear Assumptions

plot(cars)
lines(x,y,lwd=2,col="blue")

Testing Linear Assumptions

positive=function(x,s) ifelse(x>s,x-s,0)
model_brk <- lm(dist ~ speed + positive(speed,15), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

Testing Linear Assumptions

plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
abline(coefficients(model_brk)[1],coefficients(model_brk)[2],col="blue",lty=2)

summary(model_brk)$coefficients
##                     Estimate Std. Error  t value  Pr(>|t|)
## (Intercept)          -7.6519   10.62536 -0.72016 0.4749955
## speed                 3.0186    0.86268  3.49916 0.0010324
## positive(speed, 15)   1.7562    1.45513  1.20694 0.2334961

Testing Linear Assumptions

library(strucchange,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(Fstats(dist ~ speed,data=cars,from=7/50))

see Chow Test

Testing Linear Assumptions

\[ W_t=\frac{1}{\widehat{\sigma}\sqrt{n}}\sum_{i=1}^{ \lfloor nt \rfloor} \widehat{\varepsilon}_i \]

cusum <- efp(dist ~ speed, type = "OLS-CUSUM",data=cars)
plot(cusum,ylim=c(-2,2))

Testing Linear Assumptions

plot(cusum, alpha = 0.05, alt.boundary = TRUE,ylim=c(-2,2))

see CUSUM test

Model with Categorical Covariates

model_cut=lm(dist~ cut(speed, breaks=c(0,10,15,20,25)),data=cars)
y=predict(model_cut, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")

Model with Categorical Covariates

library(rpart,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tree=rpart(dist~speed,data=cars)
y=predict(tree,newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")

Smoothing : Polynomial Regression

model_poly=lm(dist~ poly(speed, df=3),data=cars)
y=predict(model_poly, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

Smoothing : Local Regression

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
bw <- dpill(cars$speed,cars$dist) 
lines( locpoly(cars$speed,cars$dist,degree=0, bandwidth=bw), col='red' )
lines( locpoly(cars$speed,cars$dist,degree=1, bandwidth=bw), col='green' )
lines( locpoly(cars$speed,cars$dist,degree=2, bandwidth=bw), col='blue' )

Smoothing : \(k\)-Nearest Neighboors

library(FNN,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
knn=knn.reg(train=cars$speed,y=cars$dist,k=5)
plot(cars)
lines(cars$speed,knn$pred,col="red")

Smoothing : \(b\) Splines

library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_bs <- lm(dist ~ bs(speed), data=cars)
y=predict(model_bs, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

Smoothing Linear Model

Tuning parameter selection, Silverman’s Rule \[ b^\star=0.9 \cdot\frac{ \min\lbrace \sigma,F^{-1}(.75)-F^{-1}(.25)\rbrace}{1.34 \cdot n^{1/5}} \]

bw.nrd0(cars$speed)
## [1] 2.15

See kernel regression

Smoothing Linear Model

Tuning parameter selection, Cross Validation \[ b^\star=\text{argmin}\left\lbrace \sum_{i=1}^n \left(y_i - \widehat{m}_{(i)}(\boldsymbol{x}_i)\right)^2\right\rbrace \]

bw.ucv(cars$speed)
## Warning in bw.ucv(cars$speed): minimum occurred at one end of the range
## [1] 2.7489

Smoothing Linear Model

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
Nadaraya_Watson =with(cars,ksmooth(speed, dist, "normal",bandwidth=2.75))
plot(cars)
abline(model,col="red",lty=2)
lines(Nadaraya_Watson,lwd=2,col="blue")

Smoothing Linear Model

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_loess=loess(dist~ speed, data=cars,degree=1, family="gaussian")
y=predict(model_loess, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

Multiple Regression

Life Expectancy (1), Homicide Rate (2), Illiteracy Rate (3)

chicago=read.table("http://freakonometrics.free.fr/chicago.txt",header=TRUE,sep=";")
model_c = lm(Fire~X_2+X_3,data=chicago)
y=function(x2,x3) predict(model_c,newdata=data.frame(X_2=x2,X_3=x3))
VX2=seq(0,80,length=26); VX3=seq(5,25,length=26)
VY=outer(VX2,VX3,y)
persp(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy",zlab="Fire",theta=20)

Multiple Regression

VX2=seq(0,80,length=251); VX3=seq(5,25,length=251)
VY=outer(VX2,VX3,y)
image(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy")
contour(VX2,VX3,VY,add=TRUE)

Model selection

model_c = lm(Fire~.,data=chicago)
summary(model_c)$r.squared
## [1] 0.44167
summary(model_c)$adj.r.squared
## [1] 0.40272

Model selection

logLik(model_c)
## 'log Lik.' -152.77 (df=5)
AIC(model_c, k=2)               # AIC
## [1] 315.54
AIC(model_c, k=log(nrow(cars))) # BIC
## [1] 325.1

Penalization

\[ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right] + \color{red}{\lambda} \sum_{j=1}^k \beta_j ^2\right\rbrace \] with an explicit solution \(\widehat{\boldsymbol{\beta}}=(\boldsymbol{X}^{\text{T}}\boldsymbol{X}-\color{red}{\lambda} \mathbb{I})^{-1} \boldsymbol{X}^{\text{T}}\boldsymbol{Y}\).

library(MASS,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_ridge <- lm.ridge(Fire ~ ., data=chicago, lambda=1)

see more generally Tikhonov regularization

Optimal Penalization

mse <- NULL
n=100
v <- matrix(c(0,coefficients(model_c)[-1]), nr=n, nc=4, byrow=TRUE)
kl <- c(1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 
        .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.2, 1.4, 1.6, 1.8, 2)
for (k in kl) {
  r <- matrix(NA, nr=n, nc=4)
  for (i in 1:n) {
    boot_c <- chicago[sample(1:nrow(chicago),nrow(chicago),replace=TRUE),]
    r[i,2:4] <- model_ridge <- lm.ridge(Fire ~ ., data=boot_c, lambda=k)$coef
    r[i,1] <- mean(boot_c[,"Fire"])
  }
  mse <- append(mse, apply( (r-v)^2, 2, mean )[2])
}

Optimal Penalization

plot( mse ~ kl, type='l' )  

Variable Selection

step(lm(Fire ~ .,data=chicago))
## Start:  AIC=180.16
## Fire ~ X_1 + X_2 + X_3
## 
##        Df Sum of Sq  RSS AIC
## - X_1   1         1 1832 178
## <none>              1832 180
## - X_2   1       562 2394 191
## - X_3   1       702 2534 193
## 
## Step:  AIC=178.17
## Fire ~ X_2 + X_3
## 
##        Df Sum of Sq  RSS AIC
## <none>              1832 178
## - X_2   1       621 2453 190
## - X_3   1      1004 2836 197
## 
## Call:
## lm(formula = Fire ~ X_2 + X_3, data = chicago)
## 
## Coefficients:
## (Intercept)          X_2          X_3  
##      21.496        0.221       -1.525

Variable Selection

LASSO (least absolute shrinkage and selection operator) \[ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right] + \color{blue}{\lambda} \sum_{j=1}^k \vert\beta_j\vert\right\rbrace \]

library(glmnet,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fit = glmnet(x = as.matrix(chicago[,2:4]), y = chicago[,1], family = "gaussian", alpha = 1)
plot(fit, xvar="lambda", label = TRUE )

Variable Selection

step(model)
## Start:  AIC=275.26
## dist ~ speed
## 
##         Df Sum of Sq   RSS AIC
## <none>               11354 275
## - speed  1     21185 32539 326
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Coefficients:
## (Intercept)        speed  
##      -17.58         3.93

Dimension Reduction

model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:3],data=chicago)
predict(model_c)[1:5]
##       1       2       3       4       5 
##  9.9755 16.9851 14.2446 13.4120 18.3372
predict(model_acp)[1:5]
##       1       2       3       4       5 
##  9.9755 16.9851 14.2446 13.4120 18.3372

Dimension Reduction

model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:2],data=chicago)
predict(model_c)[1:5]
##       1       2       3       4       5 
##  9.9755 16.9851 14.2446 13.4120 18.3372
predict(model_acp)[1:5]
##      1      2      3      4      5 
## 10.010 17.021 14.300 13.439 18.394

Logistic Regression

\[ \mathbb{E}[Y\vert \boldsymbol{X}=\boldsymbol{x}]= p(\boldsymbol{x})=\frac{\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}{1+\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}=H(\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}) \]

logistic regression

Logistic Regression

myocarde <-read.table("http://freakonometrics.free.fr/myocarde.csv", head=TRUE,sep=";")
model_bernoulli = glm(PRONO~., data=myocarde, family=binomial)
coefficients(summary(model_bernoulli))
##                Estimate Std. Error   z value Pr(>|z|)
## (Intercept) -10.1876417 11.8952271 -0.856448  0.39175
## FRCAR         0.1381781  0.1141122  1.210897  0.22593
## INCAR        -5.8624290  6.7487855 -0.868664  0.38503
## INSYS         0.7170840  0.5614447  1.277212  0.20153
## PRDIA        -0.0736682  0.2916360 -0.252603  0.80057
## PAPUL         0.0167565  0.3419423  0.049004  0.96092
## PVENT        -0.1067760  0.1105501 -0.965861  0.33411
## REPUL        -0.0031542  0.0048909 -0.644907  0.51899

Logistic Regression

X=cbind(1,as.matrix(myocarde[,1:7]))
Y=myocarde$PRONO=="Survival"
beta=as.matrix(lm(Y~0+X)$coefficients,ncol=1)
for(s in 1:9){
   pi=exp(X%*%beta[,s])/(1+exp(X%*%beta[,s]))
   gradient=t(X)%*%(Y-pi)
   omega=matrix(0,nrow(X),nrow(X));diag(omega)=(pi*(1-pi))
   Hessian=-t(X)%*%omega%*%X
   beta=cbind(beta,beta[,s]-solve(Hessian)%*%gradient)}

Logistic Regression

 beta[,1]
##      X XFRCAR XINCAR XINSYS XPRDIA XPAPUL XPVENT XREPUL 
##      0      0      0      0      0      0      0      0
 beta[,9]
##           X      XFRCAR      XINCAR      XINSYS      XPRDIA      XPAPUL 
## -9.2027e+00  4.7726e-14 -5.7554e-13  4.5408e-14  4.9960e-16  2.6645e-15 
##      XPVENT      XREPUL 
##  1.3878e-15 -2.0817e-17
# -solve(Hessian)
sqrt(-diag(solve(Hessian)))
##                 FRCAR      INCAR      INSYS      PRDIA      PAPUL      PVENT 
## 180.183682   1.877488  93.030245   6.520984   6.346283   5.471882   3.059004 
##      REPUL 
##   0.054721

Logistic Regression

GLM <- glm(PRONO ~ PVENT + REPUL, data = myocarde, family = binomial)
pred_GLM = function(p,r){
 return(predict(GLM, newdata =
 data.frame(PVENT=p,REPUL=r), type="response"))}
vp=seq(min(myocarde$PVENT),max(myocarde$PVENT),length=251)
vr=seq(min(myocarde$REPUL),max(myocarde$REPUL),length=251)
vd=outer(vp,vr,pred_GLM)
library(RColorBrewer,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
CL=colorRampPalette(brewer.pal(name="RdYlBu", 11))(100)

Logistic Regression

image(vp,vr,vd,col=CL,xlab="PVENT",ylab="REPUL")

Logistic Regression

probabilities <- predict(GLM, myocarde, type="response")
predictions <- levels(myocarde$PRONO)[(probabilities>.5)+1] 
table(predictions, myocarde$PRONO)
##            
## predictions DECES SURVIE
##      DECES     24      4
##      SURVIE     5     38

Residuals and GLM

Pearson’s residuals \[ \widehat{\varepsilon}_i=\frac{y_i - \widehat{\pi}_i}{ \sqrt{\widehat{\pi}_i\cdot[1-\widehat{\pi}_i]}} \] Deviance residuals \[ \widehat{\varepsilon}_i=\pm \left( y_i \log[\widehat{\pi}_i]+ (1-y_i) \log[1-\widehat{\pi}_i]\right) \]

E1=residuals(model_bernoulli,type="pearson")
E2=residuals(model_bernoulli,type="deviance")

Residuals and GLM

plot(1:nrow(myocarde),E1)

Multinomial Regression

Binomial case, \(Y\in\lbrace 0,1\rbrace\),

\(\mathbb{P}[Y=1\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]\)

\(\mathbb{P}[Y=0\vert \boldsymbol{X}=\boldsymbol{x}]\propto1\)

Multinomial case, \(Y\in\lbrace A,B,C\rbrace\),

\(\mathbb{P}[Y=\color{red}{A}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{red}{\boldsymbol{\beta}_A}]\)

\(\mathbb{P}[Y=\color{blue}{B}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{blue}{\boldsymbol{\beta}_B}]\)

\(\mathbb{P}[Y=\color{green}{C}\vert \boldsymbol{X}=\boldsymbol{x}]\propto1\)

Multinomial Regression

library(VGAM,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_multi = vglm(PRONO~., data=myocarde, family="multinomial")
model_multi
## Call:
## vglm(formula = PRONO ~ ., family = "multinomial", data = myocarde)
## 
## Coefficients:
## (Intercept)       FRCAR       INCAR       INSYS       PRDIA       PAPUL 
##  10.1876411  -0.1381781   5.8624289  -0.7170840   0.0736682  -0.0167565 
##       PVENT       REPUL 
##   0.1067760   0.0031542 
## 
## Degrees of Freedom: 71 Total; 63 Residual
## Residual deviance: 41.043 
## Log-likelihood: -20.522 
## 
## This is a multinomial logit model with 2 levels

Generalized Linear Models

Poisson and Gamma distributions in the exponential family

base=data.frame(x=c(1,2,3,4,5),y=c(1,2,4,2,6))
regNId <- glm(y~x,family=gaussian(link="identity"),data=base)
regNlog <- glm(y~x,family=gaussian(link="log"),data=base)
regPId <- glm(y~x,family=poisson(link="identity"),data=base)
regPlog <- glm(y~x,family=poisson(link="log"),data=base)
regGId <- glm(y~x,family=Gamma(link="identity"),data=base)
regGlog <- glm(y~x,family=Gamma(link="log"),data=base)

Generalized Linear Models

visuel=function(regression,titre=""){
 plot(base$x,base$y,pch=19,cex=1.5,main=titre,xlab="",ylab="")
 abs <- seq(0,7,by=.1)
 yp <- predict(regression,newdata=data.frame(x=abs),se.fit = TRUE, type="response")
 polygon(c(abs,rev(abs)),c(yp$fit+2*yp$se.fit,rev(yp$fit-2*yp$se.fit)), col="light green",border=NA)
 points(base$x,base$y,pch=19,cex=1.5)
 lines(abs,yp$fit,lwd=2,col="red")
 lines(abs,yp$fit+2*yp$se.fit,lty=2)
 lines(abs,yp$fit-2*yp$se.fit,lty=2)}

Generalized Linear Models

visuel(regNId,"Gaussienne, lien identité")

Generalized Linear Models

visuel(regPId,"Poisson, lien identité")

Generalized Linear Models

visuel(regGId,"Gamma, lien identité")

Generalized Linear Models

visuel(regNlog,"Gaussienne, lien logarithmique")

Generalized Linear Models

visuel(regPlog,"Poisson, lien logarithme")

Generalized Linear Models

visuel(regGlog,"Gamma, lien logarithme")