Actuary Data Science

Introduction & More Advanced R

Arthur Charpentier - @freakonometrics - Univ. Rennes 1 & UQAM
2015

R & Packages

  getOption("defaultPackages")

## [1] "datasets"  "utils"     "grDevices" "graphics"  "stats"     "methods"

(.packages(all.available=TRUE))

##   [1] "abind"              "acepack"            "actuar"            
##   [4] "ada"                "adabag"             "ade4"              
##   [7] "ade4TkGUI"          "adegraphics"        "AER"               
##  [10] "akima"              "amap"               "aod"               
##  [13] "ape"                "aplpack"            "arules"            
##  [16] "arulesViz"          "ash"                "assertthat"        
##  [19] "base64enc"          "bayesm"             "bdsmatrix"         
##  [22] "BH"                 "biclust"            "biglm"             
##  [25] "bigmemory"          "bigmemory.sri"      "BiocInstaller"     
##  [28] "bit"                "bit64"              "bitops"            
##  [31] "boot"               "Boruta"             "BradleyTerry2"     
##  [34] "brew"               "brglm"              "C50"               
##  [37] "car"                "caret"              "CASdatasets"       
##  [40] "caTools"            "cba"                "ChainLadder"       
##  [43] "chron"              "circlize"           "CircStats"         
##  [46] "classInt"           "clv"                "cmprsk"            
##  [49] "cobs"               "coda"               "coin"              
##  [52] "colorspace"         "combinat"           "compositions"      
##  [55] "corpcor"            "corrplot"           "cplm"              
##  [58] "crayon"             "cshapes"            "cubature"          
##  [61] "Cubist"             "curl"               "data.table"        
##  [64] "DBI"                "deldir"             "demography"        
##  [67] "dendroextras"       "DEoptimR"           "descr"             
##  [70] "devtools"           "dichromat"          "digest"            
##  [73] "diptest"            "dismo"              "distr"             
##  [76] "distrEx"            "doBy"               "doParallel"        
##  [79] "dplyr"              "DT"                 "dtw"               
##  [82] "dynlm"              "e1071"              "earth"             
##  [85] "effects"            "ellipse"            "energy"            
##  [88] "entropy"            "evaluate"           "evir"              
##  [91] "explor"             "expm"               "FactoMineR"        
##  [94] "fastICA"            "fastmatch"          "fBasics"           
##  [97] "fda"                "fGarch"             "fields"            
## [100] "flashClust"         "flexclust"          "flexmix"           
## [103] "FNN"                "foreach"            "forecast"          
## [106] "formatR"            "Formula"            "fpc"               
## [109] "fracdiff"           "FSelector"          "fts"               
## [112] "ftsa"               "gam"                "gamair"            
## [115] "gamlss"             "gamlss.data"        "gamlss.dist"       
## [118] "gbm"                "gclus"              "gdata"             
## [121] "geosphere"          "ggdendro"           "ggmap"             
## [124] "ggplot2"            "ggthemes"           "git2r"             
## [127] "glmnet"             "GlobalOptions"      "gnm"               
## [130] "goftest"            "googleVis"          "gplots"            
## [133] "gridBase"           "gridExtra"          "gss"               
## [136] "gstat"              "gtable"             "gtools"            
## [139] "gWidgets"           "hdrcde"             "heplots"           
## [142] "hexbin"             "highr"              "HistData"          
## [145] "hmeasure"           "Hmisc"              "htmltools"         
## [148] "htmlwidgets"        "httpuv"             "httr"              
## [151] "igraph"             "ineq"               "intervals"         
## [154] "ipred"              "IRdisplay"          "irlba"             
## [157] "iterators"          "its"                "jpeg"              
## [160] "jsonlite"           "kernlab"            "klaR"              
## [163] "knitr"              "ks"                 "labeling"          
## [166] "LaF"                "Lahman"             "lars"              
## [169] "latticeExtra"       "lava"               "lazyeval"          
## [172] "leaflet"            "leaps"              "LearnBayes"        
## [175] "lme4"               "lmtest"             "loa"               
## [178] "locfit"             "logcondens"         "logicFS"           
## [181] "LogicReg"           "LSMonteCarlo"       "lubridate"         
## [184] "magrittr"           "manipulate"         "mapdata"           
## [187] "mapproj"            "maps"               "maptools"          
## [190] "markdown"           "MASS"               "MatrixModels"      
## [193] "maxLik"             "mboost"             "mcbiopi"           
## [196] "mclust"             "MCMCpack"           "mda"               
## [199] "memoise"            "mice"               "microbenchmark"    
## [202] "mime"               "minqa"              "misc3d"            
## [205] "miscTools"          "missMDA"            "mitools"           
## [208] "mix"                "mlbench"            "mlogit"            
## [211] "mnormt"             "modeltools"         "mosaic"            
## [214] "mosaicData"         "multcomp"           "multicool"         
## [217] "munsell"            "mvtnorm"            "ncdf"              
## [220] "ncdf4"              "nloptr"             "NLP"               
## [223] "NMF"                "nnet"               "nnls"              
## [226] "nortest"            "np"                 "numDeriv"          
## [229] "nycflights13"       "odfWeave"           "OpenStreetMap"     
## [232] "pamr"               "party"              "partykit"          
## [235] "pbapply"            "pbkrtest"           "PBSmapping"        
## [238] "pcaPP"              "pglm"               "pixmap"            
## [241] "pkgmaker"           "plm"                "plot3D"            
## [244] "plotly"             "plotmo"             "plotrix"           
## [247] "pls"                "plyr"               "pmml"              
## [250] "png"                "polyclip"           "prabclus"          
## [253] "pROC"               "prodlim"            "profileModel"      
## [256] "proto"              "proxy"              "pryr"              
## [259] "pscl"               "psych"              "psychotools"       
## [262] "psychotree"         "quadprog"           "quantreg"          
## [265] "questionr"          "qvcalc"             "R6"                
## [268] "rainbow"            "randomForest"       "randtoolbox"       
## [271] "ranger"             "RANN"               "raster"            
## [274] "rasterVis"          "rCharts"            "RColorBrewer"      
## [277] "Rcpp"               "RcppArmadillo"      "RcppEigen"         
## [280] "RCurl"              "readr"              "readxl"            
## [283] "registry"           "relaimpo"           "relimp"            
## [286] "repr"               "reshape"            "reshape2"          
## [289] "rgdal"              "rgeos"              "rgl"               
## [292] "RgoogleMaps"        "rJava"              "rjson"             
## [295] "RJSONIO"            "rleafmap"           "rmarkdown"         
## [298] "RMySQL"             "rngtools"           "rngWELL"           
## [301] "robustbase"         "ROCR"               "RODBC"             
## [304] "roxygen2"           "rpart"              "rpart.plot"        
## [307] "RPostgreSQL"        "rPython"            "RSQLite"           
## [310] "rstudio"            "rstudioapi"         "RUnit"             
## [313] "rversions"          "rvest"              "RWeka"             
## [316] "RWekajars"          "rworldmap"          "rworldxtra"        
## [319] "sampleSelection"    "sandwich"           "scales"            
## [322] "scatterD3"          "scatterplot3d"      "sde"               
## [325] "selectr"            "seriation"          "sfsmisc"           
## [328] "shape"              "shiny"              "shinyapps"         
## [331] "slam"               "slidify"            "slidifyLibraries"  
## [334] "SnowballC"          "sp"                 "spacetime"         
## [337] "spam"               "SparseM"            "spatstat"          
## [340] "spdep"              "splancs"            "splitstackshape"   
## [343] "spls"               "stabledist"         "stabs"             
## [346] "startupmsg"         "statmod"            "stringi"           
## [349] "stringr"            "strucchange"        "subselect"         
## [352] "superpc"            "survey"             "survival"          
## [355] "SweaveListingUtils" "systemfit"          "tau"               
## [358] "TeachingDemos"      "tensor"             "tensorA"           
## [361] "testthat"           "textcat"            "TH.data"           
## [364] "tidyr"              "timeDate"           "timeSeries"        
## [367] "tis"                "tm"                 "trifield"          
## [370] "trimcluster"        "tripack"            "truncreg"          
## [373] "tseries"            "TSP"                "tweedie"           
## [376] "urca"               "uuid"               "varSelRF"          
## [379] "vcd"                "vcdExtra"           "verification"      
## [382] "VGAM"               "VGAMdata"           "VIM"               
## [385] "viridis"            "waveslim"           "weights"           
## [388] "whisker"            "wordcloud"          "wskm"              
## [391] "xlsx"               "xlsxjars"           "XML"               
## [394] "xml2"               "xtable"             "xts"               
## [397] "yaml"               "YieldCurve"         "zoo"               
## [400] "base"               "class"              "cluster"           
## [403] "codetools"          "compiler"           "datasets"          
## [406] "foreign"            "graphics"           "grDevices"         
## [409] "grid"               "KernSmooth"         "lattice"           
## [412] "Matrix"             "methods"            "mgcv"              
## [415] "nlme"               "parallel"           "spatial"           
## [418] "splines"            "stats"              "stats4"            
## [421] "tcltk"              "tools"              "utils"

R & Packages

?rq

## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'

require(quantreg)

## Loading required package: quantreg
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

library(quantreg, verbose=FALSE)
?rq
rq

## function (formula, tau = 0.5, data, subset, weights, na.action, 
##     method = "br", model = TRUE, contrasts = NULL, ...) 
## {
##     call <- match.call()
##     mf <- match.call(expand.dots = FALSE)
##     m <- match(c("formula", "data", "subset", "weights", "na.action"), 
##         names(mf), 0)
##     mf <- mf[c(1, m)]
##     mf$drop.unused.levels <- TRUE
##     mf[[1]] <- as.name("model.frame")
##     mf <- eval.parent(mf)
##     if (method == "model.frame") 
##         return(mf)
##     mt <- attr(mf, "terms")
##     weights <- as.vector(model.weights(mf))
##     Y <- model.response(mf)
##     X <- model.matrix(mt, mf, contrasts)
##     eps <- .Machine$double.eps^(2/3)
##     Rho <- function(u, tau) u * (tau - (u < 0))
##     if (length(tau) > 1) {
##         if (any(tau < 0) || any(tau > 1)) 
##             stop("invalid tau:  taus should be >= 0 and <= 1")
##         if (any(tau == 0)) 
##             tau[tau == 0] <- eps
##         if (any(tau == 1)) 
##             tau[tau == 1] <- 1 - eps
##         coef <- matrix(0, ncol(X), length(tau))
##         rho <- rep(0, length(tau))
##         fitted <- resid <- matrix(0, nrow(X), length(tau))
##         for (i in 1:length(tau)) {
##             z <- {
##                 if (length(weights)) 
##                   rq.wfit(X, Y, tau = tau[i], weights, method, 
##                     ...)
##                 else rq.fit(X, Y, tau = tau[i], method, ...)
##             }
##             coef[, i] <- z$coefficients
##             resid[, i] <- z$residuals
##             rho[i] <- sum(Rho(z$residuals, tau[i]))
##             fitted[, i] <- Y - z$residuals
##         }
##         taulabs <- paste("tau=", format(round(tau, 3)))
##         dimnames(coef) <- list(dimnames(X)[[2]], taulabs)
##         dimnames(resid) <- list(dimnames(X)[[1]], taulabs)
##         fit <- z
##         fit$coefficients <- coef
##         fit$residuals <- resid
##         fit$fitted.values <- fitted
##         if (method == "lasso") 
##             class(fit) <- c("lassorqs", "rqs")
##         else if (method == "scad") 
##             class(fit) <- c("scadrqs", "rqs")
##         else class(fit) <- "rqs"
##     }
##     else {
##         process <- (tau < 0 || tau > 1)
##         if (tau == 0) 
##             tau <- eps
##         if (tau == 1) 
##             tau <- 1 - eps
##         fit <- {
##             if (length(weights)) 
##                 rq.wfit(X, Y, tau = tau, weights, method, ...)
##             else rq.fit(X, Y, tau = tau, method, ...)
##         }
##         if (process) 
##             rho <- list(x = fit$sol[1, ], y = fit$sol[3, ])
##         else {
##             dimnames(fit$residuals) <- list(dimnames(X)[[1]], 
##                 NULL)
##             rho <- sum(Rho(fit$residuals, tau))
##         }
##         if (method == "lasso") 
##             class(fit) <- c("lassorq", "rq")
##         else if (method == "scad") 
##             class(fit) <- c("scadrq", "rq")
##         else class(fit) <- ifelse(process, "rq.process", "rq")
##     }
##     fit$na.action <- attr(mf, "na.action")
##     fit$formula <- formula
##     fit$terms <- mt
##     fit$xlevels <- .getXlevels(mt, mf)
##     fit$call <- call
##     fit$tau <- tau
##     fit$weights <- weights
##     fit$residuals <- drop(fit$residuals)
##     fit$rho <- rho
##     fit$method <- method
##     fit$fitted.values <- drop(fit$fitted.values)
##     attr(fit, "na.message") <- attr(m, "na.message")
##     if (model) 
##         fit$model <- mf
##     fit
## }
## <environment: namespace:quantreg>

detach(package:quantreg, unload=TRUE)
?rq

## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'

Getting Help with R

apropos("norm")

##  [1] "dlnorm"          "dnorm"           "norm"           
##  [4] "norm"            "normalizePath"   "plnorm"         
##  [7] "pnorm"           "qlnorm"          "qnorm"          
## [10] "qqnorm"          "rlnorm"          "rnorm"          
## [13] ".__T__norm:base"

S3 vs S4 Classes

person3 <- function(name,age,weight,height){
  characteristics<-list(name=name,age=age,weight=weight,height=height)
  class(characteristics)<-"person3"
  return(characteristics)}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)

S3 vs S4 Classes

JohnDoe3

## $name
## [1] "John"
## 
## $age
## [1] 28
## 
## $weight
## [1] 76
## 
## $height
## [1] 182
## 
## attr(,"class")
## [1] "person3"

JohnDoe3$age

## [1] 28

S3 vs S4 Classes

BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)}
BMI3(JohnDoe3)

## [1] 22.94409

S3 vs S4 Classes

reg3 <- lm(dist~speed,data=cars)
reg3$coefficients

## (Intercept)       speed 
##  -17.579095    3.932409

coef(reg3)

## (Intercept)       speed 
##  -17.579095    3.932409

S3 vs S4 Classes

summary(reg3)

## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

S3 vs S4 Classes

setClass("person4", representation(name="character",
age="numeric", weight="numeric", height="numeric"), where = .GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)

S3 vs S4 Classes

JohnDoe4

## An object of class "person4"
## Slot "name":
## [1] "John"
## 
## Slot "age":
## [1] 28
## 
## Slot "weight":
## [1] 76
## 
## Slot "height":
## [1] 182

S3 vs S4 Classes

eval(JohnDoe4,.GlobalEnv)

## An object of class "person4"
## Slot "name":
## [1] "John"
## 
## Slot "age":
## [1] 28
## 
## Slot "weight":
## [1] 76
## 
## Slot "height":
## [1] 182

S3 vs S4 Classes

JohnDoe4@age

## [1] 28

S3 vs S4 Classes

setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")), where = .GlobalEnv)

## [1] "BMI4"

setMethod("BMI4", "person4",
          function(object){return(object@weight*1e4/object@height^2)}, where = .GlobalEnv)

## [1] "BMI4"

BMI4(JohnDoe4)

## [1] 22.94409

S3 vs S4 Classes

library(VGAM)

## Loading required package: stats4
## Loading required package: splines

reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients

## (Intercept)       speed 
##  -17.579095    3.932409

coef(reg4)

## (Intercept)       speed 
##  -17.579095    3.932409

S3 vs S4 Classes

qnorm(0.25,mean=5,sd=2)

## [1] 3.65102

library(distr, verbose=FALSE)

## Loading required package: startupmsg
## Utilities for start-up messages (version 0.9)
## For more information see ?"startupmsg", NEWS("startupmsg")
## 
## Loading required package: sfsmisc
## Loading required package: SweaveListingUtils
## Utilities for Sweave together with TeX listings package (version 0.6.2)
## NOTE: Support for this package will stop soon.
## Package 'knitr' is providing the same functionality in a better way.
## Some functions from package 'base' are intentionally masked ---see SweaveListingMASK().
## Note that global options are controlled by SweaveListingoptions() ---c.f. ?"SweaveListingoptions".
## For more information see ?"SweaveListingUtils", NEWS("SweaveListingUtils")
## There is a vignette to this package; try vignette("ExampleSweaveListingUtils").
## 
## 
## Attaching package: 'SweaveListingUtils'
## 
## The following objects are masked from 'package:base':
## 
##     library, require
## 
## Object oriented implementation of distributions (version 2.5.3)
## Attention: Arithmetics on distribution objects are understood as operations on corresponding random variables (r.v.s); see distrARITH().
## Some functions from package 'stats' are intentionally masked ---see distrMASK().
## Note that global options are controlled by distroptions() ---c.f. ?"distroptions".
## For more information see ?"distr", NEWS("distr"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several extension packages; try vignette("distr").
## 
## 
## Attaching package: 'distr'
## 
## The following object is masked from 'package:VGAM':
## 
##     Max
## 
## The following objects are masked from 'package:stats':
## 
##     df, qqplot, sd

library(distrEx, verbose=FALSE)

## Extensions of package distr (version 2.5)
## Note: Packages "e1071", "moments", "fBasics" should be attached /before/ package "distrEx". See distrExMASK().Note: Extreme value distribution functionality has been moved to
##       package "RobExtremes". See distrExMOVED().
## For more information see ?"distrEx", NEWS("distrEx"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several related packages; try vignette("distr").
## 
## 
## Attaching package: 'distrEx'
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, median, var

X <- Norm(mean=5,sd=2)
q(X)

## Error in quit(save, status, runLast): réponse attendue parmi "yes", "no", "ask" ou "default"

q(X)(0.25)

## Error in quit(save, status, runLast): réponse attendue parmi "yes", "no", "ask" ou "default"

mean(X)

## Warning in mean.default(X): argument is not numeric or logical: returning
## NA

## [1] NA

S3 vs S4 Classes

X1 <- Norm(mean=5,sd=2)
X2 <- Norm(mean=2,sd=1)
S <- X1+X2
q(S)(.5)

## Error in quit(save, status, runLast): réponse attendue parmi "yes", "no", "ask" ou "default"

S3 vs S4 Classes

plot(S)

plot of chunk unnamed-chunk-16

R objects : Numbers

set.seed(1)
U <- runif(20)
U[1:4]

## [1] 0.2655087 0.3721239 0.5728534 0.9082078

options(digits = 3)
U[1:4]

## [1] 0.266 0.372 0.573 0.908

options(digits = 22)
U[1:4]

## [1] 0.2655086631420999765396 0.3721238996367901563644
## [3] 0.5728533633518964052200 0.9082077899947762489319

R objects : Numbers

x <- exp(1)
y <- x
x <- 2
y

## [1] 2.72

class(x)

## [1] "numeric"

R objects : Numbers

0/0

## [1] NaN

1/0

## [1] Inf

.Machine$double.xmax

## [1] 1.8e+308

2e+307<Inf

## [1] TRUE

2e+308<Inf

## [1] FALSE

R objects : Numbers

(3/10-1/10)

## [1] 0.2

(3/10-1/10)==(7/10-5/10)

## [1] FALSE

(3/10-1/10)-(7/10-5/10)

## [1] 2.78e-17

all.equal((3/10-1/10),(7/10-5/10))

## [1] TRUE

sqrt(2)^2 == 2

## [1] FALSE

.Machine$double.eps

## [1] 2.22e-16

.Machine$double.xmin

## [1] 2.23e-308

R objects : Numbers

x <- rnorm(8)
names(x) <- letters[1:8]
x

##       a       b       c       d       e       f       g       h 
##  1.5118  0.3898 -0.6212 -2.2147  1.1249 -0.0449 -0.0162  0.9438

x[2:4]

##      b      c      d 
##  0.390 -0.621 -2.215

x[c("b","c","d")]

##      b      c      d 
##  0.390 -0.621 -2.215

min(x[x>0])

## [1] 0.39

x[-(3:8)]

##    a    b 
## 1.51 0.39

# x[-3:8]
x[c(TRUE,FALSE)]

##       a       c       e       g 
##  1.5118 -0.6212  1.1249 -0.0162

x[c(TRUE,NA,FALSE,TRUE)]

##      a   <NA>      d      e   <NA>      h 
##  1.512     NA -2.215  1.125     NA  0.944

x[2]

##    b 
## 0.39

x[[2]]

## [1] 0.39

R objects : Numbers

for(i in 1:2){
  nom_var <- paste0("x", i)
  assign(nom_var, rpois(5,7))
}
x1

## [1] 9 4 8 6 9

x2

## [1] 8 9 7 7 9

R objects : Numbers

x <- runif(4)
x

## [1] 0.0233 0.4772 0.7323 0.6927

## [1] 0.0233 0.4772 0.7323 0.6927

set.seed(1)
x <- runif(4)
x

## [1] 0.266 0.372 0.573 0.908

R objects : Numbers

library(pryr)
x %<a-% runif(1)
x

## [1] 0.202

## [1] 0.898

rm("x")
x

## Error in eval(expr, envir, enclos): objet 'x' introuvable

R objects : Numbers

(x_num=c(1,6,10))

## [1]  1  6 10

(x_int=c(1L,6L,10L))

## [1]  1  6 10

object.size(x_num)

## 72 bytes

object.size(x_int)

## 56 bytes

R objects : Numbers

typeof(x_num)

## [1] "double"

typeof(x_int)

## [1] "integer"

is.integer(x_num)

## [1] FALSE

is.integer(x_int)

## [1] TRUE

str(x_num)

##  num [1:3] 1 6 10

str(x_int)

##  int [1:3] 1 6 10

c(1,c(2,c(3,c(4,5))))

## [1] 1 2 3 4 5

R objects : Matrices

M<-1:24
dim(M)=c(6,4)
M

##      [,1] [,2] [,3] [,4]
## [1,]    1    7   13   19
## [2,]    2    8   14   20
## [3,]    3    9   15   21
## [4,]    4   10   16   22
## [5,]    5   11   17   23
## [6,]    6   12   18   24

str(M)

##  int [1:6, 1:4] 1 2 3 4 5 6 7 8 9 10 ...

R objects : Matrices

colnames(M)=letters[1:4]
rownames(M)=LETTERS[10:15]
M

##   a  b  c  d
## J 1  7 13 19
## K 2  8 14 20
## L 3  9 15 21
## M 4 10 16 22
## N 5 11 17 23
## O 6 12 18 24

M["K",]

##  a  b  c  d 
##  2  8 14 20

R objects : Matrices

M[c("K","N"),]

##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23

M[c(2,5),]

##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23

M[c("K",5),]

## Error in M[c("K", 5), ]: indice hors limites

R objects : Matrices

On Windows machines

rm(list=ls())
memory.size()

## Warning: 'memory.size()' is Windows-specific

## [1] Inf

memory.limit()

## Warning: 'memory.limit()' is Windows-specific

## [1] Inf

memory.limit(size=8000)

## Warning: 'memory.limit()' is Windows-specific

## [1] Inf

memory.limit(size=4000)

## Warning: 'memory.limit()' is Windows-specific

## [1] Inf

R objects : Matrices

x3 <- 1:1e3; x4 <- 1:1e4; x5 <- 1:1e5; x6 <- 1:1e6
object.size(x3)

## 4040 bytes

object.size(x4)

## 40040 bytes

object.size(x5)

## 400040 bytes

object.size(x6)

## 4000040 bytes

R objects : Matrices

#z1 <- matrix(0,1e+06,3) 
#z2 <- matrix(as.integer(0),1e+06,3) 
#object.size(z1)
#object.size(z2)
#z3 <- matrix(0,1e+07,30)

R objects : Matrices

mydat_r <- matrix(rnorm(3*5e6,0,2),5e6,3)
colnames(mydat_r) <- c("first","second","third")
mydat_rdf <- as.data.frame(mydat_r)
object.size(mydat_rdf)

## 120000896 bytes

R objects : Matrices

library(biglm, verbose=FALSE)

## Loading required package: DBI

z1 <- matrix(as.integer(5), 300, 400)
object.size(z1)

## 480200 bytes

library(bigmemory, verbose=FALSE)

## Loading required package: bigmemory.sri

z2 <- big.matrix(300, 400, type="integer", init=5)
object.size(z2)

## 664 bytes

rm(list=ls())

R objects : Factors

x <- c("A", "A", "B", "B", "C")
model.matrix(~0+x)

##   xA xB xC
## 1  1  0  0
## 2  1  0  0
## 3  0  1  0
## 4  0  1  0
## 5  0  0  1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$x
## [1] "contr.treatment"

R objects : Factors

n <- 10; nn <- 100
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
tapply(x,group,mean)

##    0    1    2    3    4    5    6    7    8    9   10 
## 1.00 1.42 1.69 2.00 2.44 2.40 2.51 2.94 2.95 3.24 3.27

R objects : Factors

xg <- split(x, group)
sapply(xg, length)

##   0   1   2   3   4   5   6   7   8   9  10 
##  57  98  98 102  97 100 101 101  91 111  44

sapply(xg, mean)

##    0    1    2    3    4    5    6    7    8    9   10 
## 1.00 1.42 1.69 2.00 2.44 2.40 2.51 2.94 2.95 3.24 3.27

R objects : Factors

library(pbapply)
n <- 10; nn <- 1e6
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
xg <- split(x, group)
pbsapply(xg, mean)

##     0     1     2     3     4     5     6     7     8     9    10 
## 0.998 1.414 1.732 2.000 2.236 2.449 2.647 2.828 3.000 3.163 3.317

R objects : Factors

x <- factor(c("b","a","b"))
levels(x)

## [1] "a" "b"

x[3] = "c"

## Warning in `[<-.factor`(`*tmp*`, 3, value = "c"): invalid factor level, NA
## generated

## [1] b    a    <NA>
## Levels: a b

R objects : Factors

factor(x,levels=c("b","a"))

## [1] b    a    <NA>
## Levels: b a

x[1]

## [1] b
## Levels: a b

x[1,drop=TRUE]

## [1] b
## Levels: b

R objects : Factors

U <- runif(20)
cut(U,breaks=2)

##  [1] (0.145,0.544] (0.145,0.544] (0.145,0.544] (0.544,0.944] (0.544,0.944]
##  [6] (0.145,0.544] (0.145,0.544] (0.145,0.544] (0.544,0.944] (0.145,0.544]
## [11] (0.145,0.544] (0.145,0.544] (0.145,0.544] (0.544,0.944] (0.544,0.944]
## [16] (0.145,0.544] (0.145,0.544] (0.544,0.944] (0.145,0.544] (0.544,0.944]
## Levels: (0.145,0.544] (0.544,0.944]

cut(U,breaks=2,labels=c("small","large"))

##  [1] small small small large large small small small large small small
## [12] small small large large small small large small large
## Levels: small large

R objects : Factors

cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large"))

##  [1] medium small  medium large  medium medium medium small  large  medium
## [11] medium small  medium medium large  medium medium medium medium large 
## Levels: small medium large

table(cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large")))

## 
##  small medium  large 
##      3     13      4

R objects : Characters

"Be carefull of 'quotes'"
'Be carefull of "quotes"'

cities <- c("New York, NY", "Los Angeles, CA", "Boston, MA")
substr(cities, nchar(cities)-1, nchar(cities))

## [1] "NY" "CA" "MA"

unlist(strsplit(cities, ", "))[seq(2,6,by=2)]

## [1] "NY" "CA" "MA"

R objects : Characters

library(stringr)
tweet = "Emerging #climate change, environment and sustainability concerns for #actuaries Apr 9 #Toronto. Register TODAY http://bit.ly/CIAClimateForum"
hash="#[a-zA-Z]{1,}"
str_extract(tweet,hash)

## [1] "#climate"

str_extract_all(tweet,hash)

## [[1]]
## [1] "#climate"   "#actuaries" "#Toronto"

R objects : Characters

str_locate_all(tweet,hash)

## [[1]]
##      start end
## [1,]    10  17
## [2,]    71  80
## [3,]    88  95

urls="http://([\\da-z\\.-]+)\\.([a-z\\.]{2,6})/[a-zA-Z0-9]{1,}"
str_extract_all(tweet,urls)

## [[1]]
## [1] "http://bit.ly/CIAClimateForum"

R objects : Characters

email="^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$"
grep(pattern = email, x = "charpentier.arthur@uqam.ca")

## [1] 1

grepl(pattern = email, x = "charpentier.arthur@uqam.ca")

## [1] TRUE

str_detect(pattern = email, string=c("charpentier.arthur@uqam.ca","@freakonometrics"))

## [1]  TRUE FALSE

R objects : Characters

ex_sentence = "This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult"
library(tm)

## Loading required package: NLP
## 
## Attaching package: 'tm'
## 
## The following object is masked from 'package:pryr':
## 
##     inspect

ex_corpus <- Corpus(VectorSource(ex_sentence))
ex_corpus

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1

inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 101

R objects : Characters

grep("hard", ex_sentence)

## integer(0)

grep("difficult", ex_sentence)

## [1] 1

R objects : Characters

library(stringr)
word(ex_sentence,4)

## [1] "simple"

word(ex_sentence,1:20)

##  [1] "This"      "is"        "1"         "simple"    "sentence,"
##  [6] "just"      "to"        "play"      "with,"     "then"     
## [11] "we'll"     "play"      "with"      "4,"        "and"      
## [16] "that"      "will"      "be"        "more"      "difficult"

ex_words <- strsplit(ex_sentence, split=" ")[[1]]
ex_words

##  [1] "This"      "is"        "1"         "simple"    "sentence,"
##  [6] "just"      "to"        "play"      "with,"     "then"     
## [11] "we'll"     "play"      "with"      "4,"        "and"      
## [16] "that"      "will"      "be"        "more"      "difficult"

R objects : Characters

grep(pattern="w",ex_words,value=TRUE)

## [1] "with," "we'll" "with"  "will"

str_count(ex_words,"w")

##  [1] 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0

grep(pattern="[ai]",ex_words,value=TRUE)

##  [1] "This"      "is"        "simple"    "play"      "with,"    
##  [6] "play"      "with"      "and"       "that"      "will"     
## [11] "difficult"

grep(pattern="[[:punct:]]",ex_words,value=TRUE)

## [1] "sentence," "with,"     "we'll"     "4,"

R objects : Characters

require(wordcloud)

## Loading required package: wordcloud
## Loading required package: RColorBrewer

wordcloud(ex_corpus)

plot of chunk unnamed-chunk-52

R objects : Characters

cols<-colorRampPalette(c("lightgrey","blue"))(40)
wordcloud(words = ex_corpus, max.words = 40, random.order=FALSE, scale = c(5, 0.5), colors=cols)

plot of chunk unnamed-chunk-53

R objects : Characters

tdm <- TermDocumentMatrix(ex_corpus)
tdm

## <<TermDocumentMatrix (terms: 14, documents: 1)>>
## Non-/sparse entries: 14/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)

inspect(tdm)

## <<TermDocumentMatrix (terms: 14, documents: 1)>>
## Non-/sparse entries: 14/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##            Docs
## Terms       1
##   and       1
##   difficult 1
##   just      1
##   more      1
##   play      2
##   sentence, 1
##   simple    1
##   that      1
##   then      1
##   this      1
##   we'll     1
##   will      1
##   with      1
##   with,     1

inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 101

R objects : Characters

fix_contractions <- function(doc) {
  doc <- gsub("won't", "will not", doc)
  doc <- gsub("n't", " not", doc)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'s", "", doc) 
  return(doc)
}
ex_corpus <- tm_map(ex_corpus, fix_contractions)
inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] This is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult

R objects : Characters

ex_corpus <- tm_map(ex_corpus, tolower)
inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult

R objects : Characters

ex_corpus <- tm_map(ex_corpus, removeNumbers)
inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence, just to play with, then we will play with , and that will be more difficult

R objects : Characters

ex_corpus <- tm_map(ex_corpus,gsub, pattern= "[[:punct:]]", replacement = " ")
inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult

R objects : Characters

stopwords("en")[sample(1:length(stopwords("en")),size=10)]

##  [1] "of"     "you're" "too"    "i"      "this"   "how's"  "having"
##  [8] "from"   "you've" "she"

inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult

library(SnowballC)
ex_corpus <- tm_map(ex_corpus, stemDocument)
inspect(ex_corpus)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult

R objects : Characters

wordcloud(ex_corpus[[1]])

plot of chunk unnamed-chunk-60

R objects : Characters

wordcloud(words = ex_corpus[[1]], max.words = 40, random.order=FALSE, scale = c(5, 0.5),colors=cols)

plot of chunk unnamed-chunk-61

R objects : Dates

some.dates <- as.Date(c("16/10/12","19/11/12"),format="%d/%m/%y")
diff(some.dates)

## Time difference of 34 days

sequence.date <- seq(from=some.dates[1],to=some.dates[2],by=7)
format(sequence.date,"%b")

## [1] "oct." "oct." "oct." "nov." "nov."

months(sequence.date)

## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"

weekdays(some.dates)

## [1] "mardi" "lundi"

R objects : Dates

substr(as.POSIXct(some.dates), 1, 4)

## [1] "2012" "2012"

strftime(some.dates,"%Y")

## [1] "2012" "2012"

R objects : Dates

Sys.setlocale("LC_TIME", "fr_FR")

## Warning in Sys.setlocale("LC_TIME", "fr_FR"): la requête OS pour spécifier
## la localisation à "fr_FR" n'a pas pu être honorée

## [1] ""

months(sequence.date)

## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"

R objects : Lists

x <- list(1:5,c(1,2,3,4,5),a="test",
b=c(TRUE,FALSE),rpois(5,8))
x

## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[5]]
## [1]  6 13  9  4  7

str(x)

## List of 5
##  $  : int [1:5] 1 2 3 4 5
##  $  : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 6 13 9 4 7

R objects : Lists

x <- list(
  list(1:5,c(1,2,3,4,5)),a="test",
  b=c(TRUE,FALSE),rpois(5,8))
x

## [[1]]
## [[1]][[1]]
## [1] 1 2 3 4 5
## 
## [[1]][[2]]
## [1] 1 2 3 4 5
## 
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[4]]
## [1]  6  6  3 12 10

names(x)

## [1] ""  "a" "b" ""

str(x)

## List of 4
##  $  :List of 2
##   ..$ : int [1:5] 1 2 3 4 5
##   ..$ : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 6 6 3 12 10

is.list(x)

## [1] TRUE

is.recursive(x)

## [1] TRUE

R objects : Lists

x[[3]]

## [1]  TRUE FALSE

x[["b"]]

## [1]  TRUE FALSE

x[[1]]

## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5

x[[1]][[1]]

## [1] 1 2 3 4 5

R objects : Lists

f <- function(x) { return(x*(1-x)) }
optimize(f, interval=c(0, 1), maximum=TRUE)

## $maximum
## [1] 0.5
## 
## $objective
## [1] 0.25

R objects : Lists

list(abc=1)$a

## [1] 1

list(abc=1,a=2)$a

## [1] 2

list(bac=1)$a

## NULL

list(abc=1,b=2)$a

## [1] 1

R objects : Functions

factorial

## function (x) 
## gamma(x + 1)
## <bytecode: 0x7c25940>
## <environment: namespace:base>

gamma

## function (x)  .Primitive("gamma")

?gamma

If/Else Condition

set.seed(1)
u <- runif(1)
if(u>.5) {("greater than 50%")} else {("smaller than 50%")}

## [1] "smaller than 50%"

ifelse(u>.5,("greater than 50%"),("smaller than 50%"))

## [1] "smaller than 50%"

## [1] 0.266

If/Else Condition

u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}

## Warning in if (u > 0.5) {: la condition a une longueur > 1 et seul le
## premier élément est utilisé

## [1] "smaller than 50%"

ifelse(u>.5,("greater than 50%"),("smaller than 50%"))

## [1] "smaller than 50%" "greater than 50%" "greater than 50%"

## [1] 0.372 0.573 0.908

For Loops

v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))

sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))

##    user  system elapsed 
##  14.312   0.073  14.381

For Loops

sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])

sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))

##    user  system elapsed 
##   0.254   0.000   0.254

Apply

system.time(Vectorize(sqrt)(v_x))

##    user  system elapsed 
##   0.002   0.000   0.001

sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))

##    user  system elapsed 
##   0.035   0.000   0.036

Parallel Apply

library(parallel)
(allcores <- detectCores(all.tests=TRUE))

## [1] 4

sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))

##    user  system elapsed 
##   0.099   0.134   0.094

Functions

f<-function(x) dlnorm(x)
integrate(f,0,Inf)

## 1 with absolute error < 2.5e-07

integrate(f,0,1e5)

## 1.82e-05 with absolute error < 3.6e-05

integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value

## [1] 1

Compound Poisson

rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)

\[ S=\sum_{i=1}^N Y_i \ \ \ \left\lbrace \begin{array}{l} N\sim\ {P}(\lambda) \\ Y_i\sim\ {E}(\mu)\\ \end{array} \right. \]

Compound Poisson

rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for(i in 1:n){
    N <- rN(1)
    if(N>0){V[i] <- sum(rX(N))}
  }
  return(V)}

rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for (i in 1:n) V[i] <- sum(rX(rN(1)))
  return(V)}

Compound Poisson

rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  return(as.numeric(xtabs(X ~ I)))}

rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  V <- tapply(X,I,sum)
  V[is.na(V)] <- 0
  return(as.numeric(V))}

rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(rN(n), function(x) sum(rX(x))))}

Compound Poisson

rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(Vectorize(rX)(rN(n)),sum))}
rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(unlist(lapply(lapply(t(rN(n)),rX),sum)))}

n <- 100
library(microbenchmark)
options(digits=1)
microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n))

## Unit: microseconds
##      expr min   lq mean median   uq  max neval cld
##  rcpd1(n) 770  878 1145   1131 1244 7005   100  b 
##  rcpd2(n) 992 1194 1493   1416 1817 2321   100   c
##  rcpd3(n) 508  578  770    672  968 1877   100 a  
##  rcpd4(n) 497  550  665    606  770 1251   100 a  
##  rcpd5(n) 494  576  718    676  840 1294   100 a  
##  rcpd6(n) 412  477  653    580  698 6506   100 a

Vectorization

f <- function(x,m=0,s=1){
  H<-function(t) 1-pnorm(t,m,s)
  integral<-integrate(H,lower=x,upper=Inf)$value
  res<-H(x)/integral
  return(res)
}

f(0:1)

## Warning in if (is.finite(lower)) {: la condition a une longueur > 1 et seul
## le premier élément est utilisé

## [1] 1.3 0.4

Vectorization

y <- rep(NA,2)
x <- 0:1
for(i in 1:2) y[i] <- f(x[i])
y

## [1] 1 2

sapply(x,"f")

## [1] 1 2

Vectorize(f)(x)

## [1] 1 2

Vectorization

binorm <- function(x1,x2,r=0){
  exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2))
}
u <- seq(-2,2)
binorm(u,u)

## [1] 0.003 0.059 0.159 0.059 0.003

outer(u,u,binorm)

##       [,1] [,2] [,3] [,4]  [,5]
## [1,] 0.003 0.01 0.02 0.01 0.003
## [2,] 0.013 0.06 0.10 0.06 0.013
## [3,] 0.022 0.10 0.16 0.10 0.022
## [4,] 0.013 0.06 0.10 0.06 0.013
## [5,] 0.003 0.01 0.02 0.01 0.003

(uv<-expand.grid(u,u))

##    Var1 Var2
## 1    -2   -2
## 2    -1   -2
## 3     0   -2
## 4     1   -2
## 5     2   -2
## 6    -2   -1
## 7    -1   -1
## 8     0   -1
## 9     1   -1
## 10    2   -1
## 11   -2    0
## 12   -1    0
## 13    0    0
## 14    1    0
## 15    2    0
## 16   -2    1
## 17   -1    1
## 18    0    1
## 19    1    1
## 20    2    1
## 21   -2    2
## 22   -1    2
## 23    0    2
## 24    1    2
## 25    2    2

matrix(binorm(uv$Var1,uv$Var2),5,5)

##       [,1] [,2] [,3] [,4]  [,5]
## [1,] 0.003 0.01 0.02 0.01 0.003
## [2,] 0.013 0.06 0.10 0.06 0.013
## [3,] 0.022 0.10 0.16 0.10 0.022
## [4,] 0.013 0.06 0.10 0.06 0.013
## [5,] 0.003 0.01 0.02 0.01 0.003

Vectorization

u <- seq(-2,2,length=31)
persp(u,u,outer(u,u,binorm))

plot of chunk unnamed-chunk-89

Computational Efficiency

library(microbenchmark)
n <- 1000
A<-matrix(seq(1,n^2),n,n)
B<-matrix(seq(1,n^2),n,n)
C<-1:n
microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1)

## Unit: nanoseconds
##             expr   min    lq  mean median    uq   max neval cld
##  (A %*% B) %*% C 1e+09 1e+09 1e+09  1e+09 1e+09 2e+09   100   b
##  A %*% (B %*% C) 1e+07 1e+07 1e+07  1e+07 1e+07 2e+07   100  a 
##     replications 2e+00 5e+01 2e+02  2e+02 3e+02 1e+03   100  a

Computational Efficiency

m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
group <- rep(c("A","B"), each = n / 2)

system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat)

##    user  system elapsed 
##       1       0       1

system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat)

##    user  system elapsed 
##     0.3     0.0     0.3

Computational Efficiency

t.test_1 <- function(x, grp){
  t.test(x[ group == "A"], x[ group == "B"])$stat
}
system.time(t1 <- apply(X, 1, t.test_1, grp = grp))

##    user  system elapsed 
##   0.239   0.004   0.244

Computational Efficiency

t.test_2 <- function(x, grp) {
  t_stat <- function(x) {
    m <- mean(x)
    n <- length(x)
    var <- sum((x - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(x[grp == "A"])
  g2 <- t_stat(x[grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time(apply(X, 1, t.test_2, grp = group))

##    user  system elapsed 
##    0.06    0.00    0.06

Computational Efficiency

t.test_3 <- function(X, grp){
  t_stat <- function(X) {
    m <- rowMeans(X)
    n <- ncol(X)
    var <- rowSums((X - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(X[, grp == "A"])
  g2 <- t_stat(X[, grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time( t.test_3(X, group))

##    user  system elapsed 
##   0.002   0.000   0.002

Computational Efficiency

"%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s
100 %pm% 10

## [1]  84 116

'second<-' <- function(x, value) {
  x[2] <- value
  return(x)
}
x <- 1:10
second(x) <- 5
x

##  [1]  1  5  3  4  5  6  7  8  9 10

Memory Issues

library(pryr)
x <- 1:10
address(x)

## [1] "0x25dc7d28"

second(x) <- 6
address(x)

## [1] "0x25dc7458"

Memory Issues

ls(globalenv())

## character(0)

find("x")

## character(0)

find("pi")

## [1] "package:base"

environment(sd)

## <environment: 0x48c45f0>

e <- new.env()
e$d <- 1
e$f <- 1:5
e$g <- 1:5
ls(e)

## [1] "d" "f" "g"

str(e)

## <environment: 0x62d3be8>

identical(globalenv(),e)

## [1] FALSE

search()

##  [1] ".GlobalEnv"                 "package:microbenchmark"    
##  [3] "package:SnowballC"          "package:wordcloud"         
##  [5] "package:RColorBrewer"       "package:tm"                
##  [7] "package:NLP"                "package:stringr"           
##  [9] "package:pbapply"            "package:bigmemory"         
## [11] "package:bigmemory.sri"      "package:biglm"             
## [13] "package:DBI"                "package:pryr"              
## [15] "package:distrEx"            "package:distr"             
## [17] "package:SweaveListingUtils" "package:sfsmisc"           
## [19] "package:startupmsg"         "package:VGAM"              
## [21] "package:splines"            "package:stats4"            
## [23] "package:SparseM"            "package:stats"             
## [25] "package:graphics"           "package:grDevices"         
## [27] "package:utils"              "package:datasets"          
## [29] "package:methods"            "Autoloads"                 
## [31] "package:base"

Memory Issues

f1 <- function(x1){
  f2 <- function(x2){
    f3 <- function(x3){
      x1+x2+x3
    }
    f3(3)
  } 
  f2(2)
}
f1(1)

## [1] 6

Memory Issues

f <- function(x) log(x)
f("x")

## Error in log(x): argument non numérique pour une fonction mathématique

try(f("x"))
inherits(try(f("x")), "try-error")

## [1] TRUE

inherits(try(f("x"), silent=TRUE), "try-error")

## [1] TRUE

a=0
try(a<-f("x"), silent=TRUE)
a

## [1] 0

try(a<-f(x), silent=TRUE)
a

##  [1] 0.00 1.79 1.10 1.39 1.61 1.79 1.95 2.08 2.20 2.30

Memory Issues

x=1:10
g=function(f) f(x)
g(mean)

## [1] 5.5

Memory Issues

fibonacci <- function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
}
fibonacci(20)

## [1] 10946

system.time(fibonacci(30))

##    user  system elapsed 
##   3.163   0.016   3.195

Memory Issues

library(memoise)
fibonacci <- memoise(function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
})
system.time(fibonacci(30))

##    user  system elapsed 
##   0.009   0.000   0.009

Functions

f <- function(x) x^2
f

## function(x) x^2
## <environment: 0x1931880>

formals(f)

## $x

body(f)

## x^2

Functions

set.seed(1)
x<-rexp(6)
sum(x)

## [1] 5.55

.Primitive("sum")(x)

## [1] 5.55

Functions

library(Rcpp)
cppFunction('double sum_C(NumericVector x) {
            int n = x.size();
            double total = 0;
            for(int i = 0; i < n; ++i) {
            total += x[i];
            }
            return total;
            }')
sum_C(x)

## [1] 5.55

Functions

library(rPython)

## Loading required package: RJSONIO

sum_P=function(z) python.call("sum",z)
sum_P(x)

## [1] 5.55

Functions

library(microbenchmark)
x <- runif(1e6)
microbenchmark(
  sum(x),
  .Primitive("sum")(x),
  sum_C(x))

## Unit: milliseconds
##                  expr  min   lq mean median   uq  max neval cld
##                sum(x) 1.29 1.52 1.92   1.65 2.18 5.71   100  ab
##  .Primitive("sum")(x) 1.24 1.54 2.23   1.78 2.47 8.42   100   b
##              sum_C(x) 1.21 1.37 1.78   1.48 2.05 6.28   100  a

Functions

x<-10
f=function() x<-5
f

## function() x<-5
## <environment: 0x1931880>

f()
x

## [1] 10

f=function(){
  x<-5
  return(x)
}
f()

## [1] 5

## [1] 10

Functions

f=function(){
  x<<-5
  return(x)
}
f()

## [1] 5

## [1] 5

f=function(){
  z<-5
  return(z)
}
f()

## [1] 5

## Error in eval(expr, envir, enclos): objet 'z' introuvable

Functions

f=function(x){
  z<-x+5
  return(z)
}
f(2)

## [1] 7

## Error in eval(expr, envir, enclos): objet 'z' introuvable

Functions

x=function(y) y/2
x

## function(y) y/2
## <environment: 0x1931880>

x <- 5
x(x)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "x"

Functions

x=function(y) y/2
y=function(){
  x <- 10
  x(x)
}
y()

## [1] 5

Functions

names_list <- function(...){
  names(list(...))
}
names_list(a=5,b=7)

## [1] "a" "b"

Functions

power <- function(exponent) {
  function(x) {
    x ^ exponent
  }
}
square <- power(2)
square(4)

## [1] 16

cube <- power(3)
cube(4)

## [1] 64

cube

## function(x) {
##     x ^ exponent
##   }
## <environment: 0x64f40e0>

Functions

total <- 20
pb <- txtProgressBar(min = 0, max = total, style = 3)

## 
  |                                                                       
  |                                                                 |   0%

for(i in 1:(10*total)){
  Sys.sleep(0.01)
  if(i %%10 == 0) setTxtProgressBar(pb, i%/%10)
}

## 
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |=================================================================| 100%

close(pb)

R objects : Data Frames

df <- data.frame(x=1:3,y=letters[1:3])
str(df)

## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: Factor w/ 3 levels "a","b","c": 1 2 3

typeof(df)

## [1] "list"

class(df)

## [1] "data.frame"

R objects : Data Frames

cbind(df,z=9:7)

##   x y z
## 1 1 a 9
## 2 2 b 8
## 3 3 c 7

df$z<-5:3
df

##   x y z
## 1 1 a 5
## 2 2 b 4
## 3 3 c 3

R objects : Data Frames

df[1]

##   x
## 1 1
## 2 2
## 3 3

df[,1,drop=FALSE]

##   x
## 1 1
## 2 2
## 3 3

df[,1,drop=TRUE]

## [1] 1 2 3

df[[1]]

## [1] 1 2 3

df$x

## [1] 1 2 3

df[,"x"]

## [1] 1 2 3

df[["x"]]

## [1] 1 2 3

R objects : Data Frames

set.seed(1)
df[sample(nrow(df)),]

##   x y z
## 1 1 a 5
## 3 3 c 3
## 2 2 b 4

set.seed(1)
df[sample(nrow(df),nrow(df)*2,replace=TRUE),]

##     x y z
## 1   1 a 5
## 2   2 b 4
## 2.1 2 b 4
## 3   3 c 3
## 1.1 1 a 5
## 3.1 3 c 3

subset(df, x>1)

##   x y z
## 2 2 b 4
## 3 3 c 3

R objects : Data Frames

set.seed(123)
df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50,
replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE)))
tail(df)

##          Y X1 X2
## 45  1.2080  D  3
## 46 -1.1231  A  2
## 47 -0.4029  A  2
## 48 -0.4667  A  2
## 49  0.7800  A  1
## 50 -0.0834  C  2

R objects : Data Frames

reg <- lm(Y~X1+X2,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1B X1C X1D X22 X23
## 45           1   0   0   1   0   1
## 46           1   0   0   0   1   0
## 47           1   0   0   0   1   0
## 48           1   0   0   0   1   0
## 49           1   0   0   0   0   0
## 50           1   0   1   0   1   0

reg <- lm(Y~(X1=="A")+X2,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1 == "A"TRUE X22 X23
## 45           1             0   0   1
## 46           1             1   1   0
## 47           1             1   1   0
## 48           1             1   1   0
## 49           1             1   0   0
## 50           1             0   1   0

R objects : Data Frames

reg <- lm(Y~X1*X2,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1B X1C X1D X22 X23 X1B:X22 X1C:X22 X1D:X22 X1B:X23 X1C:X23
## 45           1   0   0   1   0   1       0       0       0       0       0
## 46           1   0   0   0   1   0       0       0       0       0       0
## 47           1   0   0   0   1   0       0       0       0       0       0
## 48           1   0   0   0   1   0       0       0       0       0       0
## 49           1   0   0   0   0   0       0       0       0       0       0
## 50           1   0   1   0   1   0       0       1       0       0       0
##    X1D:X23
## 45       1
## 46       0
## 47       0
## 48       0
## 49       0
## 50       0

R objects : Data Frames

reg <- lm(Y~X1+X2%in%X1,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1B X1C X1D X1A:X22 X1B:X22 X1C:X22 X1D:X22 X1A:X23 X1B:X23
## 45           1   0   0   1       0       0       0       0       0       0
## 46           1   0   0   0       1       0       0       0       0       0
## 47           1   0   0   0       1       0       0       0       0       0
## 48           1   0   0   0       1       0       0       0       0       0
## 49           1   0   0   0       0       0       0       0       0       0
## 50           1   0   1   0       0       0       1       0       0       0
##    X1C:X23 X1D:X23
## 45       0       1
## 46       0       0
## 47       0       0
## 48       0       0
## 49       0       0
## 50       0       0

R objects : Data Frames

download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData")
load("superheroes.RData")
superheroes

##       name alignment gender         publisher
## 1  Magneto       bad   male            Marvel
## 2    Storm      good female            Marvel
## 3 Mystique       bad female            Marvel
## 4   Batman      good   male                DC
## 5    Joker       bad   male                DC
## 6 Catwoman       bad female                DC
## 7  Hellboy      good   male Dark Horse Comics

publishers

##   publisher yr_founded
## 1        DC       1934
## 2    Marvel       1939
## 3     Image       1992

R objects : Data Frames

library(dplyr, verbose=FALSE)
inner_join(superheroes, publishers)

## Error in inner_join(superheroes, publishers): objet 'superheroes' introuvable

R objects : Data Frames

inner_join(publishers, superheroes)

## Joining by: "publisher"

## Warning in inner_join_impl(x, y, by$x, by$y): joining factors with
## different levels, coercing to character vector

##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female

R objects : Data Frames

merge(superheroes, publishers, all = TRUE)

##           publisher     name alignment gender yr_founded
## 1 Dark Horse Comics  Hellboy      good   male         NA
## 2                DC   Batman      good   male       1934
## 3                DC    Joker       bad   male       1934
## 4                DC Catwoman       bad female       1934
## 5            Marvel  Magneto       bad   male       1939
## 6            Marvel    Storm      good female       1939
## 7            Marvel Mystique       bad female       1939
## 8             Image     <NA>      <NA>   <NA>       1992

R objects : Data Frames

semi_join(superheroes, publishers)

## Joining by: "publisher"

## Warning in semi_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

##       name alignment gender publisher
## 1   Batman      good   male        DC
## 2    Joker       bad   male        DC
## 3 Catwoman       bad female        DC
## 4  Magneto       bad   male    Marvel
## 5    Storm      good female    Marvel
## 6 Mystique       bad female    Marvel

R objects : Data Frames

semi_join(publishers, superheroes)

## Joining by: "publisher"

## Warning in semi_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

##   publisher yr_founded
## 1    Marvel       1939
## 2        DC       1934

R objects : Data Frames

left_join(superheroes, publishers)

## Joining by: "publisher"

## Warning in left_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

##       name alignment gender         publisher yr_founded
## 1  Magneto       bad   male            Marvel       1939
## 2    Storm      good female            Marvel       1939
## 3 Mystique       bad female            Marvel       1939
## 4   Batman      good   male                DC       1934
## 5    Joker       bad   male                DC       1934
## 6 Catwoman       bad female                DC       1934
## 7  Hellboy      good   male Dark Horse Comics         NA

R objects : Data Frames

left_join(publishers, superheroes)

## Joining by: "publisher"

## Warning in left_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female
## 7     Image       1992     <NA>      <NA>   <NA>

R objects : Data Frames

anti_join(superheroes, publishers)

## Joining by: "publisher"

## Warning in anti_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

##      name alignment gender         publisher
## 1 Hellboy      good   male Dark Horse Comics

R objects : Data Frames

anti_join(publishers, superheroes)

## Joining by: "publisher"

## Warning in anti_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

##   publisher yr_founded
## 1     Image       1992

R objects : Data Frames

download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt","gapminderDataFiveYear.txt")
gdf <- read.delim("gapminderDataFiveYear.txt")
head(gdf,4)

##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia    28.8       779
## 2 Afghanistan 1957  9240934      Asia    30.3       821
## 3 Afghanistan 1962 10267083      Asia    32.0       853
## 4 Afghanistan 1967 11537966      Asia    34.0       836

str(gdf)

## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

R objects : Data Frames

library(dplyr)
gtbl <- tbl_df(gdf)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, last
## 
## The following object is masked from 'package:pryr':
## 
##     address
## 
## The following object is masked from 'package:sfsmisc':
## 
##     last

gdfdt <- data.table(gdf)

[gdf] is a standard data frame
[gtbl] is a local data frame (from [dplyr])
[gdfdt] is a data table (from [data.table])

R objects : Data Frames

subset(gdf, lifeExp < 30)

##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia    28.8       779
## 1293      Rwanda 1992 7290203    Africa    23.6       737

gdf[lifeExp < 30,]

## Error in `[.data.frame`(gdf, lifeExp < 30, ): objet 'lifeExp' introuvable

gdf[gdf$lifeExp < 30,]

##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia    28.8       779
## 1293      Rwanda 1992 7290203    Africa    23.6       737

R objects : Data Frames

filter(gtbl, lifeExp < 30)

## Source: local data frame [2 x 6]
## 
##       country  year     pop continent lifeExp gdpPercap
##        (fctr) (int)   (dbl)    (fctr)   (dbl)     (dbl)
## 1 Afghanistan  1952 8425333      Asia    28.8       779
## 2      Rwanda  1992 7290203    Africa    23.6       737

subset(gdfdt, lifeExp < 30)

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia    28.8       779
## 2:      Rwanda 1992 7290203    Africa    23.6       737

gdfdt[lifeExp < 30,]

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia    28.8       779
## 2:      Rwanda 1992 7290203    Africa    23.6       737

R objects : Data Frames

gdf[gdf$country == "Italy", c("year", "lifeExp")]

##     year lifeExp
## 769 1952    65.9
## 770 1957    67.8
## 771 1962    69.2
## 772 1967    71.1
## 773 1972    72.2
## 774 1977    73.5
## 775 1982    75.0
## 776 1987    76.4
## 777 1992    77.4
## 778 1997    78.8
## 779 2002    80.2
## 780 2007    80.5

gtbl %>% 
  filter(country == "Italy") %>%
  select(year, lifeExp)

## Source: local data frame [12 x 2]
## 
##     year lifeExp
##    (int)   (dbl)
## 1   1952    65.9
## 2   1957    67.8
## 3   1962    69.2
## 4   1967    71.1
## 5   1972    72.2
## 6   1977    73.5
## 7   1982    75.0
## 8   1987    76.4
## 9   1992    77.4
## 10  1997    78.8
## 11  2002    80.2
## 12  2007    80.5

gdfdt[country == "Italy",cbind(year, lifeExp)]

##       year lifeExp
##  [1,] 1952    65.9
##  [2,] 1957    67.8
##  [3,] 1962    69.2
##  [4,] 1967    71.1
##  [5,] 1972    72.2
##  [6,] 1977    73.5
##  [7,] 1982    75.0
##  [8,] 1987    76.4
##  [9,] 1992    77.4
## [10,] 1997    78.8
## [11,] 2002    80.2
## [12,] 2007    80.5

setkey(gdfdt,country)
gdfdt["Italy",cbind(year, lifeExp)]

##       year lifeExp
##  [1,] 1952    65.9
##  [2,] 1957    67.8
##  [3,] 1962    69.2
##  [4,] 1967    71.1
##  [5,] 1972    72.2
##  [6,] 1977    73.5
##  [7,] 1982    75.0
##  [8,] 1987    76.4
##  [9,] 1992    77.4
## [10,] 1997    78.8
## [11,] 2002    80.2
## [12,] 2007    80.5

R objects : Data Frames

rm(list=ls())
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
df <- read.delim("temp")
tail(df,4)

##       country year      pop continent lifeExp gdpPercap
## 1701 Zimbabwe 1992 10704340    Africa    60.4       693
## 1702 Zimbabwe 1997 11404948    Africa    46.8       792
## 1703 Zimbabwe 2002 11926563    Africa    40.0       672
## 1704 Zimbabwe 2007 12311143    Africa    43.5       470

R objects : Data Frames

str(df)

## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

sum(duplicated(df))

## [1] 0

df$lifeExp[1:10]

##  [1] 28.8 30.3 32.0 34.0 36.1 38.4 39.9 40.8 41.7 41.8

df[4:5,]

##       country year      pop continent lifeExp gdpPercap
## 4 Afghanistan 1967 11537966      Asia    34.0       836
## 5 Afghanistan 1972 13079460      Asia    36.1       740

R objects : Data Frames

library(dplyr)
ldf <- tbl_df(df)
glimpse(ldf)

## Observations: 1,704
## Variables: 6
## $ country   (fctr) Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ year      (int) 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ pop       (dbl) 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ continent (fctr) Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ lifeExp   (dbl) 28.8, 30.3, 32.0, 34.0, 36.1, 38.4, 39.9, 40.8, 41.7...
## $ gdpPercap (dbl) 779, 821, 853, 836, 740, 786, 978, 852, 649, 635, 72...

select(ldf,lifeExp)

## Source: local data frame [1,704 x 1]
## 
##    lifeExp
##      (dbl)
## 1     28.8
## 2     30.3
## 3     32.0
## 4     34.0
## 5     36.1
## 6     38.4
## 7     39.9
## 8     40.8
## 9     41.7
## 10    41.8
## ..     ...

select(ldf,lifeExp)$lifeExp[1:10]

##  [1] 28.8 30.3 32.0 34.0 36.1 38.4 39.9 40.8 41.7 41.8

#library(modify)
#s_select(ldf,"lifeExp")
ldf[4:5,]

## Source: local data frame [2 x 6]
## 
##       country  year      pop continent lifeExp gdpPercap
##        (fctr) (int)    (dbl)    (fctr)   (dbl)     (dbl)
## 1 Afghanistan  1967 11537966      Asia    34.0       836
## 2 Afghanistan  1972 13079460      Asia    36.1       740

slice(ldf,4:5)

## Source: local data frame [2 x 6]
## 
##       country  year      pop continent lifeExp gdpPercap
##        (fctr) (int)    (dbl)    (fctr)   (dbl)     (dbl)
## 1 Afghanistan  1967 11537966      Asia    34.0       836
## 2 Afghanistan  1972 13079460      Asia    36.1       740

filter(ldf,lifeExp<30 | gdpPercap<300)

## Source: local data frame [6 x 6]
## 
##            country  year      pop continent lifeExp gdpPercap
##             (fctr) (int)    (dbl)    (fctr)   (dbl)     (dbl)
## 1      Afghanistan  1952  8425333      Asia    28.8       779
## 2 Congo, Dem. Rep.  2002 55379852    Africa    45.0       241
## 3 Congo, Dem. Rep.  2007 64606759    Africa    46.5       278
## 4    Guinea-Bissau  1952   580653    Africa    32.5       300
## 5          Lesotho  1952   748747    Africa    42.1       299
## 6           Rwanda  1992  7290203    Africa    23.6       737

rename(ldf,population = pop)

## Source: local data frame [1,704 x 6]
## 
##        country  year population continent lifeExp gdpPercap
##         (fctr) (int)      (dbl)    (fctr)   (dbl)     (dbl)
## 1  Afghanistan  1952    8425333      Asia    28.8       779
## 2  Afghanistan  1957    9240934      Asia    30.3       821
## 3  Afghanistan  1962   10267083      Asia    32.0       853
## 4  Afghanistan  1967   11537966      Asia    34.0       836
## 5  Afghanistan  1972   13079460      Asia    36.1       740
## 6  Afghanistan  1977   14880372      Asia    38.4       786
## 7  Afghanistan  1982   12881816      Asia    39.9       978
## 8  Afghanistan  1987   13867957      Asia    40.8       852
## 9  Afghanistan  1992   16317921      Asia    41.7       649
## 10 Afghanistan  1997   22227415      Asia    41.8       635
## ..         ...   ...        ...       ...     ...       ...

R objects : Data Frames

library(data.table)
dt <- data.table(df)
is.data.frame(dt)

## [1] TRUE

?fread
tables()

##      NAME  NROW NCOL MB COLS                                         KEY
## [1,] dt   1,704    6  1 country,year,pop,continent,lifeExp,gdpPercap    
## Total: 1MB

R objects : Data Frames

object.size(df)

## 72184 bytes

object.size(ldf)

## 72304 bytes

object.size(dt)

## 72880 bytes

R objects : Data Frames

df[df$lifeExp<30,]

##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia    28.8       779
## 1293      Rwanda 1992 7290203    Africa    23.6       737

subset(df, lifeExp < 30)

##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia    28.8       779
## 1293      Rwanda 1992 7290203    Africa    23.6       737

filter(ldf, lifeExp < 30)

## Source: local data frame [2 x 6]
## 
##       country  year     pop continent lifeExp gdpPercap
##        (fctr) (int)   (dbl)    (fctr)   (dbl)     (dbl)
## 1 Afghanistan  1952 8425333      Asia    28.8       779
## 2      Rwanda  1992 7290203    Africa    23.6       737

R objects : Data Frames

subset(dt, lifeExp < 30)

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia    28.8       779
## 2:      Rwanda 1992 7290203    Africa    23.6       737

dt[lifeExp<30]

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia    28.8       779
## 2:      Rwanda 1992 7290203    Africa    23.6       737

setkey(dt, country)
dt["Italy"]

##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe    65.9      4931
##  2:   Italy 1957 49182000    Europe    67.8      6249
##  3:   Italy 1962 50843200    Europe    69.2      8244
##  4:   Italy 1967 52667100    Europe    71.1     10022
##  5:   Italy 1972 54365564    Europe    72.2     12269
##  6:   Italy 1977 56059245    Europe    73.5     14256
##  7:   Italy 1982 56535636    Europe    75.0     16537
##  8:   Italy 1987 56729703    Europe    76.4     19207
##  9:   Italy 1992 56840847    Europe    77.4     22014
## 10:   Italy 1997 57479469    Europe    78.8     24675
## 11:   Italy 2002 57926999    Europe    80.2     27968
## 12:   Italy 2007 58147733    Europe    80.5     28570

R objects : Data Frames

dt["Italy",mult="first"]

##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1952 47666000    Europe    65.9      4931

dt["Italy",max(lifeExp)]

## [1] 80.5

max(df$lifeExp[df$country=="Italy"])

## [1] 80.5

R objects : Data Frames

setkey(dt, country, lifeExp) 
dt["Italy"]

##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe    65.9      4931
##  2:   Italy 1957 49182000    Europe    67.8      6249
##  3:   Italy 1962 50843200    Europe    69.2      8244
##  4:   Italy 1967 52667100    Europe    71.1     10022
##  5:   Italy 1972 54365564    Europe    72.2     12269
##  6:   Italy 1977 56059245    Europe    73.5     14256
##  7:   Italy 1982 56535636    Europe    75.0     16537
##  8:   Italy 1987 56729703    Europe    76.4     19207
##  9:   Italy 1992 56840847    Europe    77.4     22014
## 10:   Italy 1997 57479469    Europe    78.8     24675
## 11:   Italy 2002 57926999    Europe    80.2     27968
## 12:   Italy 2007 58147733    Europe    80.5     28570

dt[.("Italy",77.44)]

##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe    77.4     22014

dt[.("Italy",77)]

##    country year pop continent lifeExp gdpPercap
## 1:   Italy   NA  NA        NA      77        NA

dt[.("Italy",77),roll="nearest"]

##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe      77     22014

R objects : Data Frames

df[df$country == "Italy", c("year", "lifeExp")]

##     year lifeExp
## 769 1952    65.9
## 770 1957    67.8
## 771 1962    69.2
## 772 1967    71.1
## 773 1972    72.2
## 774 1977    73.5
## 775 1982    75.0
## 776 1987    76.4
## 777 1992    77.4
## 778 1997    78.8
## 779 2002    80.2
## 780 2007    80.5

ldf %>%
  filter(country == "Italy") %>%
  select(year, lifeExp)

## Source: local data frame [12 x 2]
## 
##     year lifeExp
##    (int)   (dbl)
## 1   1952    65.9
## 2   1957    67.8
## 3   1962    69.2
## 4   1967    71.1
## 5   1972    72.2
## 6   1977    73.5
## 7   1982    75.0
## 8   1987    76.4
## 9   1992    77.4
## 10  1997    78.8
## 11  2002    80.2
## 12  2007    80.5

R objects : Data Frames

small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")]
library(tidyr)

## 
## Attaching package: 'tidyr'
## 
## The following object is masked from 'package:VGAM':
## 
##     fill

str(small_df)

## 'data.frame':    36 obs. of  3 variables:
##  $ country: Factor w/ 142 levels "Afghanistan",..: 45 45 45 45 45 45 45 45 45 45 ...
##  $ year   : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp: num  67.4 68.9 70.5 71.5 72.4 ...

table_df=spread(small_df,"country","lifeExp")
table_df

##    year France Italy Spain
## 1  1952   67.4  65.9  64.9
## 2  1957   68.9  67.8  66.7
## 3  1962   70.5  69.2  69.7
## 4  1967   71.5  71.1  71.4
## 5  1972   72.4  72.2  73.1
## 6  1977   73.8  73.5  74.4
## 7  1982   74.9  75.0  76.3
## 8  1987   76.3  76.4  76.9
## 9  1992   77.5  77.4  77.6
## 10 1997   78.6  78.8  78.8
## 11 2002   79.6  80.2  79.8
## 12 2007   80.7  80.5  80.9

data_df=gather(table_df,"country","lifeExp",2:4)
data_df

##    year country lifeExp
## 1  1952  France    67.4
## 2  1957  France    68.9
## 3  1962  France    70.5
## 4  1967  France    71.5
## 5  1972  France    72.4
## 6  1977  France    73.8
## 7  1982  France    74.9
## 8  1987  France    76.3
## 9  1992  France    77.5
## 10 1997  France    78.6
## 11 2002  France    79.6
## 12 2007  France    80.7
## 13 1952   Italy    65.9
## 14 1957   Italy    67.8
## 15 1962   Italy    69.2
## 16 1967   Italy    71.1
## 17 1972   Italy    72.2
## 18 1977   Italy    73.5
## 19 1982   Italy    75.0
## 20 1987   Italy    76.4
## 21 1992   Italy    77.4
## 22 1997   Italy    78.8
## 23 2002   Italy    80.2
## 24 2007   Italy    80.5
## 25 1952   Spain    64.9
## 26 1957   Spain    66.7
## 27 1962   Spain    69.7
## 28 1967   Spain    71.4
## 29 1972   Spain    73.1
## 30 1977   Spain    74.4
## 31 1982   Spain    76.3
## 32 1987   Spain    76.9
## 33 1992   Spain    77.6
## 34 1997   Spain    78.8
## 35 2002   Spain    79.8
## 36 2007   Spain    80.9

R objects : Data Frames

library(reshape2)
dcast(small_df, formula = country~year,
      value.var="lifeExp")

##   country 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
## 1  France 67.4 68.9 70.5 71.5 72.4 73.8 74.9 76.3 77.5 78.6 79.6 80.7
## 2   Italy 65.9 67.8 69.2 71.1 72.2 73.5 75.0 76.4 77.4 78.8 80.2 80.5
## 3   Spain 64.9 66.7 69.7 71.4 73.1 74.4 76.3 76.9 77.6 78.8 79.8 80.9

table_df= dcast(small_df, formula = year~country,
                value.var="lifeExp")
table_df

##    year France Italy Spain
## 1  1952   67.4  65.9  64.9
## 2  1957   68.9  67.8  66.7
## 3  1962   70.5  69.2  69.7
## 4  1967   71.5  71.1  71.4
## 5  1972   72.4  72.2  73.1
## 6  1977   73.8  73.5  74.4
## 7  1982   74.9  75.0  76.3
## 8  1987   76.3  76.4  76.9
## 9  1992   77.5  77.4  77.6
## 10 1997   78.6  78.8  78.8
## 11 2002   79.6  80.2  79.8
## 12 2007   80.7  80.5  80.9

melt(table_df, id.vars=c("year"),
     value.name = "lifeExp",
     variable.name="country")

##    year country lifeExp
## 1  1952  France    67.4
## 2  1957  France    68.9
## 3  1962  France    70.5
## 4  1967  France    71.5
## 5  1972  France    72.4
## 6  1977  France    73.8
## 7  1982  France    74.9
## 8  1987  France    76.3
## 9  1992  France    77.5
## 10 1997  France    78.6
## 11 2002  France    79.6
## 12 2007  France    80.7
## 13 1952   Italy    65.9
## 14 1957   Italy    67.8
## 15 1962   Italy    69.2
## 16 1967   Italy    71.1
## 17 1972   Italy    72.2
## 18 1977   Italy    73.5
## 19 1982   Italy    75.0
## 20 1987   Italy    76.4
## 21 1992   Italy    77.4
## 22 1997   Italy    78.8
## 23 2002   Italy    80.2
## 24 2007   Italy    80.5
## 25 1952   Spain    64.9
## 26 1957   Spain    66.7
## 27 1962   Spain    69.7
## 28 1967   Spain    71.4
## 29 1972   Spain    73.1
## 30 1977   Spain    74.4
## 31 1982   Spain    76.3
## 32 1987   Spain    76.9
## 33 1992   Spain    77.6
## 34 1997   Spain    78.8
## 35 2002   Spain    79.8
## 36 2007   Spain    80.9

R objects : Data Frames

aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country))

##   Group.1    x
## 1  France 80.7
## 2   Italy 80.5
## 3   Spain 80.9

tapply(small_df$lifeExp,factor(small_df$country),max)

## France  Italy  Spain 
##   80.7   80.5   80.9

small_ldf = filter(ldf,country %in% c("France","Italy","Spain"))
group_by(small_ldf,country) %>% summarize(max(lifeExp))

## Source: local data frame [3 x 2]
## 
##   country max(lifeExp)
##    (fctr)        (dbl)
## 1  France         80.7
## 2   Italy         80.5
## 3   Spain         80.9

filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))

## Source: local data frame [3 x 2]
## 
##   country max(lifeExp)
##    (fctr)        (dbl)
## 1  France         80.7
## 2   Italy         80.5
## 3   Spain         80.9

filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))

## Source: local data frame [3 x 2]
## 
##   country max(lifeExp)
##    (fctr)        (dbl)
## 1  France         80.7
## 2   Italy         80.5
## 3   Spain         80.9

R objects : Data Frames

plot(df$gdpPercap,df$lifeExp)

plot of chunk unnamed-chunk-152

R objects : Data Frames

select(ldf,gdpPercap,lifeExp) %>% plot()

plot of chunk unnamed-chunk-153

R objects : Data Frames

setkey(dt, country)
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]

##    country   V1
## 1:  France 80.7
## 2:   Italy 80.5
## 3:   Spain 80.9

R objects : Data Frames

head(df[with(df,order(-lifeExp)),])

##               country year      pop continent lifeExp gdpPercap
## 804             Japan 2007 1.27e+08      Asia    82.6     31656
## 672  Hong Kong, China 2007 6.98e+06      Asia    82.2     39725
## 803             Japan 2002 1.27e+08      Asia    82.0     28605
## 696           Iceland 2007 3.02e+05    Europe    81.8     36181
## 1488      Switzerland 2007 7.55e+06    Europe    81.7     37506
## 671  Hong Kong, China 2002 6.76e+06      Asia    81.5     30209

head(dt[order(-lifeExp)])

##             country year      pop continent lifeExp gdpPercap
## 1:            Japan 2007 1.27e+08      Asia    82.6     31656
## 2: Hong Kong, China 2007 6.98e+06      Asia    82.2     39725
## 3:            Japan 2002 1.27e+08      Asia    82.0     28605
## 4:          Iceland 2007 3.02e+05    Europe    81.8     36181
## 5:      Switzerland 2007 7.55e+06    Europe    81.7     37506
## 6: Hong Kong, China 2002 6.76e+06      Asia    81.5     30209

head(setorder(dt,-lifeExp))

##             country year      pop continent lifeExp gdpPercap
## 1:            Japan 2007 1.27e+08      Asia    82.6     31656
## 2: Hong Kong, China 2007 6.98e+06      Asia    82.2     39725
## 3:            Japan 2002 1.27e+08      Asia    82.0     28605
## 4:          Iceland 2007 3.02e+05    Europe    81.8     36181
## 5:      Switzerland 2007 7.55e+06    Europe    81.7     37506
## 6: Hong Kong, China 2002 6.76e+06      Asia    81.5     30209

arrange(ldf,-lifeExp)

## Source: local data frame [1,704 x 6]
## 
##             country  year      pop continent lifeExp gdpPercap
##              (fctr) (int)    (dbl)    (fctr)   (dbl)     (dbl)
## 1             Japan  2007 1.27e+08      Asia    82.6     31656
## 2  Hong Kong, China  2007 6.98e+06      Asia    82.2     39725
## 3             Japan  2002 1.27e+08      Asia    82.0     28605
## 4           Iceland  2007 3.02e+05    Europe    81.8     36181
## 5       Switzerland  2007 7.55e+06    Europe    81.7     37506
## 6  Hong Kong, China  2002 6.76e+06      Asia    81.5     30209
## 7         Australia  2007 2.04e+07   Oceania    81.2     34435
## 8             Spain  2007 4.04e+07    Europe    80.9     28821
## 9            Sweden  2007 9.03e+06    Europe    80.9     33860
## 10           Israel  2007 6.43e+06      Asia    80.7     25523
## ..              ...   ...      ...       ...     ...       ...

R objects : Data Frames

df$gdp=df$pop*df$gdpPercap
df = within(df,gdp = pop*gdpPercap)

## Error in eval(expr, envir, enclos): argument manquant, sans valeur associée par défaut

df = within(df,gdp <- pop*gdpPercap)
dt[,gdp:=pop*gdpPercap]

##                country year      pop continent lifeExp gdpPercap      gdp
##    1:            Japan 2007 1.27e+08      Asia    82.6     31656 4.04e+12
##    2: Hong Kong, China 2007 6.98e+06      Asia    82.2     39725 2.77e+11
##    3:            Japan 2002 1.27e+08      Asia    82.0     28605 3.63e+12
##    4:          Iceland 2007 3.02e+05    Europe    81.8     36181 1.09e+10
##    5:      Switzerland 2007 7.55e+06    Europe    81.7     37506 2.83e+11
##   ---                                                                    
## 1700:     Sierra Leone 1952 2.14e+06    Africa    30.3       880 1.89e+09
## 1701:           Angola 1952 4.23e+06    Africa    30.0      3521 1.49e+10
## 1702:           Gambia 1952 2.84e+05    Africa    30.0       485 1.38e+08
## 1703:      Afghanistan 1952 8.43e+06      Asia    28.8       779 6.57e+09
## 1704:           Rwanda 1992 7.29e+06    Africa    23.6       737 5.37e+09

head(dt)

##             country year      pop continent lifeExp gdpPercap      gdp
## 1:            Japan 2007 1.27e+08      Asia    82.6     31656 4.04e+12
## 2: Hong Kong, China 2007 6.98e+06      Asia    82.2     39725 2.77e+11
## 3:            Japan 2002 1.27e+08      Asia    82.0     28605 3.63e+12
## 4:          Iceland 2007 3.02e+05    Europe    81.8     36181 1.09e+10
## 5:      Switzerland 2007 7.55e+06    Europe    81.7     37506 2.83e+11
## 6: Hong Kong, China 2002 6.76e+06      Asia    81.5     30209 2.04e+11

mutate(ldf,gdp=pop*gdpPercap)

## Source: local data frame [1,704 x 7]
## 
##        country  year      pop continent lifeExp gdpPercap      gdp
##         (fctr) (int)    (dbl)    (fctr)   (dbl)     (dbl)    (dbl)
## 1  Afghanistan  1952  8425333      Asia    28.8       779 6.57e+09
## 2  Afghanistan  1957  9240934      Asia    30.3       821 7.59e+09
## 3  Afghanistan  1962 10267083      Asia    32.0       853 8.76e+09
## 4  Afghanistan  1967 11537966      Asia    34.0       836 9.65e+09
## 5  Afghanistan  1972 13079460      Asia    36.1       740 9.68e+09
## 6  Afghanistan  1977 14880372      Asia    38.4       786 1.17e+10
## 7  Afghanistan  1982 12881816      Asia    39.9       978 1.26e+10
## 8  Afghanistan  1987 13867957      Asia    40.8       852 1.18e+10
## 9  Afghanistan  1992 16317921      Asia    41.7       649 1.06e+10
## 10 Afghanistan  1997 22227415      Asia    41.8       635 1.41e+10
## ..         ...   ...      ...       ...     ...       ...      ...

R objects : Data Frames

countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE)
countries = countries[-1,]
head(countries)

##       Country Interational.Direct.Dial..IDD..Code ISO.3166 Car.Code
## 2 Afghanistan                                  93       AF         
## 3     Albania                                 355       AL         
## 4     Algeria                                 213       DZ         
## 5     Andorra                              33 628       AD         
## 6      Angola                                 244       AO         
## 7    Anguilla                               1 809       AI         
##         Capital              Currency ISO.Currency Digraph  ICAO
## 2         Kabul               Afghani          AFA      AF   YA-
## 3        Tirana                   Lek          ALL      AL   ZA-
## 4       Algiers                 Dinar          DZD      DZ   7T-
## 5 And. La Vella          French Franc          FRF      AD   C3-
## 6        Luanda           New Irwanza          AON      AO   D2-
## 7               East Caribbean Dollar          XCD      AI VP-LA
##     Language ISO.639
## 2     Pashto      PS
## 3   Albanian      SQ
## 4     Arabic      AR
## 5    Catalan      CA
## 6 Portuguese      PT
## 7

df2 = countries[,c("Country","Capital")]
names(df2)=c("countries","capital")

R objects : Data Frames

df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country))
names(df1)=c("countries","lifeExp")
dim(df1)

## [1] 142   2

dim(df2)

## [1] 263   2

tail(merge(df1,df2))

##          countries lifeExp    capital
## 122 United Kingdom    79.4     London
## 123        Uruguay    76.4 Montevideo
## 124      Venezuela    73.7    Caracas
## 125        Vietnam    74.2      Hanoi
## 126         Zambia    51.8     Lusaka
## 127       Zimbabwe    62.4     Harare

tail(merge(df1,df2,all.x=TRUE))

##              countries lifeExp capital
## 137          Venezuela    73.7 Caracas
## 138            Vietnam    74.2   Hanoi
## 139 West Bank and Gaza    73.4    <NA>
## 140        Yemen, Rep.    62.7    <NA>
## 141             Zambia    51.8  Lusaka
## 142           Zimbabwe    62.4  Harare

tail(merge(df1,df2,all.y=TRUE))

##                     countries lifeExp  capital
## 258 Wallis and Futuna Islands      NA         
## 259                 West Bank      NA         
## 260                     World      NA         
## 261                     Yemen      NA   Sana'a
## 262                Yugoslavia      NA Belgrade
## 263                     Zaire      NA Kinshasa

R objects : Data Frames

ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) 
names(ldf1)=c("countries","lifeExp")
ldf2 <- tbl_df(df2)
inner_join(ldf1,ldf2)

## Joining by: "countries"

## Warning in inner_join_impl(x, y, by$x, by$y): joining factors with
## different levels, coercing to character vector

## Source: local data frame [127 x 3]
## 
##      countries lifeExp      capital
##          (chr)   (dbl)       (fctr)
## 1  Afghanistan    43.8        Kabul
## 2      Albania    76.4       Tirana
## 3      Algeria    72.3      Algiers
## 4       Angola    42.7       Luanda
## 5    Argentina    75.3 Buenos Aires
## 6    Australia    81.2     Canberra
## 7      Austria    79.8       Vienna
## 8      Bahrain    75.6       Manama
## 9   Bangladesh    64.1        Dacca
## 10     Belgium    79.4     Brussels
## ..         ...     ...          ...

left_join(ldf1,ldf2)

## Joining by: "countries"

## Warning in left_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector

## Source: local data frame [142 x 3]
## 
##      countries lifeExp      capital
##          (chr)   (dbl)       (fctr)
## 1  Afghanistan    43.8        Kabul
## 2      Albania    76.4       Tirana
## 3      Algeria    72.3      Algiers
## 4       Angola    42.7       Luanda
## 5    Argentina    75.3 Buenos Aires
## 6    Australia    81.2     Canberra
## 7      Austria    79.8       Vienna
## 8      Bahrain    75.6       Manama
## 9   Bangladesh    64.1        Dacca
## 10     Belgium    79.4     Brussels
## ..         ...     ...          ...

ldf1 %>% inner_join(ldf2)

## Joining by: "countries"

## Warning in inner_join_impl(x, y, by$x, by$y): joining factors with
## different levels, coercing to character vector

## Source: local data frame [127 x 3]
## 
##      countries lifeExp      capital
##          (chr)   (dbl)       (fctr)
## 1  Afghanistan    43.8        Kabul
## 2      Albania    76.4       Tirana
## 3      Algeria    72.3      Algiers
## 4       Angola    42.7       Luanda
## 5    Argentina    75.3 Buenos Aires
## 6    Australia    81.2     Canberra
## 7      Austria    79.8       Vienna
## 8      Bahrain    75.6       Manama
## 9   Bangladesh    64.1        Dacca
## 10     Belgium    79.4     Brussels
## ..         ...     ...          ...

R objects : Data Frames

dt1 = dt[,max(lifeExp),by="country"]
setnames(dt1,c("countries","lifeExp"))
dt2 <- data.table(df2)
setkey(dt1,"countries")
setkey(dt2,"countries")
dt1[dt2,nomatch=0]

##        countries lifeExp      capital
##   1: Afghanistan    43.8        Kabul
##   2:     Albania    76.4       Tirana
##   3:     Algeria    72.3      Algiers
##   4:      Angola    42.7       Luanda
##   5:   Argentina    75.3 Buenos Aires
##  ---                                 
## 123:     Uruguay    76.4   Montevideo
## 124:   Venezuela    73.7      Caracas
## 125:     Vietnam    74.2        Hanoi
## 126:      Zambia    51.8       Lusaka
## 127:    Zimbabwe    62.4       Harare

dt1[dt2]

##        countries lifeExp       capital
##   1: Afghanistan    43.8         Kabul
##   2:     Albania    76.4        Tirana
##   3:     Algeria    72.3       Algiers
##   4:     Andorra      NA And. La Vella
##   5:      Angola    42.7        Luanda
##  ---                                  
## 259:       Yemen      NA        Sana'a
## 260:  Yugoslavia      NA      Belgrade
## 261:       Zaire      NA      Kinshasa
## 262:      Zambia    51.8        Lusaka
## 263:    Zimbabwe    62.4        Harare

dt2[dt1]

##               countries      capital lifeExp
##   1:        Afghanistan        Kabul    43.8
##   2:            Albania       Tirana    76.4
##   3:            Algeria      Algiers    72.3
##   4:             Angola       Luanda    42.7
##   5:          Argentina Buenos Aires    75.3
##  ---                                        
## 138:            Vietnam        Hanoi    74.2
## 139: West Bank and Gaza           NA    73.4
## 140:        Yemen, Rep.           NA    62.7
## 141:             Zambia       Lusaka    51.8
## 142:           Zimbabwe       Harare    62.4

R objects : Data Frames

library(stringr)
library(lubridate, verbose=FALSE)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, mday, month, quarter, wday, week, yday, year

library(plyr, verbose=FALSE)

## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following object is masked from 'package:lubridate':
## 
##     here
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(data.table, verbose=FALSE)
library(LaF)
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
ssn <- laf_open_csv("temp",sep="\t", 
                    column_types=c("character","integer","numeric","character","numeric","numeric"),
                    column_names=names(df)[1:6])
object.size(ssn)

## 2888 bytes

nrow(ssn)

## NULL

Graphs with R

library(gamair)
data(chicago)
head(chicago)

##   death pm10median pm25median o3median so2median  time tmpd
## 1   130     -7.434         NA    -19.6     1.928 -2556 31.5
## 2   150         NA         NA    -19.0    -0.986 -2556 33.0
## 3   101     -0.827         NA    -20.2    -1.891 -2554 33.0
## 4   135      5.566         NA    -19.7     6.139 -2554 29.0
## 5   126         NA         NA    -19.2     2.278 -2552 32.0
## 6   130      6.566         NA    -17.6     9.859 -2552 40.0

base=data.frame(death=chicago$death,
                temp_F=chicago$tmpd,
                o3=chicago$o3median,
                date=seq(as.Date("1987-01-01"),
                         as.Date("2000-12-31"),by=1))
base$temp_C <- (base$temp_F-32)/1.8
base$year <- substring(base$date,1,4)

Graphs with R

date2season <- function(date){
  m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m"))
  d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d"))
  s <- NA
  if(m %in% c(1,2) | ((m==12)&(d>=21)) |  ((m==3)&(d<21))) s <- "winter"
  if(m %in% c(4,5) | ((m==3)&(d>=21)) |  ((m==6)&(d<21))) s <- "spring"
  if(m %in% c(7,8) | ((m==6)&(d>=21)) |  ((m==9)&(d<21))) s <- "summer"
  if(m %in% c(10,11) | ((m==9)&(d>=21)) |  ((m==12)&(d<21))) s <- "autumn"
  return(s)}
base$season <- sapply(base$date,date2season)
head(base)

##   death temp_F    o3       date temp_C year season
## 1   130   31.5 -19.6 1987-01-01 -0.278 1987 winter
## 2   150   33.0 -19.0 1987-01-02  0.556 1987 winter
## 3   101   33.0 -20.2 1987-01-03  0.556 1987 winter
## 4   135   29.0 -19.7 1987-01-04 -1.667 1987 winter
## 5   126   32.0 -19.2 1987-01-05  0.000 1987 winter
## 6   130   40.0 -17.6 1987-01-06  4.444 1987 winter

Graphs with R

plot(base[,c("date", "temp_C")])

plot of chunk unnamed-chunk-164

Graphs with R

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")))

plot of chunk unnamed-chunk-165

Graphs with R

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     ylim=c(-20,60))

plot of chunk unnamed-chunk-166

Graphs with R

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     col=as.integer(as.factor(base$season)),
     xlim=as.Date(c("1995-01-01","1999-01-01")))

plot of chunk unnamed-chunk-167

Graphs with R

CL <- function(x) rgb(colorRamp(c("blue", "red"))(x)/255)
plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     col=CL(rank(base$temp_C)/(length(base$temp_C)+1)))

plot of chunk unnamed-chunk-168

Graphs with R

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
  xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
  type="l",col="grey",
  xlim=as.Date(c("1995-01-01","1999-01-01")))

plot of chunk unnamed-chunk-169

Graphs with R

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     xlim=as.Date(c("1995-01-01","1999-01-01")))
rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "light yellow")
points(base[,c("date", "temp_C")],col=as.integer(as.factor(base$season)))

plot of chunk unnamed-chunk-170

Graphs with R

boxplot(base$temp_C~as.factor(base$year),main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")))
x <- 1:14
text(x, rep(-25,length(x)), x, col="red")

plot of chunk unnamed-chunk-171

Graphs with R

bp=boxplot(base$temp_C~as.factor(base$year),main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")))
points(x,bp$stats[5,],pch=19,col="purple")
abline(lm(bp$stats[5,]~x),col="purple",lwd=2)

plot of chunk unnamed-chunk-172

Graphs with R

hist(base$temp_C,xlab=expression(paste("Temperature ( ", degree ~ C, " )")),ylab="",probability=TRUE,
     col="yellow",border="white",main="",ylim=c(0,.0375))
u=seq(-30,40,by=.25)
lines(u,dnorm(u,mean(base$temp_C),sd(base$temp_C)),lty=2)
lines(density(base$temp_C),col="red",lwd=2)

plot of chunk unnamed-chunk-173

Graphs with ggplot2

library(ggplot2)
g <- ggplot(base, aes(date, temp_C)) + geom_point()
g

plot of chunk unnamed-chunk-174

ggsave(file="figure.pdf", scale=3)

Graphs with ggplot2

g <- g+labs(title='Temperature')
g <- g+ggtitle('Temperature')
g

plot of chunk unnamed-chunk-175

Graphs with ggplot2

g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red"))

plot of chunk unnamed-chunk-176

Graphs with ggplot2

g<-g+labs(
  x="Date", 
  y=expression(paste("Temperature ( ", degree ~ C, " )")), 
  title="Temperature in Chicago")
g

plot of chunk unnamed-chunk-177

Graphs with ggplot2

g+ theme(
  axis.title.x = element_text(color="blue"),
  axis.title.y = element_text(color="blue")   
)

plot of chunk unnamed-chunk-178

Graphs with ggplot2

g + ylim(c(-20,60))

## Warning: Removed 10 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-179

g <- g + xlim(as.Date(c("1995-01-01","1999-01-01")))
g

## Warning: Removed 3652 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-179

Graphs with ggplot2

base2=data.frame(
  today=base$temp_C[1:5113],
  tomorrow=base$temp_C[2:5114])
ggplot(base2, aes(today, tomorrow))+     geom_point()+ coord_equal()

plot of chunk unnamed-chunk-180

Graphs with ggplot2

g <- ggplot(base, aes(date, temp_C, color=factor(season)))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))+ geom_point()
g+theme(legend.title=element_blank())+
  guides(colour = guide_legend(override.aes = list(size=4)))

## Warning: Removed 3652 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-181

Graphs with ggplot2

g<-ggplot(base, aes(date, temp_C))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))
g+geom_line(color="grey")

## Warning: Removed 3652 rows containing missing values (geom_path).

plot of chunk unnamed-chunk-182

Graphs with ggplot2

g+geom_line(aes(color="temperature"))+
  scale_colour_manual(name='', values=c('temperature'='grey'))

## Warning: Removed 3652 rows containing missing values (geom_path).

plot of chunk unnamed-chunk-183

Graphs with ggplot2

ggplot(base, aes(date, temp_C, color=factor(season)))+geom_point() + 
  scale_color_manual(values=c("dodgerblue4", "darkolivegreen4","darkorchid3", "goldenrod1"))

plot of chunk unnamed-chunk-184

Graphs with ggplot2

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + scale_color_brewer(palette="Set1")

plot of chunk unnamed-chunk-185

Graphs with ggplot2

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient(low="blue", high="red")

plot of chunk unnamed-chunk-186

Graphs with ggplot2

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient2(low="blue", mid="white", high="red")

plot of chunk unnamed-chunk-187

Graphs with ggplot2

ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+
  theme(panel.background = element_rect(fill = 'light yellow'))

plot of chunk unnamed-chunk-188

Graphs with ggplot2

ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+
  facet_wrap(~year, ncol=2, scales="free")

plot of chunk unnamed-chunk-189

Graphs with ggplot2

library(ggthemes, verbose=FALSE)
ggplot(base, aes(date, temp_C, color=factor(season)))+  geom_point()+
  labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+
  theme_excel()+scale_colour_excel()+ theme(legend.title=element_blank())

plot of chunk unnamed-chunk-190

Spatial Models

See XKCD Map Projections.

Spatial Models

library(sp)
library(rworldmap)
data(countriesLow)
crs.WGS84 <- CRS("+proj=longlat +datum=WGS84")
countries.WGS84 <- spTransform(countriesLow, crs.WGS84)
plot(countries.WGS84,col="light green")

plot of chunk unnamed-chunk-191

Spatial Models

crs.laea <- CRS("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +units=m +no_defs") 
countries.laea <- spTransform(countriesLow, crs.laea) 
plot(countries.laea,col="light green")

plot of chunk unnamed-chunk-192

OpenStreetMap

library(OpenStreetMap)
map <- openmap(c(lat= 51.516,   lon= -.141),
               c(lat= 51.511,   lon= -.133))
map <- openproj(map, projection = "+init=epsg:27700") 
plot(map)

plot of chunk unnamed-chunk-193

OpenStreetMap

loc="http://freakonometrics.free.fr/cholera.zip"
download.file(loc,destfile="cholera.zip")
unzip("cholera.zip", exdir="./")
library(maptools)

## Checking rgeos availability: TRUE

deaths<-readShapePoints("Cholera_Deaths")
head(deaths@coords)

##   coords.x1 coords.x2
## 0    529309    181031
## 1    529312    181025
## 2    529314    181020
## 3    529317    181014
## 4    529321    181008
## 5    529337    181006

OpenStreetMap

plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )

plot of chunk unnamed-chunk-195

OpenStreetMap

X <- deaths@coords
library(KernSmooth)

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))

## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range

library(grDevices)
clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20)

OpenStreetMap

plot(map)
image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs)
contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE)

plot of chunk unnamed-chunk-197

Google Map

library(ggmap)
get_london <- get_map(c(-.137,51.513), zoom=17, maptype = "roadmap", source="google")
london <- ggmap(get_london)
london

plot of chunk unnamed-chunk-198

Google Map

df_deaths <- data.frame(X)
library(sp)
library(rgdal)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red")

plot of chunk unnamed-chunk-199

Google Map

london + geom_point(aes(x=coords.x1, y=coords.x2), 
data=data.frame(df_deaths@coords),col="red") +
  geom_density2d(data = data.frame(df_deaths@coords), 
aes(x = coords.x1, y=coords.x2), size = 0.3) + 
  stat_density2d(data = data.frame(df_deaths@coords), 
 aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..),
 size = 0.01, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
 guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE)

plot of chunk unnamed-chunk-200

Google Map & Leaflet

require(leaflet)

## Loading required package: leaflet

deaths <- readShapePoints("Cholera_Deaths")
df_deaths <- data.frame(deaths@coords)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
df=data.frame(df_deaths@coords)
lng <- df$coords.x1
lat <- df$coords.x2

Google Map & Leaflet

m = leaflet()%>% addTiles()  %>% fitBounds(-.141,  51.511, -.133, 51.516)
#m

Google Map & Leaflet

rd=.5
op=.8
clr="blue"
m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
#m

Google Map & Leaflet

X=cbind(lng,lat)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))

## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range

x=kde2d$x1
y=kde2d$x2
z=kde2d$fhat
CL=contourLines(x , y , z)
m = leaflet() %>% addTiles()  %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
# m

Google Map & Leaflet

m = leaflet() %>% addTiles()  %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles()  %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)
# m

Google Map & Leaflet

m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
  addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
  addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")
# m

Web Scrapping & Hurricanes

library(XML)
year <- 2012
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
tabs[1]

## $`NULL`
##     #                    Name          Date Wind Pres Cat  
## 1   1  Tropical Storm ALBERTO     19-23 MAY   50  995   -  
## 2   2    Tropical Storm BERYL 25 MAY- 2 JUN   60  992   -  
## 3   3       Hurricane-1 CHRIS     17-24 JUN   75  974   1  
## 4   4    Tropical Storm DEBBY     23-27 JUN   55  990   -  
## 5   5     Hurricane-2 ERNESTO      1-10 AUG   85  973   2  
## 6   6 Tropical Storm FLORENCE      3- 8 AUG   50 1002   -  
## 7   7   Tropical Storm HELENE      9-19 AUG   40 1004   -  
## 8   8      Hurricane-2 GORDON     15-21 AUG   95  965   2  
## 9   9       Hurricane-1 ISAAC 20 AUG- 1 SEP   70  965   1  
## 10 10    Tropical Storm JOYCE     21-24 AUG   35 1006   -  
## 11 11        Hurricane-2 KIRK 28 AUG- 3 SEP   90  970   2  
## 12 12      Hurricane-1 LESLIE 28 AUG-12 SEP   70  968   1  
## 13 13     Hurricane-3 MICHAEL      2-12 SEP  100  964   3  
## 14 14      Hurricane-1 NADINE 10 SEP- 4 OCT   80  978   1  
## 15 15    Tropical Storm OSCAR      2- 5 OCT   45  994   -  
## 16 16    Tropical Storm PATTY     10-13 OCT   40 1005   -  
## 17 17      Hurricane-1 RAFAEL     12-26 OCT   80  966   1  
## 18 18       Hurricane-3 SANDY     21-31 OCT  100  940   3  
## 19 19     Tropical Storm TONY     21-26 OCT   45 1000   -

storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" "))
storms

##  [1] "Tropical"    "Storm"       "ALBERTO"     "Tropical"    "Storm"      
##  [6] "BERYL"       "Hurricane-1" "CHRIS"       "Tropical"    "Storm"      
## [11] "DEBBY"       "Hurricane-2" "ERNESTO"     "Tropical"    "Storm"      
## [16] "FLORENCE"    "Tropical"    "Storm"       "HELENE"      "Hurricane-2"
## [21] "GORDON"      "Hurricane-1" "ISAAC"       "Tropical"    "Storm"      
## [26] "JOYCE"       "Hurricane-2" "KIRK"        "Hurricane-1" "LESLIE"     
## [31] "Hurricane-3" "MICHAEL"     "Hurricane-1" "NADINE"      "Tropical"   
## [36] "Storm"       "OSCAR"       "Tropical"    "Storm"       "PATTY"      
## [41] "Hurricane-1" "RAFAEL"      "Hurricane-3" "SANDY"       "Tropical"   
## [46] "Storm"       "TONY"

Web Scrapping & Hurricanes

index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx")
nstorms  <- storms[!index]
nstorms

##  [1] "ALBERTO"  "BERYL"    "CHRIS"    "DEBBY"    "ERNESTO"  "FLORENCE"
##  [7] "HELENE"   "GORDON"   "ISAAC"    "JOYCE"    "KIRK"     "LESLIE"  
## [13] "MICHAEL"  "NADINE"   "OSCAR"    "PATTY"    "RAFAEL"   "SANDY"   
## [19] "TONY"

TRACK=NULL

Web Scrapping & Hurricanes

for(i in length(nstorms):1){
  if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN"
  if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA"
  if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN"
  if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL"
  if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA"
  if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN"
  if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE"
  if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD"
  if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH"
  if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"}

Web Scrapping & Hurricanes

for(i in length(nstorms):1){
  loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
  track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
  names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
  track$LAT=as.numeric(as.character(track$LAT))
  track$LON=as.numeric(as.character(track$LON))
  track$WIND=as.numeric(as.character(track$WIND))
  track$PR=as.numeric(as.character(track$PR))
  track$year=year
  track$name=nstorms[i]
  TRACK=rbind(TRACK,track)}
tail(TRACK,10)

##     ADV  LAT   LON         TIME WIND   PR           STAT year    name
## 645  11 30.4 -79.1  05/21/12Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 646  12 30.5 -78.3  05/21/18Z     35 1006 TROPICAL STORM 2012 ALBERTO
## 647  13 30.7 -77.1  05/22/00Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 648  14 31.5 -76.1  05/22/06Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 649  15 32.5 -74.7  05/22/12Z     30 1008            LOW 2012 ALBERTO
## 650  16 33.4 -73.4  05/22/18Z     30 1008            LOW 2012 ALBERTO
## 651  17 34.1 -71.9  05/23/00Z     25 1010            LOW 2012 ALBERTO
## 652  18 34.9 -70.1  05/23/06Z     25 1011            LOW 2012 ALBERTO
## 653  19 35.5 -67.9  05/23/12Z     25 1012            LOW 2012 ALBERTO
## 654  20 35.9 -66.0  05/23/18Z     25 1012            LOW 2012 ALBERTO

Web Scrapping & Hurricanes

loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata"
download.file(loc,destfile="TRACK-ATLANTIC.Rdata")
load("TRACK-ATLANTIC.Rdata")
dim(TOTTRACK)

## [1] 26712     9

Web Scrapping & Hurricanes

library(grDevices)
library(maps)
map("world", col="light yellow", fill=TRUE)
for(n in unique(TOTTRACK$name)){
  lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))}

plot of chunk unnamed-chunk-212

Shapefiles

loc="http://freakonometrics.free.fr/Italy.zip"
download.file(loc,destfile="Italy.zip")
unzip("Italy.zip", exdir="./")
library(rgdal)
ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE)
plot(ita1,col="light green")

plot of chunk unnamed-chunk-213

Shapefiles

ita1@data[,"NAME_1"][1:6]

## [1] Abruzzo        Apulia         Basilicata     Calabria      
## [5] Campania       Emilia-Romagna
## 20 Levels: Abruzzo Apulia Basilicata Calabria Campania ... Veneto

pos<-which(ita1@data[,"NAME_1"]=="Sicily")
Poly_Sicily<-ita1[pos,]
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)

plot of chunk unnamed-chunk-214

Shapefiles

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
abline(v=5:20,col="light blue")
abline(h=35:50,col="light blue")
axis(1)
axis(2)

plot of chunk unnamed-chunk-215

Shapefiles

pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily"))
ita_south <- ita1[pos,]
ita_north <- ita1[-pos,]
plot(ita1)
plot(ita_south,col="red",add=TRUE)
plot(ita_north,col="blue",add=TRUE)

plot of chunk unnamed-chunk-216

Shapefiles

library(xlsx)

## Loading required package: rJava
## Loading required package: xlsxjars

loc="http://freakonometrics.free.fr/Italy_codes.xlsx"
download.file(loc,destfile="Italy_codes.xlsx")
data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE)
names(data_codes)[1]="NAME_1"
ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE)
pos<-which(ita2@data[,"GROUP"] == "SOUTH")

## Error in which(ita2@data[, "GROUP"] == "SOUTH"): tentative d'obtenir le slot "data" d'un objet (classe "data.frame") qui n'est pas un objet S4

Shapefiles

library(rgeos)
ita_s<-gUnionCascaded(ita_south)
ita_n<-gUnionCascaded(ita_north)
plot(ita1)
plot(ita_s,col="red",add=TRUE)
plot(ita_n,col="blue",add=TRUE)

plot of chunk unnamed-chunk-218

Shapefiles

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)

plot of chunk unnamed-chunk-219

gCentroid(Poly_Sicily,byid=TRUE)

## class       : SpatialPoints 
## features    : 1 
## extent      : 14.1, 14.1, 37.6, 37.6  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0

Shapefiles

G <- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)

plot of chunk unnamed-chunk-220

Shapefiles

c1 <- G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords

##      x    y
## 1 11.1 44.0
## 1 14.2 41.7

Shapefiles

plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)

plot of chunk unnamed-chunk-222

Shapefiles

library(maptools)
library(rgdal)
library(classInt)
#paris <- readShapeSpatial("paris-cartelec.shp")

Shapefiles

paris<-readOGR(dsn="./",layer="paris-cartelec",verbose=FALSE)
proj4string(paris) <- CRS("+init=epsg:2154")
paris <- spTransform(paris, CRS("+proj=longlat +ellps=GRS80"))
plot(paris)

plot of chunk unnamed-chunk-224

Shapefiles

length(paris)

## [1] 850

poly_paris <- SpatialPolygons2PolySet(paris)
sub_poly <- poly_paris[poly_paris$PID==100,]
plot(sub_poly[,c("X","Y")])
polygon(sub_poly[,c("X","Y")])

plot of chunk unnamed-chunk-225

Shapefiles

length(paris)

## [1] 850

poly_paris <- SpatialPolygons2PolySet(paris)
sub_poly <- poly_paris[poly_paris$PID==100,]
plot(sub_poly[,c("X","Y")])
polygon(sub_poly[,c("X","Y")])

plot of chunk unnamed-chunk-226

Shapefiles

point <- c(2.33,48.859)
point.in.polygon(point[1],point[2],sub_poly[,"X"],sub_poly[,"Y"])

## [1] 1

point_in_i=function(i,point) point.in.polygon(point[1],point[2],poly_paris[poly_paris$PID==i,"X"],poly_paris[poly_paris$PID==i,"Y"])
where_is_point=function(point) which(Vectorize(function(i) point_in_i(i,point))(1:length(paris))>0)
where_is_point(point)

## [1] 100

Shapefiles

library(RColorBrewer)
plotclr <- brewer.pal(7,"RdYlBu")
vizualize_point <- function(point){
 wp <- where_is_point(point)
 colcode <- rep(plotclr[4],length(paris))
 colcode[wp] <- plotclr[1]
 plot(paris,col=colcode,border="grey")}
 vizualize_point(c(2.4,48.87))

plot of chunk unnamed-chunk-228

Shapefiles

library(ggmap,verbose=FALSE)
library(geosphere)
(ADS <- geocode("4 rue chauveau lagarde paris", source="google"))

##    lon  lat
## 1 2.32 48.9

vizualize_point(ADS)

plot of chunk unnamed-chunk-229