Data Science for Actuaries

Arthur Charpentier, @freakonometrics
2018

R & Packages

getOption("defaultPackages")

## [1] "datasets"  "utils"     "grDevices" "graphics"  "stats"     "methods"

(.packages(all.available=TRUE))

##   [1] "abind"                 "acepack"              
##   [3] "ada"                   "ade4"                 
##   [5] "adegenet"              "adegraphics"          
##   [7] "adephylo"              "ADGofTest"            
##   [9] "AER"                   "alabama"              
##  [11] "amap"                  "animation"            
##  [13] "ape"                   "aplpack"              
##  [15] "arm"                   "arules"               
##  [17] "arulesViz"             "ash"                  
##  [19] "assertthat"            "backports"            
##  [21] "base64enc"             "BB"                   
##  [23] "bcp"                   "bdsmatrix"            
##  [25] "BH"                    "biclust"              
##  [27] "biglm"                 "bindr"                
##  [29] "bindrcpp"              "BiocInstaller"        
##  [31] "bipartite"             "bit"                  
##  [33] "bit64"                 "bitops"               
##  [35] "BLCOP"                 "blob"                 
##  [37] "bold"                  "brew"                 
##  [39] "brglm"                 "broom"                
##  [41] "ca"                    "callr"                
##  [43] "car"                   "carData"              
##  [45] "caret"                 "cartography"          
##  [47] "catdata"               "caTools"              
##  [49] "cba"                   "cccp"                 
##  [51] "cellranger"            "checkmate"            
##  [53] "chron"                 "CircStats"            
##  [55] "classInt"              "cli"                  
##  [57] "clipr"                 "clusterGeneration"    
##  [59] "clv"                   "coda"                 
##  [61] "coin"                  "colorspace"           
##  [63] "combinat"              "commonmark"           
##  [65] "Compositional"         "config"               
##  [67] "contfrac"              "copula"               
##  [69] "corrgram"              "corrplot"             
##  [71] "covr"                  "covRobust"            
##  [73] "crayon"                "crosstalk"            
##  [75] "crul"                  "cubature"             
##  [77] "curl"                  "CVST"                 
##  [79] "cvTools"               "data.table"           
##  [81] "DBI"                   "dbplyr"               
##  [83] "ddalpha"               "debugme"              
##  [85] "deldir"                "dendextend"           
##  [87] "dendroextras"          "DEoptim"              
##  [89] "DEoptimR"              "desc"                 
##  [91] "descr"                 "deSolve"              
##  [93] "devtools"              "dfoptim"              
##  [95] "dichromat"             "digest"               
##  [97] "dimRed"                "diptest"              
##  [99] "distr"                 "distrEx"              
## [101] "DistributionUtils"     "doBy"                 
## [103] "doMC"                  "doParallel"           
## [105] "doRNG"                 "dotCall64"            
## [107] "downloader"            "dplR"                 
## [109] "dplyr"                 "DRR"                  
## [111] "DT"                    "dtplyr"               
## [113] "dtw"                   "dynlm"                
## [115] "e1071"                 "ecodist"              
## [117] "ECOSolveR"             "effects"              
## [119] "ellipse"               "elliptic"             
## [121] "emplik"                "enaR"                 
## [123] "energy"                "EnvStats"             
## [125] "eva"                   "evaluate"             
## [127] "evir"                  "expm"                 
## [129] "FactoMineR"            "fAssets"              
## [131] "fastmatch"             "fBasics"              
## [133] "fCopulae"              "fGarch"               
## [135] "fields"                "fit.models"           
## [137] "flashClust"            "flexclust"            
## [139] "flexmix"               "flows"                
## [141] "fMultivar"             "FNN"                  
## [143] "forcats"               "foreach"              
## [145] "forecast"              "formatR"              
## [147] "Formula"               "fpc"                  
## [149] "fPortfolio"            "fracdiff"             
## [151] "FRAPO"                 "fts"                  
## [153] "gam"                   "gamair"               
## [155] "gamlss"                "gamlss.data"          
## [157] "gamlss.dist"           "GB2"                  
## [159] "gbm"                   "gclus"                
## [161] "gdata"                 "GeneralizedHyperbolic"
## [163] "geometry"              "geoR"                 
## [165] "geosphere"             "GGally"               
## [167] "ggcorrplot"            "ggdendro"             
## [169] "ggmap"                 "ggnet"                
## [171] "ggplot2"               "ggplot2movies"        
## [173] "ggthemes"              "ggvis"                
## [175] "git2r"                 "glmnet"               
## [177] "glue"                  "gmodels"              
## [179] "gnm"                   "goftest"              
## [181] "googleVis"             "gower"                
## [183] "gplots"                "gridBase"             
## [185] "gridExtra"             "gsl"                  
## [187] "gss"                   "gstat"                
## [189] "gsubfn"                "gtable"               
## [191] "gtools"                "gurobi"               
## [193] "gWidgets"              "h2o"                  
## [195] "haven"                 "hdrcde"               
## [197] "hexbin"                "highr"                
## [199] "hmeasure"              "Hmisc"                
## [201] "hms"                   "htmlTable"            
## [203] "htmltools"             "htmlwidgets"          
## [205] "httpcode"              "httpuv"               
## [207] "httr"                  "hypergeo"             
## [209] "Icens"                 "igraph"               
## [211] "igraphdata"            "Imap"                 
## [213] "ineq"                  "inline"               
## [215] "interval"              "intervals"            
## [217] "IntroCompFinR"         "ipred"                
## [219] "irlba"                 "ISLR"                 
## [221] "iterators"             "jpeg"                 
## [223] "jsonlite"              "kernlab"              
## [225] "knitr"                 "ks"                   
## [227] "labeling"              "laeken"               
## [229] "LaF"                   "Lahman"               
## [231] "lasso2"                "latticeExtra"         
## [233] "lava"                  "lazyeval"             
## [235] "leaflet"               "leaflet.extras"       
## [237] "leaps"                 "LearnBayes"           
## [239] "limSolve"              "linprog"              
## [241] "lme4"                  "lmtest"               
## [243] "locfit"                "logcondens"           
## [245] "longmemo"              "lpSolve"              
## [247] "lpSolveAPI"            "lubridate"            
## [249] "magic"                 "mapproj"              
## [251] "maps"                  "maptools"             
## [253] "markdown"              "markovchain"          
## [255] "MASS"                  "matchingR"            
## [257] "matlab"                "Matrix"               
## [259] "MatrixModels"          "matrixStats"          
## [261] "maxLik"                "maxmatching"          
## [263] "mclust"                "memoise"              
## [265] "mice"                  "microbenchmark"       
## [267] "mime"                  "minqa"                
## [269] "misc3d"                "miscTools"            
## [271] "missMDA"               "mix"                  
## [273] "mixture"               "MLEcens"              
## [275] "mlogit"                "mnormt"               
## [277] "ModelMetrics"          "modelr"               
## [279] "modeltools"            "MPV"                  
## [281] "MTS"                   "multcomp"             
## [283] "multicool"             "munsell"              
## [285] "mvnormtest"            "mvoutlier"            
## [287] "mvtnorm"               "nanotime"             
## [289] "natserv"               "nbpMatching"          
## [291] "network"               "networkD3"            
## [293] "nloptr"                "NLP"                  
## [295] "NMF"                   "nortest"              
## [297] "np"                    "numDeriv"             
## [299] "nycflights13"          "odfWeave"             
## [301] "openssl"               "OpenStreetMap"        
## [303] "opera"                 "optextras"            
## [305] "optimx"                "optmatch"             
## [307] "optrees"               "osmar"                
## [309] "packrat"               "pander"               
## [311] "party"                 "pbkrtest"             
## [313] "PBSmapping"            "pcaPP"                
## [315] "PerformanceAnalytics"  "perm"                 
## [317] "permute"               "pglm"                 
## [319] "phylobase"             "pixmap"               
## [321] "pkgconfig"             "pkgmaker"             
## [323] "PKI"                   "plm"                  
## [325] "plogr"                 "plotly"               
## [327] "plotrix"               "pls"                  
## [329] "plyr"                  "pmml"                 
## [331] "png"                   "polyclip"             
## [333] "PortfolioAnalytics"    "prabclus"             
## [335] "praise"                "prettymapr"           
## [337] "prettyunits"           "pROC"                 
## [339] "processx"              "prodlim"              
## [341] "profileModel"          "progress"             
## [343] "proto"                 "proxy"                
## [345] "pscl"                  "pspline"              
## [347] "psych"                 "purrr"                
## [349] "qap"                   "qcc"                  
## [351] "quadprog"              "quantmod"             
## [353] "quantreg"              "qvcalc"               
## [355] "R.methodsS3"           "R.oo"                 
## [357] "R.utils"               "R6"                   
## [359] "rainbow"               "RandomFields"         
## [361] "RandomFieldsUtils"     "randomForest"         
## [363] "randomNames"           "raster"               
## [365] "Rcgmin"                "RColorBrewer"         
## [367] "Rcpp"                  "RcppArmadillo"        
## [369] "RcppCCTZ"              "RcppDE"               
## [371] "RcppEigen"             "RcppParallel"         
## [373] "RcppRoll"              "RcppZiggurat"         
## [375] "RCurl"                 "Rdonlp2"              
## [377] "readr"                 "readxl"               
## [379] "recipes"               "registry"             
## [381] "ReIns"                 "relimp"               
## [383] "rematch"               "rentrez"              
## [385] "reprex"                "reshape2"             
## [387] "reticulate"            "rex"                  
## [389] "Rfast"                 "rgdal"                
## [391] "rgeos"                 "rgl"                  
## [393] "RgoogleMaps"           "ritis"                
## [395] "RItools"               "rjson"                
## [397] "RJSONIO"               "rlang"                
## [399] "rmarkdown"             "Rmdanimation"         
## [401] "Rmixmod"               "RMongo"               
## [403] "RMySQL"                "rncl"                 
## [405] "rneos"                 "RNeXML"               
## [407] "rngtools"              "robCompositions"      
## [409] "robust"                "robustbase"           
## [411] "ROCR"                  "RODBC"                
## [413] "ROI"                   "ROI.plugin.alabama"   
## [415] "ROI.plugin.ecos"       "ROI.plugin.glpk"      
## [417] "ROI.plugin.ipop"       "ROI.plugin.lpsolve"   
## [419] "ROI.plugin.nloptr"     "ROI.plugin.optimx"    
## [421] "ROI.plugin.quadprog"   "ROI.plugin.scs"       
## [423] "ROI.plugin.symphony"   "rosm"                 
## [425] "rotl"                  "roxygen2"             
## [427] "rpart"                 "rpart.plot"           
## [429] "RPostgreSQL"           "rprojroot"            
## [431] "rrcov"                 "rredlist"             
## [433] "rsconnect"             "Rsolnp"               
## [435] "RSQLite"               "rstudioapi"           
## [437] "rugarch"               "RUnit"                
## [439] "rvest"                 "Rvmmin"               
## [441] "rworldmap"             "sampleSelection"      
## [443] "sand"                  "sandwich"             
## [445] "sas7bdat"              "scales"               
## [447] "scatterplot3d"         "scrapeR"              
## [449] "scs"                   "segmented"            
## [451] "selectr"               "seqinr"               
## [453] "seriation"             "setRNG"               
## [455] "sf"                    "sfsmisc"              
## [457] "sgeostat"              "shapefiles"           
## [459] "shiny"                 "SkewHyperbolic"       
## [461] "slam"                  "slidify"              
## [463] "slidifyLibraries"      "sn"                   
## [465] "sna"                   "SnowballC"            
## [467] "SOAR"                  "solrium"              
## [469] "sourcetools"           "sp"                   
## [471] "spacetime"             "spam"                 
## [473] "SparseM"               "spatstat"             
## [475] "spatstat.data"         "spatstat.utils"       
## [477] "spd"                   "spdep"                
## [479] "splancs"               "sqldf"                
## [481] "sROC"                  "stabledist"           
## [483] "startupmsg"            "statmod"              
## [485] "statnet.common"        "stringr"              
## [487] "strucchange"           "survey"               
## [489] "survival"              "svd"                  
## [491] "svUnit"                "SweaveListingUtils"   
## [493] "systemfit"             "tau"                  
## [495] "taxize"                "tensor"               
## [497] "tensorflow"            "testthat"             
## [499] "textcat"               "tfruns"               
## [501] "TH.data"               "tibble"               
## [503] "tidyr"                 "tidyselect"           
## [505] "tidyverse"             "timeDate"             
## [507] "timeSeries"            "tis"                  
## [509] "tm"                    "transport"            
## [511] "triebeard"             "trimcluster"          
## [513] "tripack"               "truncnorm"            
## [515] "truncreg"              "tseries"              
## [517] "TSP"                   "TTR"                  
## [519] "tweedie"               "ucminf"               
## [521] "udunits2"              "units"                
## [523] "urca"                  "urltools"             
## [525] "uuid"                  "V8"                   
## [527] "vars"                  "vcd"                  
## [529] "vcdExtra"              "vegan"                
## [531] "verification"          "VGAM"                 
## [533] "VIM"                   "viridis"              
## [535] "viridisLite"           "visNetwork"           
## [537] "waveslim"              "whisker"              
## [539] "WikidataR"             "WikipediR"            
## [541] "wikitaxa"              "winference"           
## [543] "withr"                 "wordcloud"            
## [545] "worrms"                "wskm"                 
## [547] "xlsx"                  "xlsxjars"             
## [549] "XML"                   "xml2"                 
## [551] "xtable"                "xts"                  
## [553] "yaml"                  "YieldCurve"           
## [555] "zipcode"               "zoo"                  
## [557] "littler"               "magrittr"             
## [559] "pkgKitten"             "RcppGSL"              
## [561] "reshape"               "Rglpk"                
## [563] "rJava"                 "Rsymphony"            
## [565] "stringi"               "base"                 
## [567] "boot"                  "class"                
## [569] "cluster"               "codetools"            
## [571] "compiler"              "datasets"             
## [573] "foreign"               "graphics"             
## [575] "grDevices"             "grid"                 
## [577] "KernSmooth"            "lattice"              
## [579] "methods"               "mgcv"                 
## [581] "nlme"                  "nnet"                 
## [583] "parallel"              "spatial"              
## [585] "splines"               "stats"                
## [587] "stats4"                "tcltk"                
## [589] "tools"                 "utils"

rm(list=ls())
options(width = 80)
library(knitr); library(htmlwidgets); library(dplyr); library(leaflet)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Utilities for Start-Up Messages (version 0.9.4)

## For more information see ?"startupmsg", NEWS("startupmsg")

## 
## Attaching package: 'sfsmisc'

## The following object is masked from 'package:dplyr':
## 
##     last

## Utilities for Sweave Together with TeX 'listings' Package (version 0.7.7)

## NOTE: Support for this package will stop soon.
## Package 'knitr' is providing the same functionality in a better way.
## Some functions from package 'base' are intentionally masked ---see SweaveListingMASK().
## Note that global options are controlled by SweaveListingoptions() ---c.f. ?"SweaveListingoptions".

## For more information see ?"SweaveListingUtils", NEWS("SweaveListingUtils")
## There is a vignette to this package; try vignette("ExampleSweaveListingUtils").

## 
## Attaching package: 'SweaveListingUtils'

## The following objects are masked from 'package:base':
## 
##     library, require

## Object Oriented Implementation of Distributions (version 2.6.2)

## Attention: Arithmetics on distribution objects are understood as operations on corresponding random variables (r.v.s); see distrARITH().
## Some functions from package 'stats' are intentionally masked ---see distrMASK().
## Note that global options are controlled by distroptions() ---c.f. ?"distroptions".

## For more information see ?"distr", NEWS("distr"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several extension packages; try vignette("distr").

library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Extensions of Package 'distr' (version 2.6.1)

## Note: Packages "e1071", "moments", "fBasics" should be attached /before/ package "distrEx". See distrExMASK().Note: Extreme value distribution functionality has been moved to
##       package "RobExtremes". See distrExMOVED().

## For more information see ?"distrEx", NEWS("distrEx"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several related packages; try vignette("distr").

library(pryr,verbose=FALSE,quietly=TRUE)

## Error in library(pryr, verbose = FALSE, quietly = TRUE): there is no package called 'pryr'

library(wordcloud,verbose=FALSE,quietly=TRUE)
library(tm,verbose=FALSE,quietly=TRUE)

Using with RStudio

Download RStudio from http://www.rstudio.com

?rq

## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'

library(quantreg,verbose=FALSE,quietly=TRUE)

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

require(quantreg)
?rq
rq

## function (formula, tau = 0.5, data, subset, weights, na.action, 
##     method = "br", model = TRUE, contrasts = NULL, ...) 
## {
##     call <- match.call()
##     mf <- match.call(expand.dots = FALSE)
##     m <- match(c("formula", "data", "subset", "weights", "na.action"), 
##         names(mf), 0)
##     mf <- mf[c(1, m)]
##     mf$drop.unused.levels <- TRUE
##     mf[[1]] <- as.name("model.frame")
##     mf <- eval.parent(mf)
##     if (method == "model.frame") 
##         return(mf)
##     mt <- attr(mf, "terms")
##     weights <- as.vector(model.weights(mf))
##     Y <- model.response(mf)
##     if (method == "sfn") {
##         if (requireNamespace("MatrixModels") && requireNamespace("Matrix")) {
##             X <- MatrixModels::model.Matrix(mt, data, sparse = TRUE)
##             vnames <- dimnames(X)[[2]]
##             X <- as(X, "matrix.csr")
##             mf$x <- X
##         }
##     }
##     else X <- model.matrix(mt, mf, contrasts)
##     eps <- .Machine$double.eps^(2/3)
##     Rho <- function(u, tau) u * (tau - (u < 0))
##     if (length(tau) > 1) {
##         if (any(tau < 0) || any(tau > 1)) 
##             stop("invalid tau:  taus should be >= 0 and <= 1")
##         if (any(tau == 0)) 
##             tau[tau == 0] <- eps
##         if (any(tau == 1)) 
##             tau[tau == 1] <- 1 - eps
##         coef <- matrix(0, ncol(X), length(tau))
##         rho <- rep(0, length(tau))
##         fitted <- resid <- matrix(0, nrow(X), length(tau))
##         for (i in 1:length(tau)) {
##             z <- {
##                 if (length(weights)) 
##                   rq.wfit(X, Y, tau = tau[i], weights, method, 
##                     ...)
##                 else rq.fit(X, Y, tau = tau[i], method, ...)
##             }
##             coef[, i] <- z$coefficients
##             resid[, i] <- z$residuals
##             rho[i] <- sum(Rho(z$residuals, tau[i]))
##             fitted[, i] <- Y - z$residuals
##         }
##         taulabs <- paste("tau=", format(round(tau, 3)))
##         dimnames(coef) <- list(dimnames(X)[[2]], taulabs)
##         dimnames(resid) <- list(dimnames(X)[[1]], taulabs)
##         fit <- z
##         fit$coefficients <- coef
##         fit$residuals <- resid
##         fit$fitted.values <- fitted
##         if (method == "lasso") 
##             class(fit) <- c("lassorqs", "rqs")
##         else if (method == "scad") 
##             class(fit) <- c("scadrqs", "rqs")
##         else class(fit) <- "rqs"
##     }
##     else {
##         process <- (tau < 0 || tau > 1)
##         if (tau == 0) 
##             tau <- eps
##         if (tau == 1) 
##             tau <- 1 - eps
##         fit <- {
##             if (length(weights)) 
##                 rq.wfit(X, Y, tau = tau, weights, method, ...)
##             else rq.fit(X, Y, tau = tau, method, ...)
##         }
##         if (process) 
##             rho <- list(x = fit$sol[1, ], y = fit$sol[3, ])
##         else {
##             if (length(dim(fit$residuals))) 
##                 dimnames(fit$residuals) <- list(dimnames(X)[[1]], 
##                   NULL)
##             rho <- sum(Rho(fit$residuals, tau))
##         }
##         if (method == "lasso") 
##             class(fit) <- c("lassorq", "rq")
##         else if (method == "scad") 
##             class(fit) <- c("scadrq", "rq")
##         else class(fit) <- ifelse(process, "rq.process", "rq")
##     }
##     fit$na.action <- attr(mf, "na.action")
##     fit$formula <- formula
##     fit$terms <- mt
##     fit$xlevels <- .getXlevels(mt, mf)
##     fit$call <- call
##     fit$tau <- tau
##     fit$weights <- weights
##     fit$residuals <- drop(fit$residuals)
##     fit$rho <- rho
##     fit$method <- method
##     fit$fitted.values <- drop(fit$fitted.values)
##     attr(fit, "na.message") <- attr(m, "na.message")
##     if (model) 
##         fit$model <- mf
##     fit
## }
## <environment: namespace:quantreg>

detach(package:quantreg, unload=TRUE)
?rq

## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'

apropos("norm")

##  [1] ".__T__norm:base" "dlnorm"          "dnorm"           "norm"           
##  [5] "norm"            "normal_print"    "normalizePath"   "plnorm"         
##  [9] "pnorm"           "qlnorm"          "qnorm"           "qqnorm"         
## [13] "rlnorm"          "rnorm"

?pnorm

S3 vs S4

person3 <- function(name,age,weight,height){
  characteristics<-list(name=name,age=age,weight=weight,height=height)
  class(characteristics)<-"person3"
  return(characteristics)}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)
JohnDoe3$age

## [1] 28

BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)}
BMI3(JohnDoe3)

## [1] 22.94409

JohnDoe3

## $name
## [1] "John"
## 
## $age
## [1] 28
## 
## $weight
## [1] 76
## 
## $height
## [1] 182
## 
## attr(,"class")
## [1] "person3"

reg3 <- lm(dist~speed,data=cars)
reg3$coefficients

## (Intercept)       speed 
##  -17.579095    3.932409

coef(reg3)

## (Intercept)       speed 
##  -17.579095    3.932409

summary(reg3)

## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

setClass("person4", representation(name="character",
age="numeric", weight="numeric", height="numeric"),where=.GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)
JohnDoe4

## An object of class "person4"
## Slot "name":
## [1] "John"
## 
## Slot "age":
## [1] 28
## 
## Slot "weight":
## [1] 76
## 
## Slot "height":
## [1] 182

JohnDoe4@age

## [1] 28

setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")),
           where=.GlobalEnv)

## [1] "BMI4"

setMethod("BMI4", "person4",where=.GlobalEnv,
          function(object){return(object@weight*1e4/object@height^2)})

## [1] "BMI4"

BMI4(JohnDoe4)

## [1] 22.94409

library(VGAM,verbose=FALSE,quietly=TRUE)
reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients

## (Intercept)       speed 
##  -17.579095    3.932409

coef(reg4)

## (Intercept)       speed 
##  -17.579095    3.932409

pnorm(3.65102,mean=5,sd=2)

## [1] 0.2499999

library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Error in library(distr, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'distr'

X <- Norm(mean=5,sd=2)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "Norm"

mean(X)

## Error in mean(X): objet 'X' introuvable

p(X)(3.65102)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "p"

qnorm(0.25,mean=5,sd=2)

## [1] 3.65102

library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Error in library(distrEx, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'distrEx'

distr::q(X)

## Error in loadNamespace(name): there is no package called 'distr'

distr::q(X)(0.25)

## Error in loadNamespace(name): there is no package called 'distr'

X1 <- Norm(mean=5,sd=2)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "Norm"

X2 <- Norm(mean=2,sd=1)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "Norm"

S <- X1+X2

## Error in eval(expr, envir, enclos): objet 'X1' introuvable

distr::q(S)(.5)

## Error in loadNamespace(name): there is no package called 'distr'

plot(S)

## Error in plot(S): objet 'S' introuvable

R objects : Numbers

set.seed(1)
U <- runif(20)
U[1:4]

## [1] 0.2655087 0.3721239 0.5728534 0.9082078

U[1] == 0.2655087

## [1] FALSE

options(digits = 3)
U[1:4]

## [1] 0.266 0.372 0.573 0.908

options(digits = 22)
U[1:4]

## [1] 0.2655086631420999765396 0.3721238996367901563644 0.5728533633518964052200
## [4] 0.9082077899947762489319

options(digits = 3)

x <- exp(1)
y <- x
x <- 2
y

## [1] 2.72

class(x)

## [1] "numeric"

0/0

## [1] NaN

1/0

## [1] Inf

.Machine$double.xmax

## [1] 1.8e+308

2e+307<Inf

## [1] TRUE

2e+308<Inf

## [1] FALSE

(3/10-1/10)

## [1] 0.2

(3/10-1/10)==(7/10-5/10)

## [1] FALSE

(3/10-1/10)-(7/10-5/10)

## [1] 2.78e-17

all.equal((3/10-1/10),(7/10-5/10))

## [1] TRUE

sqrt(2)^2 == 2

## [1] FALSE

.Machine$double.eps

## [1] 2.22e-16

.Machine$double.xmin

## [1] 2.23e-308

x <- rnorm(8)
names(x) <- letters[1:8]
x

##       a       b       c       d       e       f       g       h 
##  1.5118  0.3898 -0.6212 -2.2147  1.1249 -0.0449 -0.0162  0.9438

x[2:4]

##      b      c      d 
##  0.390 -0.621 -2.215

x[c("b","c","d")]

##      b      c      d 
##  0.390 -0.621 -2.215

min(x[x>0])

## [1] 0.39

x[-(3:8)]

##    a    b 
## 1.51 0.39

# x[-3:8]
x[c(TRUE,FALSE)]

##       a       c       e       g 
##  1.5118 -0.6212  1.1249 -0.0162

x[c(TRUE,NA,FALSE,TRUE)]

##      a   <NA>      d      e   <NA>      h 
##  1.512     NA -2.215  1.125     NA  0.944

x[2]

##    b 
## 0.39

x[[2]]

## [1] 0.39

for(i in 1:2){
  nom_var <- paste0("x", i)
  assign(nom_var, rpois(5,7))
}
x1

## [1] 9 4 8 6 9

x2

## [1] 8 9 7 7 9

x <- runif(4)
x

## [1] 0.0233 0.4772 0.7323 0.6927

## [1] 0.0233 0.4772 0.7323 0.6927

library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Error in library(pryr, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'pryr'

x %<a-% runif(1)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "%<a-%"

## [1] 0.0233 0.4772 0.7323 0.6927

## [1] 0.0233 0.4772 0.7323 0.6927

x <- 1:2
x

## [1] 1 2

rm("x")
x

## Error in eval(expr, envir, enclos): objet 'x' introuvable

R objects : Integers

(x_num=c(1,6,10))

## [1]  1  6 10

(x_int=c(1L,6L,10L))

## [1]  1  6 10

object.size(x_num)

## 72 bytes

object.size(x_int)

## 56 bytes

typeof(x_num)

## [1] "double"

typeof(x_int)

## [1] "integer"

is.integer(x_num)

## [1] FALSE

is.integer(x_int)

## [1] TRUE

str(x_num)

##  num [1:3] 1 6 10

str(x_int)

##  int [1:3] 1 6 10

c(1,c(2,c(3,c(4,5))))

## [1] 1 2 3 4 5

R objects : Matrices

M<-1:24
dim(M)=c(6,4)
M

##      [,1] [,2] [,3] [,4]
## [1,]    1    7   13   19
## [2,]    2    8   14   20
## [3,]    3    9   15   21
## [4,]    4   10   16   22
## [5,]    5   11   17   23
## [6,]    6   12   18   24

str(M)

##  int [1:6, 1:4] 1 2 3 4 5 6 7 8 9 10 ...

colnames(M)=letters[1:4]
rownames(M)=LETTERS[10:15]
M

##   a  b  c  d
## J 1  7 13 19
## K 2  8 14 20
## L 3  9 15 21
## M 4 10 16 22
## N 5 11 17 23
## O 6 12 18 24

M["K",]

##  a  b  c  d 
##  2  8 14 20

M[c("K","N"),]

##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23

M[c(2,5),]

##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23

M[c("K",5),]

## Error in M[c("K", 5), ]: indice hors limites

On Windows machines

rm(list=ls())
memory.size()

## Warning: 'memory.size()' is Windows-specific

## [1] Inf

memory.limit()

## Warning: 'memory.limit()' is Windows-specific

## [1] Inf

memory.limit(size=8000)

## Warning: 'memory.limit()' is Windows-specific

## [1] Inf

memory.limit(size=4000)

## Warning: 'memory.limit()' is Windows-specific

## [1] Inf

x3 <- 1:1e3
x4 <- 1:1e4
x5 <- 1:1e5
x6 <- 1:1e6
object.size(x3)

## 4040 bytes

object.size(x4)

## 40040 bytes

object.size(x6)

## 4000040 bytes

z1 <- matrix(0,1e+06,3) 
z2 <- matrix(as.integer(0),1e+06,3) 
object.size(z1)

## 24000200 bytes

print(object.size(z1),units="Mb")

## 22.9 Mb

object.size(z2)

## 12000200 bytes

print(object.size(z2),units="Mb")

## 11.4 Mb

mydat_r <- matrix(rnorm(3*5e6,0,2),5e6,3)
colnames(mydat_r) <- c("first","second","third")
mydat_rdf <- as.data.frame(mydat_r)
print(object.size(mydat_rdf ),units="Mb")

## 114.4 Mb

library(biglm)

## Loading required package: DBI

z1 <- matrix(as.integer(5), 300, 400)
object.size(z1)

## 480200 bytes

library(bigmemory)

## Error in library(bigmemory): there is no package called 'bigmemory'

z2 <- big.matrix(300, 400, type="integer", init=5)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "big.matrix"

object.size(z2)

## 12000200 bytes

rm(list=ls())

R objects : Factors

x <- c("A", "A", "B", "B", "C")
x

## [1] "A" "A" "B" "B" "C"

factor(x)

## [1] A A B B C
## Levels: A B C

relevel(factor(x),"B")

## [1] A A B B C
## Levels: B A C

model.matrix(~0+x)

##   xA xB xC
## 1  1  0  0
## 2  1  0  0
## 3  0  1  0
## 4  0  1  0
## 5  0  0  1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$x
## [1] "contr.treatment"

n <- 10; nn <- 100
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
tapply(x,group,mean)

##    0    1    2    3    4    5    6    7    8    9   10 
## 1.10 1.40 1.75 2.06 2.25 2.60 2.59 2.76 3.04 3.06 3.47

aggregate(x,by=list(group),mean)

##    Group.1    x
## 1        0 1.10
## 2        1 1.40
## 3        2 1.75
## 4        3 2.06
## 5        4 2.25
## 6        5 2.60
## 7        6 2.59
## 8        7 2.76
## 9        8 3.04
## 10       9 3.06
## 11      10 3.47

xg <- split(x, group)
sapply(xg, length)

##   0   1   2   3   4   5   6   7   8   9  10 
##  55  99  92 102 101 102 101 102  93 108  45

sapply(xg, mean)

##    0    1    2    3    4    5    6    7    8    9   10 
## 1.10 1.40 1.75 2.06 2.25 2.60 2.59 2.76 3.04 3.06 3.47

library(pbapply,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Error in library(pbapply, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'pbapply'

n <- 10; nn <- 1e6
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
xg <- split(x, group)
pbsapply(xg, mean)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "pbsapply"

x <- factor(c("b","a","b"))
levels(x)

## [1] "a" "b"

x[3]="c"

## Warning in `[<-.factor`(`*tmp*`, 3, value = "c"): invalid factor level, NA
## generated

## [1] b    a    <NA>
## Levels: a b

factor(x,levels=c("b","a"))

## [1] b    a    <NA>
## Levels: b a

x[1]

## [1] b
## Levels: a b

x[1,drop=TRUE]

## [1] b
## Levels: b

U <- runif(20)
cut(U,breaks=2)

##  [1] (0.494,0.971]  (0.494,0.971]  (0.494,0.971]  (0.0169,0.494] (0.494,0.971] 
##  [6] (0.494,0.971]  (0.0169,0.494] (0.494,0.971]  (0.0169,0.494] (0.494,0.971] 
## [11] (0.0169,0.494] (0.0169,0.494] (0.494,0.971]  (0.0169,0.494] (0.0169,0.494]
## [16] (0.494,0.971]  (0.494,0.971]  (0.494,0.971]  (0.0169,0.494] (0.0169,0.494]
## Levels: (0.0169,0.494] (0.494,0.971]

cut(U,breaks=2,labels=c("small","large"))

##  [1] large large large small large large small large small large small small
## [13] large small small large large large small small
## Levels: small large

cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large"))

##  [1] large  large  medium small  large  large  small  large  small  large 
## [11] small  medium medium small  small  medium medium medium medium medium
## Levels: small medium large

table(cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large")))

## 
##  small medium  large 
##      6      8      6

Short Quizz

What would 1:3*1:5 return ?

1 2 3
1 2 3 2 4 6 3 6 9 4 8 12 5 10 15
1 4 9 4 10
1 2 3 4 5 2 4 6 8 10 3 6 9 12 15

a*b returns the componentwise product of a and b

1:3*1:5 is read (1:3)*(1:5) which returns the componentwise product of 1:3 - or c(1,2,3) - and 1:5 - or c(1,2,3,4,5).

Short Quizz

What would intersect(seq(4,28,by=7),seq (31,3,by=-2)) return ?

11 25
25 11
3 5 7 9 11 13 15 17 19 21 23 25 27 29 31
FALSE

See seq(4,28,by=7) %in% seq(3,31,by=2)

seq(4,28,by=7)[seq(4,28,by=7) %in% seq(3,31,by=2)].

Short Quizz

Consider function dg <- function(M) diag(M[,ncol(M):1]).

Define M <- matrix(1:25,5,5). What would dg(M) return ?

5 9 13 17 21
21 17 13 9 5
1 2 3 4 5
1 7 13 19 25

See dg return the second diagonal of the matrix (watch the order).

dgreturn the second diagonal of the matrix, starting from the upper right corner (21) and ending with the lower left corner (5).

Short Quizz

Define M <- matrix(1:5 ,3 ,3)

What would M[,2] return ?

4 5
4 5 NA
4 5 NULL
4 5 1

Observe that M <- matrix(c(1:5,1:4),3,3).

M <- matrix(c(1:5,1:4),3,3).

Short Quizz

Compute $\sum_{i=10}^{20}(i^2+4/i)$

2588.075
2581.925
2855.609
373.2841

Use sum(i^2-4/i).

Set i=10:20 and compute sum(i^2+4/i).

Short Quizz

Solve \[ \left\{ 3x+2y-z=1 \\ 2x-2y+4z=-2 \\ -x+y/2-z=0 \right. \]

-1 2 0
1 -2 0
0 1 2
there is no unique solution

Use solve(A,b).

Set A <- matrix(c(3,2,-1,2,-2,.5,-1,4,-1),3,3) and b <- c(1,-2,0) and then compute solve(A,b).

Short Quizz

Compute $\displaystyle{\sum_{i=1}^{10}\sum_{j=i}^{10} \frac{i^2}{5+i\cdot j}}$

32.21865
27.71865
2.5
6.5

Use a double loop for(i in 1:10) for(j in i:10).

Set s <- 0 and then compute for(i in 1:10) for(j in i:10) s <- s+i^2/(5*i*j).

Short Quizz

What sqrt(7)^2 == 7

TRUE
FALSE
``
NULL

Compute sqrt(7)^2 - 7.

Use all.equal(sqrt(7)^2, 7) to avoid numerical issue like that one.

Short Quizz

Find roots of polynomial $x^2+x=1$

``
``
``
NULL

Compute sqrt(7)^2 - 7.

Use Re(polyroot(c(1,1,-1))) or uniroot(f,c(-10,0))$root and uniroot(f,c(0,10))$root

R objects : Strings and Characters

"Be carefull of 'quotes'"
'Be carefull of "quotes"'

cities <- c("New York, NY", "Los Angeles, CA", "Boston, MA")
substr(cities, nchar(cities)-1, nchar(cities))

## [1] "NY" "CA" "MA"

unlist(strsplit(cities, ", "))[seq(2,6,by=2)]

## [1] "NY" "CA" "MA"

library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tweet = "Emerging #climate change, environment and sustainability concerns for #actuaries Apr 9 #Toronto. Register TODAY http://bit.ly/CIAClimateForum"
hash="#[a-zA-Z]{1,}"
str_extract(tweet,hash)

## [1] "#climate"

str_extract_all(tweet,hash)

## [[1]]
## [1] "#climate"   "#actuaries" "#Toronto"

str_locate_all(tweet,hash)

## [[1]]
##      start end
## [1,]    10  17
## [2,]    71  80
## [3,]    88  95

urls="http://([\\da-z\\.-]+)\\.([a-z\\.]{2,6})/[a-zA-Z0-9]{1,}"
str_extract_all(tweet,urls)

## [[1]]
## [1] "http://bit.ly/CIAClimateForum"

email="^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$"
grep(pattern = email, x = "charpentier.arthur@uqam.ca")

## [1] 1

grepl(pattern = email, x = "charpentier.arthur@uqam.ca")

## [1] TRUE

str_detect(pattern = email, string=c("charpentier.arthur@uqam.ca","@freakonometrics"))

## [1]  TRUE FALSE

ex_sentence = "This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult"
library(tm,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ex_corpus <- Corpus(VectorSource(ex_sentence))
ex_corpus

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1

inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult

grep("hard", ex_sentence)

## integer(0)

grep("difficult", ex_sentence)

## [1] 1

library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
word(ex_sentence,4)

## [1] "simple"

word(ex_sentence,1:20)

##  [1] "This"      "is"        "1"         "simple"    "sentence," "just"     
##  [7] "to"        "play"      "with,"     "then"      "we'll"     "play"     
## [13] "with"      "4,"        "and"       "that"      "will"      "be"       
## [19] "more"      "difficult"

ex_words <- strsplit(ex_sentence, split=" ")[[1]]
ex_words

##  [1] "This"      "is"        "1"         "simple"    "sentence," "just"     
##  [7] "to"        "play"      "with,"     "then"      "we'll"     "play"     
## [13] "with"      "4,"        "and"       "that"      "will"      "be"       
## [19] "more"      "difficult"

grep(pattern="w",ex_words,value=TRUE)

## [1] "with," "we'll" "with"  "will"

str_count(ex_words,"w")

##  [1] 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0

grep(pattern="[ai]",ex_words,value=TRUE)

##  [1] "This"      "is"        "simple"    "play"      "with,"     "play"     
##  [7] "with"      "and"       "that"      "will"      "difficult"

grep(pattern="[[:punct:]]",ex_words,value=TRUE)

## [1] "sentence," "with,"     "we'll"     "4,"

require(wordcloud)

## Loading required package: wordcloud

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'wordcloud'

wordcloud(ex_corpus)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"

cols<-colorRampPalette(c("lightgrey","blue"))(40)
wordcloud(words = ex_corpus, max.words = 40, random.order=FALSE, scale = c(5, 0.5), colors=cols)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"

tdm <- TermDocumentMatrix(ex_corpus)
tdm

## <<TermDocumentMatrix (terms: 12, documents: 1)>>
## Non-/sparse entries: 12/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)

inspect(tdm)

## <<TermDocumentMatrix (terms: 12, documents: 1)>>
## Non-/sparse entries: 12/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##            Docs
## Terms       1
##   and       1
##   difficult 1
##   just      1
##   more      1
##   play      2
##   sentence  1
##   simple    1
##   that      1
##   then      1
##   with      2

inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult

fix_contractions <- function(doc) {
  doc <- gsub("won't", "will not", doc)
  doc <- gsub("n't", " not", doc)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'s", "", doc) 
  return(doc)
}

ex_corpus <- tm_map(ex_corpus, fix_contractions)
inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] This is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult

ex_corpus <- tm_map(ex_corpus, tolower)
inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult

ex_corpus <- tm_map(ex_corpus, removeNumbers)
inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is  simple sentence, just to play with, then we will play with , and that will be more difficult

ex_corpus <- tm_map(ex_corpus,gsub, pattern= "[[:punct:]]", replacement = " ")
inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult

stopwords("en")[sample(1:length(stopwords("en")),size=10)]

##  [1] "very"   "when"   "who"    "what's" "they'd" "i'm"    "am"     "she's" 
##  [9] "then"   "here"

inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult

library(SnowballC,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ex_corpus <- tm_map(ex_corpus, stemDocument)
inspect(ex_corpus)

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is simpl sentenc just to play with then we will play with and that will be more difficult

wordcloud(ex_corpus[[1]])

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"

wordcloud(words = ex_corpus[[1]], max.words = 40, random.order=FALSE, scale = c(5, 0.5),colors=cols)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"

Short Quizz

What would grep("ab",c("abc","b","a","ba","cab")) return?

TRUE
1 5
abc cab
TRUE FALSE FALSE FALSE TRUE

Use ?grep.

only two elements contain ab in that order.

R objects : Strings and Characters

some.dates <- as.Date(c("16/10/12","19/11/12"),format="%d/%m/%y")
diff(some.dates)

## Time difference of 34 days

sequence.date <- seq(from=some.dates[1],to=some.dates[2],by=7)
format(sequence.date,"%b")

## [1] "oct." "oct." "oct." "nov." "nov."

months(sequence.date)

## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"

weekdays(some.dates)

## [1] "mardi" "lundi"

substr(as.POSIXct(some.dates), 1, 4)

## [1] "2012" "2012"

strftime(some.dates,"%Y")

## [1] "2012" "2012"

Sys.setlocale("LC_TIME", "fr_FR")

## Warning in Sys.setlocale("LC_TIME", "fr_FR"): la requête OS pour spécifier la
## localisation à "fr_FR" n'a pas pu être honorée

## [1] ""

months(sequence.date)

## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"

R objects : Lists

x <- list(1:5,c(1,2,3,4,5),a="test",
b=c(TRUE,FALSE),rpois(5,8))
x

## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[5]]
## [1]  6  7 13  6  4

str(x)

## List of 5
##  $  : int [1:5] 1 2 3 4 5
##  $  : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 6 7 13 6 4

x <- list(
  list(1:5,c(1,2,3,4,5)),a="test",
  b=c(TRUE,FALSE),rpois(5,8))
x

## [[1]]
## [[1]][[1]]
## [1] 1 2 3 4 5
## 
## [[1]][[2]]
## [1] 1 2 3 4 5
## 
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[4]]
## [1]  5 10 10  7  8

names(x)

## [1] ""  "a" "b" ""

str(x)

## List of 4
##  $  :List of 2
##   ..$ : int [1:5] 1 2 3 4 5
##   ..$ : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 5 10 10 7 8

is.list(x)

## [1] TRUE

is.recursive(x)

## [1] TRUE

x[[3]]

## [1]  TRUE FALSE

x[["b"]]

## [1]  TRUE FALSE

x[[1]]

## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5

x[[1]][[1]]

## [1] 1 2 3 4 5

f <- function(x) { return(x*(1-x)) }
optim.f <- optimize(f, interval=c(0, 1), maximum=TRUE)
str(optim.f)

## List of 2
##  $ maximum  : num 0.5
##  $ objective: num 0.25

list(abc=1)$a

## [1] 1

list(abc=1,a=2)$a

## [1] 2

list(bac=1)$a

## NULL

list(abc=1,b=2)$a

## [1] 1

R objects : Functions

factorial

## function (x) 
## gamma(x + 1)
## <bytecode: 0x5583a3a67628>
## <environment: namespace:base>

gamma

## function (x)  .Primitive("gamma")

?gamma

If Condition

set.seed(1)
u <- runif(1)
if(u>.5) {("greater than 50%")} else {("smaller than 50%")}

## [1] "smaller than 50%"

ifelse(u>.5,("greater than 50%"),("smaller than 50%"))

## [1] "smaller than 50%"

## [1] 0.266

u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}

## Warning in if (u > 0.5) {: la condition a une longueur > 1 et seul le premier
## élément est utilisé

## [1] "smaller than 50%"

ifelse(u>.5,("greater than 50%"),("smaller than 50%"))

## [1] "smaller than 50%" "greater than 50%" "greater than 50%"

## [1] 0.372 0.573 0.908

Loops

v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))

sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))

##    user  system elapsed 
##    9.91    0.08    9.99

sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])

sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))

##    user  system elapsed 
##   0.104   0.000   0.105

system.time(Vectorize(sqrt)(v_x))

##    user  system elapsed 
##       0       0       0

sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))

##    user  system elapsed 
##   0.028   0.000   0.028

library(parallel,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
(allcores <- detectCores(all.tests=TRUE))

## [1] 4

sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))

##    user  system elapsed 
##   0.060   0.040   0.047

f<-function(x) dlnorm(x)
integrate(f,0,Inf)

## 1 with absolute error < 2.5e-07

integrate(f,0,1e5)

## 1.82e-05 with absolute error < 3.6e-05

integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value

## [1] 1

Apply

rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)
set.seed(1)
rN.Poisson(5)

## [1] 4 4 5 8 3

rX.Exponential(3)

## [1] 0.229 0.129 1.447

rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for(i in 1:n){
    N <- rN(1)
    if(N>0){V[i] <- sum(rX(N))}
  }
  return(V)}
rcpd1(3)

## [1] 1.52 3.88 0.79

rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for (i in 1:n) V[i] <- sum(rX(rN(1)))
  return(V)}
rcpd1(3)

## [1] 0.893 3.368 3.023

rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  return(as.numeric(xtabs(X ~ I)))}
rcpd2(3)

## [1] 2.29 2.06 5.46

rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  V <- tapply(X,I,sum)
  V[is.na(V)] <- 0
  return(as.numeric(V))}
rcpd3(3)

## [1] 2.22 6.03 4.33

rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(rN(n), function(x) sum(rX(x))))}
rcpd4(3)

## [1] 2.83 4.09 2.99

rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(Vectorize(rX)(rN(n)),sum))}
rcpd5(3)

## [1] 1.81 3.69 3.60

rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(unlist(lapply(lapply(t(rN(n)),rX),sum)))}
rcpd6(3)

## [1] 0.291 1.344 2.948

n <- 100
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
options(digits=1)
microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n))

## Unit: microseconds
##      expr min  lq mean median   uq  max neval cld
##  rcpd1(n) 548 580  734    621  698 5570   100  b 
##  rcpd2(n) 766 835  950    878 1010 1992   100   c
##  rcpd3(n) 337 373  412    386  414  797   100 a  
##  rcpd4(n) 350 371  430    392  428  980   100 a  
##  rcpd5(n) 399 439  492    460  502  942   100 a  
##  rcpd6(n) 320 337  394    361  401  760   100 a

options(digits=5)

Vectorization

f <- function(x,m=0,s=1){
  H<-function(t) 1-pnorm(t,m,s)
  integral<-integrate(H,lower=x,upper=Inf)$value
  res<-H(x)/integral
  return(res)
}
f(0)

## [1] 1.2533

f(0:1)

## Warning in if (is.finite(lower)) {: la condition a une longueur > 1 et seul le
## premier élément est utilisé

## [1] 1.25331 0.39769

y <- rep(NA,2)
x <- 0:1
for(i in 1:2) y[i] <- f(x[i])
y

## [1] 1.2533 1.9043

sapply(x,"f")

## [1] 1.2533 1.9043

Vectorize(f)(x)

## [1] 1.2533 1.9043

binorm <- function(x1,x2,r=0){
  exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2))
}
u <- seq(-2,2)
binorm(u,u)

## [1] 0.002915 0.058550 0.159155 0.058550 0.002915

outer(u,u,binorm)

##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.002915 0.013064 0.021539 0.013064 0.002915
## [2,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [3,] 0.021539 0.096532 0.159155 0.096532 0.021539
## [4,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [5,] 0.002915 0.013064 0.021539 0.013064 0.002915

(uv<-expand.grid(u,u))

##    Var1 Var2
## 1    -2   -2
## 2    -1   -2
## 3     0   -2
## 4     1   -2
## 5     2   -2
## 6    -2   -1
## 7    -1   -1
## 8     0   -1
## 9     1   -1
## 10    2   -1
## 11   -2    0
## 12   -1    0
## 13    0    0
## 14    1    0
## 15    2    0
## 16   -2    1
## 17   -1    1
## 18    0    1
## 19    1    1
## 20    2    1
## 21   -2    2
## 22   -1    2
## 23    0    2
## 24    1    2
## 25    2    2

matrix(binorm(uv$Var1,uv$Var2),5,5)

##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.002915 0.013064 0.021539 0.013064 0.002915
## [2,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [3,] 0.021539 0.096532 0.159155 0.096532 0.021539
## [4,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [5,] 0.002915 0.013064 0.021539 0.013064 0.002915

u <- seq(-2,2,length=31)
persp(u,u,outer(u,u,binorm),theta=25)

plot of chunk unnamed-chunk-93

Computational Efficiency

library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
n <- 1000
A<-matrix(seq(1,n^2),n,n)
B<-matrix(seq(1,n^2),n,n)
C<-1:n
microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1)

## Unit: nanoseconds
##             expr       min        lq       mean    median         uq        max
##  (A %*% B) %*% C 741482231 828932318 9.0671e+08 881234170 9.5726e+08 1593058647
##  A %*% (B %*% C)   4658177   5269338 8.5710e+06   6380956 9.0802e+06  105423967
##     replications         5        21 2.7901e+02       246 3.1850e+02       2706
##  neval cld
##    100   b
##    100  a 
##    100  a

m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
group <- rep(c("A","B"), each = n / 2)

system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat)

##    user  system elapsed 
##   0.716   0.000   0.713

system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat)

##    user  system elapsed 
##   0.188   0.000   0.189

t.test_1 <- function(x, grp){
  t.test(x[ group == "A"], x[ group == "B"])$stat
}
system.time(t1 <- apply(X, 1, t.test_1, grp = grp))

##    user  system elapsed 
##   0.212   0.000   0.213

t.test_2 <- function(x, grp) {
  t_stat <- function(x) {
    m <- mean(x)
    n <- length(x)
    var <- sum((x - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(x[grp == "A"])
  g2 <- t_stat(x[grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time(apply(X, 1, t.test_2, grp = group))

##    user  system elapsed 
##   0.028   0.000   0.027

t.test_3 <- function(X, grp){
  t_stat <- function(X) {
    m <- rowMeans(X)
    n <- ncol(X)
    var <- rowSums((X - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(X[, grp == "A"])
  g2 <- t_stat(X[, grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time( t.test_3(X, group))

##    user  system elapsed 
##   0.000   0.000   0.001

"%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s
100 %pm% 10

## [1]  83.551 116.449

'second<-' <- function(x, value) {
  x[2] <- value
  return(x)
}
x <- 1:10
second(x) <- 5
x

##  [1]  1  5  3  4  5  6  7  8  9 10

library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Error in library(pryr, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'pryr'

x <- 1:10
address(x)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "address"

second(x) <- 6
address(x)

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "address"

ls(globalenv())

## [1] "BMI4"

find("x")

## character(0)

find("pi")

## [1] "package:base"

environment(sd)

## <environment: namespace:stats>

f <- function(x) x^2
f

## function(x) x^2
## <environment: 0x558395eef418>

formals(f)

## $x

body(f)

## x^2

e <- new.env()
e$d <- 1
e$f <- 1:5
e$g <- 1:5
ls(e)

## [1] "d" "f" "g"

str(e)

## <environment: 0x5583a5be7960>

identical(globalenv(),e)

## [1] FALSE

search()

##  [1] ".GlobalEnv"             "package:microbenchmark" "package:parallel"      
##  [4] "package:SnowballC"      "package:stringr"        "package:biglm"         
##  [7] "package:DBI"            "package:VGAM"           "package:splines"       
## [10] "package:stats4"         "package:SparseM"        "package:tm"            
## [13] "package:NLP"            "package:leaflet"        "package:dplyr"         
## [16] "package:htmlwidgets"    "package:knitr"          "package:stats"         
## [19] "package:graphics"       "package:grDevices"      "package:utils"         
## [22] "package:datasets"       "package:methods"        "Autoloads"             
## [25] "package:base"

f1 <- function(x1){
  f2 <- function(x2){
    f3 <- function(x3){
      x1+x2+x3
    }
    f3(3)
  } 
  f2(2)
}
f1(1)

## [1] 6

f <- function(x) log(x)
f("x")

## Error in log(x): argument non numérique pour une fonction mathématique

try(f("x"))
inherits(try(f("x")), "try-error")

## [1] TRUE

inherits(try(f("x"), silent=TRUE), "try-error")

## [1] TRUE

a=0
try(a<-f("x"), silent=TRUE)
a

## [1] 0

try(a<-f(x), silent=TRUE)
a

##  [1] 0.0000 1.7918 1.0986 1.3863 1.6094 1.7918 1.9459 2.0794 2.1972 2.3026

x=1:10
g=function(f) f(x)
g(mean)

## [1] 5.5

fibonacci <- function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
}
fibonacci(20)

## [1] 10946

system.time(fibonacci(30))

##    user  system elapsed 
##   2.056   0.004   2.094

library(memoise,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fibonacci <- memoise(function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
})
system.time(fibonacci(30))

##    user  system elapsed 
##   0.016   0.000   0.017

set.seed(1)
x<-rexp(6)
sum(x)

## [1] 5.5534

.Primitive("sum")(x)

## [1] 5.5534

library(Rcpp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
cppFunction('double sum_C(NumericVector x) {
            int n = x.size();
            double total = 0;
            for(int i = 0; i < n; ++i) {
            total += x[i];
            }
            return total;
            }')
sum_C(x)

## [1] 5.5534

library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x <- runif(1e6)
microbenchmark(
  sum(x),
  .Primitive("sum")(x),
  sum_C(x))

## Unit: microseconds
##                  expr     min      lq   mean median     uq    max neval cld
##                sum(x)  968.60  987.35 1034.0 1021.8 1061.2 1216.9   100  a 
##  .Primitive("sum")(x)  971.29  987.96 1029.3 1010.9 1045.5 1211.6   100  a 
##              sum_C(x) 1247.59 1265.43 1310.7 1300.8 1323.9 1543.4   100   b

names_list <- function(...){
  names(list(...))
}
names_list(a=5,b=7)

## [1] "a" "b"

power <- function(exponent) {
  function(x) {
    x ^ exponent
  }
}
square <- power(2)
square(4)

## [1] 16

cube <- power(3)
cube(4)

## [1] 64

cube

## function(x) {
##     x ^ exponent
##   }
## <environment: 0x5583a5a86520>

total <- 20
pb <- txtProgressBar(min = 0, max = total, style = 3)

## 
  |                                                                            
  |                                                                      |   0%

for(i in 1:(10*total)){
  Sys.sleep(0.01)
  if(i %%10 == 0) setTxtProgressBar(pb, i%/%10)
}

## 
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |======================================================================| 100%

close(pb)

R objects : Data Frames

df <- data.frame(x=1:3,y=letters[1:3])
str(df)

## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: Factor w/ 3 levels "a","b","c": 1 2 3

typeof(df)

## [1] "list"

class(df)

## [1] "data.frame"

cbind(df,z=9:7)

##   x y z
## 1 1 a 9
## 2 2 b 8
## 3 3 c 7

df$z<-5:3
df

##   x y z
## 1 1 a 5
## 2 2 b 4
## 3 3 c 3

df[1]

##   x
## 1 1
## 2 2
## 3 3

df[,1,drop=FALSE]

##   x
## 1 1
## 2 2
## 3 3

df[,1,drop=TRUE]

## [1] 1 2 3

df[[1]]

## [1] 1 2 3

df$x

## [1] 1 2 3

df[,"x"]

## [1] 1 2 3

df[["x"]]

## [1] 1 2 3

set.seed(1)
df[sample(nrow(df)),]

##   x y z
## 1 1 a 5
## 3 3 c 3
## 2 2 b 4

set.seed(1)
df[sample(nrow(df),nrow(df)*2,replace=TRUE),]

##     x y z
## 1   1 a 5
## 2   2 b 4
## 2.1 2 b 4
## 3   3 c 3
## 1.1 1 a 5
## 3.1 3 c 3

subset(df, x>1)

##   x y z
## 2 2 b 4
## 3 3 c 3

set.seed(123)
df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50,
replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE)))
tail(df)

##            Y X1 X2
## 45  1.207962  D  3
## 46 -1.123109  A  2
## 47 -0.402885  A  2
## 48 -0.466655  A  2
## 49  0.779965  A  1
## 50 -0.083369  C  2

reg <- lm(Y~X1+X2,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1B X1C X1D X22 X23
## 45           1   0   0   1   0   1
## 46           1   0   0   0   1   0
## 47           1   0   0   0   1   0
## 48           1   0   0   0   1   0
## 49           1   0   0   0   0   0
## 50           1   0   1   0   1   0

reg <- lm(Y~(X1=="A")+X2,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1 == "A"TRUE X22 X23
## 45           1             0   0   1
## 46           1             1   1   0
## 47           1             1   1   0
## 48           1             1   1   0
## 49           1             1   0   0
## 50           1             0   1   0

reg <- lm(Y~X1*X2,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1B X1C X1D X22 X23 X1B:X22 X1C:X22 X1D:X22 X1B:X23 X1C:X23
## 45           1   0   0   1   0   1       0       0       0       0       0
## 46           1   0   0   0   1   0       0       0       0       0       0
## 47           1   0   0   0   1   0       0       0       0       0       0
## 48           1   0   0   0   1   0       0       0       0       0       0
## 49           1   0   0   0   0   0       0       0       0       0       0
## 50           1   0   1   0   1   0       0       1       0       0       0
##    X1D:X23
## 45       1
## 46       0
## 47       0
## 48       0
## 49       0
## 50       0

reg <- lm(Y~X1+X2%in%X1,data=df)
model.matrix(reg)[45:50,]

##    (Intercept) X1B X1C X1D X1A:X22 X1B:X22 X1C:X22 X1D:X22 X1A:X23 X1B:X23
## 45           1   0   0   1       0       0       0       0       0       0
## 46           1   0   0   0       1       0       0       0       0       0
## 47           1   0   0   0       1       0       0       0       0       0
## 48           1   0   0   0       1       0       0       0       0       0
## 49           1   0   0   0       0       0       0       0       0       0
## 50           1   0   1   0       0       0       1       0       0       0
##    X1C:X23 X1D:X23
## 45       0       1
## 46       0       0
## 47       0       0
## 48       0       0
## 49       0       0
## 50       0       0

download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData")
load("superheroes.RData")
superheroes

##       name alignment gender         publisher
## 1  Magneto       bad   male            Marvel
## 2    Storm      good female            Marvel
## 3 Mystique       bad female            Marvel
## 4   Batman      good   male                DC
## 5    Joker       bad   male                DC
## 6 Catwoman       bad female                DC
## 7  Hellboy      good   male Dark Horse Comics

publishers

##   publisher yr_founded
## 1        DC       1934
## 2    Marvel       1939
## 3     Image       1992

library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
inner_join(superheroes, publishers)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##       name alignment gender publisher yr_founded
## 1  Magneto       bad   male    Marvel       1939
## 2    Storm      good female    Marvel       1939
## 3 Mystique       bad female    Marvel       1939
## 4   Batman      good   male        DC       1934
## 5    Joker       bad   male        DC       1934
## 6 Catwoman       bad female        DC       1934

inner_join(publishers, superheroes)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female

merge(superheroes, publishers, all = TRUE)

##           publisher     name alignment gender yr_founded
## 1 Dark Horse Comics  Hellboy      good   male         NA
## 2                DC   Batman      good   male       1934
## 3                DC    Joker       bad   male       1934
## 4                DC Catwoman       bad female       1934
## 5            Marvel  Magneto       bad   male       1939
## 6            Marvel    Storm      good female       1939
## 7            Marvel Mystique       bad female       1939
## 8             Image     <NA>      <NA>   <NA>       1992

semi_join(superheroes, publishers)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##       name alignment gender publisher
## 1  Magneto       bad   male    Marvel
## 2    Storm      good female    Marvel
## 3 Mystique       bad female    Marvel
## 4   Batman      good   male        DC
## 5    Joker       bad   male        DC
## 6 Catwoman       bad female        DC

semi_join(publishers, superheroes)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##   publisher yr_founded
## 1        DC       1934
## 2    Marvel       1939

left_join(superheroes, publishers)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##       name alignment gender         publisher yr_founded
## 1  Magneto       bad   male            Marvel       1939
## 2    Storm      good female            Marvel       1939
## 3 Mystique       bad female            Marvel       1939
## 4   Batman      good   male                DC       1934
## 5    Joker       bad   male                DC       1934
## 6 Catwoman       bad female                DC       1934
## 7  Hellboy      good   male Dark Horse Comics         NA

left_join(publishers, superheroes)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female
## 7     Image       1992     <NA>      <NA>   <NA>

anti_join(superheroes, publishers)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##      name alignment gender         publisher
## 1 Hellboy      good   male Dark Horse Comics

anti_join(publishers, superheroes)

## Joining, by = "publisher"

## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector

##   publisher yr_founded
## 1     Image       1992

download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt",
              "gapminderDataFiveYear.txt")
gdf <- read.delim("gapminderDataFiveYear.txt")
head(gdf,4)

##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801    779.45
## 2 Afghanistan 1957  9240934      Asia  30.332    820.85
## 3 Afghanistan 1962 10267083      Asia  31.997    853.10
## 4 Afghanistan 1967 11537966      Asia  34.020    836.20

str(gdf)

## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gtbl <- tbl_df(gdf)
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gdfdt <- data.table(gdf)
gdfdt

##           country year      pop continent lifeExp gdpPercap
##    1: Afghanistan 1952  8425333      Asia  28.801    779.45
##    2: Afghanistan 1957  9240934      Asia  30.332    820.85
##    3: Afghanistan 1962 10267083      Asia  31.997    853.10
##    4: Afghanistan 1967 11537966      Asia  34.020    836.20
##    5: Afghanistan 1972 13079460      Asia  36.088    739.98
##   ---                                                      
## 1700:    Zimbabwe 1987  9216418    Africa  62.351    706.16
## 1701:    Zimbabwe 1992 10704340    Africa  60.377    693.42
## 1702:    Zimbabwe 1997 11404948    Africa  46.809    792.45
## 1703:    Zimbabwe 2002 11926563    Africa  39.989    672.04
## 1704:    Zimbabwe 2007 12311143    Africa  43.487    469.71

subset(gdf, lifeExp < 30)

##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia  28.801    779.45
## 1293      Rwanda 1992 7290203    Africa  23.599    737.07

gdf[lifeExp < 30,]

## Error in `[.data.frame`(gdf, lifeExp < 30, ): objet 'lifeExp' introuvable

gdf[gdf$lifeExp < 30,]

##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia  28.801    779.45
## 1293      Rwanda 1992 7290203    Africa  23.599    737.07

filter(gtbl, lifeExp < 30)

## # A tibble: 2 x 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07

subset(gdfdt, lifeExp < 30)

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07

gdfdt[lifeExp < 30,]

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07

gdf[gdf$country == "Italy", c("year", "lifeExp")]

##     year lifeExp
## 769 1952  65.940
## 770 1957  67.810
## 771 1962  69.240
## 772 1967  71.060
## 773 1972  72.190
## 774 1977  73.480
## 775 1982  74.980
## 776 1987  76.420
## 777 1992  77.440
## 778 1997  78.820
## 779 2002  80.240
## 780 2007  80.546

gtbl %>% 
  filter(country == "Italy") %>%
  select(year, lifeExp)

## # A tibble: 12 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952  65.940
##  2  1957  67.810
##  3  1962  69.240
##  4  1967  71.060
##  5  1972  72.190
##  6  1977  73.480
##  7  1982  74.980
##  8  1987  76.420
##  9  1992  77.440
## 10  1997  78.820
## 11  2002  80.240
## 12  2007  80.546

gdfdt[country == "Italy",cbind(year, lifeExp)]

##       year lifeExp
##  [1,] 1952  65.940
##  [2,] 1957  67.810
##  [3,] 1962  69.240
##  [4,] 1967  71.060
##  [5,] 1972  72.190
##  [6,] 1977  73.480
##  [7,] 1982  74.980
##  [8,] 1987  76.420
##  [9,] 1992  77.440
## [10,] 1997  78.820
## [11,] 2002  80.240
## [12,] 2007  80.546

setkey(gdfdt,country)
gdfdt["Italy",cbind(year, lifeExp)]

##       year lifeExp
##  [1,] 1952  65.940
##  [2,] 1957  67.810
##  [3,] 1962  69.240
##  [4,] 1967  71.060
##  [5,] 1972  72.190
##  [6,] 1977  73.480
##  [7,] 1982  74.980
##  [8,] 1987  76.420
##  [9,] 1992  77.440
## [10,] 1997  78.820
## [11,] 2002  80.240
## [12,] 2007  80.546

Data frames (and other format)

library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ldf <- tbl_df(gdf)
glimpse(ldf)

## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afgh...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 199...
## $ pop       <dbl> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, As...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 4...
## $ gdpPercap <dbl> 779.45, 820.85, 853.10, 836.20, 739.98, 786.11, 978.01, 8...

select(ldf,lifeExp)

## # A tibble: 1,704 x 1
##    lifeExp
##      <dbl>
##  1  28.801
##  2  30.332
##  3  31.997
##  4  34.020
##  5  36.088
##  6  38.438
##  7  39.854
##  8  40.822
##  9  41.674
## 10  41.763
## # ... with 1,694 more rows

select(ldf,lifeExp)$lifeExp[1:10]

##  [1] 28.801 30.332 31.997 34.020 36.088 38.438 39.854 40.822 41.674 41.763

ldf[4:5,]

## # A tibble: 2 x 6
##       country  year      pop continent lifeExp gdpPercap
##        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1967 11537966      Asia  34.020    836.20
## 2 Afghanistan  1972 13079460      Asia  36.088    739.98

slice(ldf,4:5)

## # A tibble: 2 x 6
##       country  year      pop continent lifeExp gdpPercap
##        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1967 11537966      Asia  34.020    836.20
## 2 Afghanistan  1972 13079460      Asia  36.088    739.98

filter(ldf,lifeExp<30 | gdpPercap<300)

## # A tibble: 6 x 6
##            country  year      pop continent lifeExp gdpPercap
##             <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1      Afghanistan  1952  8425333      Asia  28.801    779.45
## 2 Congo, Dem. Rep.  2002 55379852    Africa  44.966    241.17
## 3 Congo, Dem. Rep.  2007 64606759    Africa  46.462    277.55
## 4    Guinea-Bissau  1952   580653    Africa  32.500    299.85
## 5          Lesotho  1952   748747    Africa  42.138    298.85
## 6           Rwanda  1992  7290203    Africa  23.599    737.07

rename(ldf,population = pop)

## # A tibble: 1,704 x 6
##        country  year population continent lifeExp gdpPercap
##         <fctr> <int>      <dbl>    <fctr>   <dbl>     <dbl>
##  1 Afghanistan  1952    8425333      Asia  28.801    779.45
##  2 Afghanistan  1957    9240934      Asia  30.332    820.85
##  3 Afghanistan  1962   10267083      Asia  31.997    853.10
##  4 Afghanistan  1967   11537966      Asia  34.020    836.20
##  5 Afghanistan  1972   13079460      Asia  36.088    739.98
##  6 Afghanistan  1977   14880372      Asia  38.438    786.11
##  7 Afghanistan  1982   12881816      Asia  39.854    978.01
##  8 Afghanistan  1987   13867957      Asia  40.822    852.40
##  9 Afghanistan  1992   16317921      Asia  41.674    649.34
## 10 Afghanistan  1997   22227415      Asia  41.763    635.34
## # ... with 1,694 more rows

library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dt <- data.table(gdf)
is.data.frame(dt)

## [1] TRUE

object.size(gdf)

## 72184 bytes

object.size(ldf)

## 72304 bytes

object.size(dt)

## 72880 bytes

df[df$lifeExp<30,]

## [1] Y  X1 X2
## <0 rows> (or 0-length row.names)

subset(df, lifeExp < 30)

## Error in eval(expr, envir, enclos): objet 'lifeExp' introuvable

filter(ldf, lifeExp < 30)

## # A tibble: 2 x 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07

subset(dt, lifeExp < 30)

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07

dt[lifeExp<30]

##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07

setkey(dt, country)
dt["Italy"]

##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe  65.940    4931.4
##  2:   Italy 1957 49182000    Europe  67.810    6248.7
##  3:   Italy 1962 50843200    Europe  69.240    8243.6
##  4:   Italy 1967 52667100    Europe  71.060   10022.4
##  5:   Italy 1972 54365564    Europe  72.190   12269.3
##  6:   Italy 1977 56059245    Europe  73.480   14256.0
##  7:   Italy 1982 56535636    Europe  74.980   16537.5
##  8:   Italy 1987 56729703    Europe  76.420   19207.2
##  9:   Italy 1992 56840847    Europe  77.440   22013.6
## 10:   Italy 1997 57479469    Europe  78.820   24675.0
## 11:   Italy 2002 57926999    Europe  80.240   27968.1
## 12:   Italy 2007 58147733    Europe  80.546   28569.7

dt["Italy",mult="first"]

##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1952 47666000    Europe   65.94    4931.4

dt["Italy",max(lifeExp)]

## [1] 80.546

max(df$lifeExp[df$country=="Italy"])

## Warning in max(df$lifeExp[df$country == "Italy"]): aucun argument pour max ; -
## Inf est renvoyé

## [1] -Inf

setkey(dt, country, lifeExp) 
dt["Italy"]

##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe  65.940    4931.4
##  2:   Italy 1957 49182000    Europe  67.810    6248.7
##  3:   Italy 1962 50843200    Europe  69.240    8243.6
##  4:   Italy 1967 52667100    Europe  71.060   10022.4
##  5:   Italy 1972 54365564    Europe  72.190   12269.3
##  6:   Italy 1977 56059245    Europe  73.480   14256.0
##  7:   Italy 1982 56535636    Europe  74.980   16537.5
##  8:   Italy 1987 56729703    Europe  76.420   19207.2
##  9:   Italy 1992 56840847    Europe  77.440   22013.6
## 10:   Italy 1997 57479469    Europe  78.820   24675.0
## 11:   Italy 2002 57926999    Europe  80.240   27968.1
## 12:   Italy 2007 58147733    Europe  80.546   28569.7

dt[.("Italy",77.44)]

##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe   77.44     22014

dt[.("Italy",77)]

##    country year pop continent lifeExp gdpPercap
## 1:   Italy   NA  NA        NA      77        NA

dt[.("Italy",77),roll="nearest"]

##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe      77     22014

df[df$country == "Italy", c("year", "lifeExp")]

## Error in `[.data.frame`(df, df$country == "Italy", c("year", "lifeExp")): undefined columns selected

ldf %>%
  filter(country == "Italy") %>%
  select(year, lifeExp)

## # A tibble: 12 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952  65.940
##  2  1957  67.810
##  3  1962  69.240
##  4  1967  71.060
##  5  1972  72.190
##  6  1977  73.480
##  7  1982  74.980
##  8  1987  76.420
##  9  1992  77.440
## 10  1997  78.820
## 11  2002  80.240
## 12  2007  80.546

small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")]

## Error in `[.data.frame`(df, df$country %in% c("France", "Italy", "Spain"), : undefined columns selected

library(tidyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
str(small_df)

## Error in str(small_df): objet 'small_df' introuvable

table_df=spread(small_df,"country","lifeExp")

## Error in spread(small_df, "country", "lifeExp"): objet 'small_df' introuvable

table_df

## Error in eval(expr, envir, enclos): objet 'table_df' introuvable

data_df=gather(table_df,"country","lifeExp",2:4)

## Error in gather(table_df, "country", "lifeExp", 2:4): objet 'table_df' introuvable

data_df

## Error in eval(expr, envir, enclos): objet 'data_df' introuvable

library(reshape2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dcast(small_df, formula = country~year,
      value.var="lifeExp")

## Error in match(x, table, nomatch = 0L): objet 'small_df' introuvable

table_df= dcast(small_df, formula = year~country,
                value.var="lifeExp")

## Error in match(x, table, nomatch = 0L): objet 'small_df' introuvable

table_df

## Error in eval(expr, envir, enclos): objet 'table_df' introuvable

melt(table_df, id.vars=c("year"),
     value.name = "lifeExp",
     variable.name="country")

## Error in melt(table_df, id.vars = c("year"), value.name = "lifeExp", variable.name = "country"): objet 'table_df' introuvable

aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country))

## Error in aggregate(small_df$lifeExp, FUN = max, by = list(small_df$country)): objet 'small_df' introuvable

tapply(small_df$lifeExp,factor(small_df$country),max)

## Error in factor(small_df$country): objet 'small_df' introuvable

small_ldf = filter(ldf,country %in% c("France","Italy","Spain"))
group_by(small_ldf,country) %>% summarize(max(lifeExp))

## # A tibble: 3 x 2
##   country `max(lifeExp)`
##    <fctr>          <dbl>
## 1  France         80.657
## 2   Italy         80.546
## 3   Spain         80.941

filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))

## # A tibble: 3 x 2
##   country `max(lifeExp)`
##    <fctr>          <dbl>
## 1  France         80.657
## 2   Italy         80.546
## 3   Spain         80.941

filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))

## # A tibble: 3 x 2
##   country `max(lifeExp)`
##    <fctr>          <dbl>
## 1  France         80.657
## 2   Italy         80.546
## 3   Spain         80.941

plot(df$gdpPercap,df$lifeExp)

## Warning in min(x): aucun argument trouvé pour min ; Inf est renvoyé

## Warning in max(x): aucun argument pour max ; -Inf est renvoyé

## Warning in min(x): aucun argument trouvé pour min ; Inf est renvoyé

## Warning in max(x): aucun argument pour max ; -Inf est renvoyé

## Error in plot.window(...): 'xlim' nécessite des valeurs finies

plot of chunk unnamed-chunk-166

select(ldf,gdpPercap,lifeExp) %>% plot()

plot of chunk unnamed-chunk-167

setkey(dt, country)
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]

##    country     V1
## 1:  France 80.657
## 2:   Italy 80.546
## 3:   Spain 80.941

dt[c("France","Italy","Spain"),max(lifeExp),by="country"]

##    country     V1
## 1:  France 80.657
## 2:   Italy 80.546
## 3:   Spain 80.941

head(df[with(df,order(-lifeExp)),])

## Error in order(-lifeExp): objet 'lifeExp' introuvable

head(dt[order(-lifeExp)])

##             country year       pop continent lifeExp gdpPercap
## 1:            Japan 2007 127467972      Asia  82.603     31656
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725
## 3:            Japan 2002 127065841      Asia  82.000     28605
## 4:          Iceland 2007    301931    Europe  81.757     36181
## 5:      Switzerland 2007   7554661    Europe  81.701     37506
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209

head(setorder(dt,-lifeExp))

##             country year       pop continent lifeExp gdpPercap
## 1:            Japan 2007 127467972      Asia  82.603     31656
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725
## 3:            Japan 2002 127065841      Asia  82.000     28605
## 4:          Iceland 2007    301931    Europe  81.757     36181
## 5:      Switzerland 2007   7554661    Europe  81.701     37506
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209

arrange(ldf,-lifeExp)

## # A tibble: 1,704 x 6
##             country  year       pop continent lifeExp gdpPercap
##              <fctr> <int>     <dbl>    <fctr>   <dbl>     <dbl>
##  1            Japan  2007 127467972      Asia  82.603     31656
##  2 Hong Kong, China  2007   6980412      Asia  82.208     39725
##  3            Japan  2002 127065841      Asia  82.000     28605
##  4          Iceland  2007    301931    Europe  81.757     36181
##  5      Switzerland  2007   7554661    Europe  81.701     37506
##  6 Hong Kong, China  2002   6762476      Asia  81.495     30209
##  7        Australia  2007  20434176   Oceania  81.235     34435
##  8            Spain  2007  40448191    Europe  80.941     28821
##  9           Sweden  2007   9031088    Europe  80.884     33860
## 10           Israel  2007   6426679      Asia  80.745     25523
## # ... with 1,694 more rows

df$gdp=df$pop*df$gdpPercap

## Error in `$<-.data.frame`(`*tmp*`, "gdp", value = numeric(0)): replacement has 0 rows, data has 50

df = within(df,gdp = pop*gdpPercap)

## Error in eval(expr, envir, enclos): argument manquant, sans valeur associée par défaut

df = within(df,gdp <- pop*gdpPercap)

## Error in eval(expr, envir, enclos): objet 'pop' introuvable

dt[,gdp:=pop*gdpPercap]
head(dt)

##             country year       pop continent lifeExp gdpPercap        gdp
## 1:            Japan 2007 127467972      Asia  82.603     31656 4.0351e+12
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725 2.7730e+11
## 3:            Japan 2002 127065841      Asia  82.000     28605 3.6347e+12
## 4:          Iceland 2007    301931    Europe  81.757     36181 1.0924e+10
## 5:      Switzerland 2007   7554661    Europe  81.701     37506 2.8335e+11
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209 2.0429e+11

mutate(ldf,gdp=pop*gdpPercap)

## # A tibble: 1,704 x 7
##        country  year      pop continent lifeExp gdpPercap        gdp
##         <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>      <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801    779.45 6.5671e+09
##  2 Afghanistan  1957  9240934      Asia  30.332    820.85 7.5854e+09
##  3 Afghanistan  1962 10267083      Asia  31.997    853.10 8.7589e+09
##  4 Afghanistan  1967 11537966      Asia  34.020    836.20 9.6480e+09
##  5 Afghanistan  1972 13079460      Asia  36.088    739.98 9.6786e+09
##  6 Afghanistan  1977 14880372      Asia  38.438    786.11 1.1698e+10
##  7 Afghanistan  1982 12881816      Asia  39.854    978.01 1.2599e+10
##  8 Afghanistan  1987 13867957      Asia  40.822    852.40 1.1821e+10
##  9 Afghanistan  1992 16317921      Asia  41.674    649.34 1.0596e+10
## 10 Afghanistan  1997 22227415      Asia  41.763    635.34 1.4122e+10
## # ... with 1,694 more rows

countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE)
countries = countries[-1,]
head(countries)

##       Country Interational.Direct.Dial..IDD..Code ISO.3166 Car.Code
## 2 Afghanistan                                  93       AF         
## 3     Albania                                 355       AL         
## 4     Algeria                                 213       DZ         
## 5     Andorra                              33 628       AD         
## 6      Angola                                 244       AO         
## 7    Anguilla                               1 809       AI         
##         Capital              Currency ISO.Currency Digraph  ICAO   Language
## 2         Kabul               Afghani          AFA      AF   YA-     Pashto
## 3        Tirana                   Lek          ALL      AL   ZA-   Albanian
## 4       Algiers                 Dinar          DZD      DZ   7T-     Arabic
## 5 And. La Vella          French Franc          FRF      AD   C3-    Catalan
## 6        Luanda           New Irwanza          AON      AO   D2- Portuguese
## 7               East Caribbean Dollar          XCD      AI VP-LA           
##   ISO.639
## 2      PS
## 3      SQ
## 4      AR
## 5      CA
## 6      PT
## 7

df2 = countries[,c("Country","Capital")]
names(df2)=c("countries","capital")

df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country))

## Error in aggregate.data.frame(as.data.frame(x), ...): no rows to aggregate

names(df1)=c("countries","lifeExp")

## Error in names(df1) = c("countries", "lifeExp"): objet 'df1' introuvable

dim(df1)

## Error in eval(expr, envir, enclos): objet 'df1' introuvable

dim(df2)

## [1] 263   2

tail(merge(df1,df2))

## Error in merge(df1, df2): objet 'df1' introuvable

tail(merge(df1,df2,all.x=TRUE))

## Error in merge(df1, df2, all.x = TRUE): objet 'df1' introuvable

tail(merge(df1,df2,all.y=TRUE))

## Error in merge(df1, df2, all.y = TRUE): objet 'df1' introuvable

ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) 
names(ldf1)=c("countries","lifeExp")
ldf2 <- tbl_df(df2)
inner_join(ldf1,ldf2)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing to
## character vector

## # A tibble: 127 x 3
##      countries lifeExp      capital
##          <chr>   <dbl>       <fctr>
##  1 Afghanistan  43.828        Kabul
##  2     Albania  76.423       Tirana
##  3     Algeria  72.301      Algiers
##  4      Angola  42.731       Luanda
##  5   Argentina  75.320 Buenos Aires
##  6   Australia  81.235     Canberra
##  7     Austria  79.829       Vienna
##  8     Bahrain  75.635       Manama
##  9  Bangladesh  64.062        Dacca
## 10     Belgium  79.441     Brussels
## # ... with 117 more rows

left_join(ldf1,ldf2)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing to
## character vector

## # A tibble: 142 x 3
##      countries lifeExp      capital
##          <chr>   <dbl>       <fctr>
##  1 Afghanistan  43.828        Kabul
##  2     Albania  76.423       Tirana
##  3     Algeria  72.301      Algiers
##  4      Angola  42.731       Luanda
##  5   Argentina  75.320 Buenos Aires
##  6   Australia  81.235     Canberra
##  7     Austria  79.829       Vienna
##  8     Bahrain  75.635       Manama
##  9  Bangladesh  64.062        Dacca
## 10     Belgium  79.441     Brussels
## # ... with 132 more rows

ldf1 %>% inner_join(ldf2)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing to
## character vector

## # A tibble: 127 x 3
##      countries lifeExp      capital
##          <chr>   <dbl>       <fctr>
##  1 Afghanistan  43.828        Kabul
##  2     Albania  76.423       Tirana
##  3     Algeria  72.301      Algiers
##  4      Angola  42.731       Luanda
##  5   Argentina  75.320 Buenos Aires
##  6   Australia  81.235     Canberra
##  7     Austria  79.829       Vienna
##  8     Bahrain  75.635       Manama
##  9  Bangladesh  64.062        Dacca
## 10     Belgium  79.441     Brussels
## # ... with 117 more rows

dt1 = dt[,max(lifeExp),by="country"]
setnames(dt1,c("countries","lifeExp"))
dt2 <- data.table(df2)
setkey(dt1,"countries")
setkey(dt2,"countries")
dt1[dt2,nomatch=0]

##        countries lifeExp      capital
##   1: Afghanistan  43.828        Kabul
##   2:     Albania  76.423       Tirana
##   3:     Algeria  72.301      Algiers
##   4:      Angola  42.731       Luanda
##   5:   Argentina  75.320 Buenos Aires
##  ---                                 
## 123:     Uruguay  76.384   Montevideo
## 124:   Venezuela  73.747      Caracas
## 125:     Vietnam  74.249        Hanoi
## 126:      Zambia  51.821       Lusaka
## 127:    Zimbabwe  62.351       Harare

dt1[dt2]

##        countries lifeExp       capital
##   1: Afghanistan  43.828         Kabul
##   2:     Albania  76.423        Tirana
##   3:     Algeria  72.301       Algiers
##   4:     Andorra      NA And. La Vella
##   5:      Angola  42.731        Luanda
##  ---                                  
## 259:       Yemen      NA        Sana'a
## 260:  Yugoslavia      NA      Belgrade
## 261:       Zaire      NA      Kinshasa
## 262:      Zambia  51.821        Lusaka
## 263:    Zimbabwe  62.351        Harare

dt2[dt1]

##               countries      capital lifeExp
##   1:        Afghanistan        Kabul  43.828
##   2:            Albania       Tirana  76.423
##   3:            Algeria      Algiers  72.301
##   4:             Angola       Luanda  42.731
##   5:          Argentina Buenos Aires  75.320
##  ---                                        
## 138:            Vietnam        Hanoi  74.249
## 139: West Bank and Gaza           NA  73.422
## 140:        Yemen, Rep.           NA  62.698
## 141:             Zambia       Lusaka  51.821
## 142:           Zimbabwe       Harare  62.351

library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(lubridate,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(plyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(LaF,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
ssn <- laf_open_csv("temp",sep="\t", 
                    column_types=c("character","integer","numeric","character","numeric","numeric"),
                    column_names=names(df)[1:6])
object.size(ssn)

## 2872 bytes

str(ssn)

## Formal class 'laf' [package "LaF"] with 8 slots
##   ..@ file_id      : int 0
##   ..@ filename     : chr "temp"
##   ..@ file_type    : chr "csv"
##   ..@ column_names : chr [1:6] "Y" "X1" "X2" "NA." ...
##   ..@ column_types : int [1:6] 3 1 0 3 0 0
##   ..@ column_widths: int(0) 
##   ..@ levels       : list()
##   ..@ options      :List of 4
##   .. ..$ sep : chr "\t"
##   .. ..$ dec : chr "."
##   .. ..$ skip: int 0
##   .. ..$ trim: logi FALSE

nrow(ssn)

## NULL

go_through <- which(ssn[1:(nrow(ssn)-1),'country'] != ssn[2:nrow(ssn),'country'])+1

## Error in 1:(nrow(ssn) - 1): l'argument est de longueur nulle

if(go_through[length(go_through)] != nrow(ssn)) go_through <- c(go_through, nrow(ssn))

## Error in eval(expr, envir, enclos): objet 'go_through' introuvable

go_through <- cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))]-1, go_through[length(go_through)]))

## Error in cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))] - : objet 'go_through' introuvable

worst_life_exp <- function(s){
  data <- ssn[go_through[s,1]:go_through[s,2],c("country","year","lifeExp")]
  data[which.min(data$lifeExp),]
}
worst_life_exp(10)

## Error in ssn[go_through[s, 1]:go_through[s, 2], c("country", "year", "lifeExp")]: objet 'go_through' introuvable

Graphs with ggplot2

library(gamair,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
data(chicago)
head(chicago)

##   death pm10median pm25median o3median so2median    time tmpd
## 1   130   -7.43354         NA  -19.592   1.92804 -2556.5 31.5
## 2   150         NA         NA  -19.039  -0.98556 -2555.5 33.0
## 3   101   -0.82653         NA  -20.217  -1.89142 -2554.5 33.0
## 4   135    5.56646         NA  -19.676   6.13934 -2553.5 29.0
## 5   126         NA         NA  -19.217   2.27846 -2552.5 32.0
## 6   130    6.56646         NA  -17.634   9.85858 -2551.5 40.0

base=data.frame(death=chicago$death,
                temp_F=chicago$tmpd,
                o3=chicago$o3median,
                date=seq(as.Date("1987-01-01"),
                         as.Date("2000-12-31"),by=1))
base$temp_C <- (base$temp_F-32)/1.8
base$year <- substring(base$date,1,4)

date2season <- function(date){
  m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m"))
  d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d"))
  s <- NA
  if(m %in% c(1,2) | ((m==12)&(d>=21)) |  ((m==3)&(d<21))) s <- "winter"
  if(m %in% c(4,5) | ((m==3)&(d>=21)) |  ((m==6)&(d<21))) s <- "spring"
  if(m %in% c(7,8) | ((m==6)&(d>=21)) |  ((m==9)&(d<21))) s <- "summer"
  if(m %in% c(10,11) | ((m==9)&(d>=21)) |  ((m==12)&(d<21))) s <- "autumn"
  return(s)}
base$season <- sapply(base$date,date2season)
head(base)

##   death temp_F      o3       date   temp_C year season
## 1   130   31.5 -19.592 1987-01-01 -0.27778 1987 winter
## 2   150   33.0 -19.039 1987-01-02  0.55556 1987 winter
## 3   101   33.0 -20.217 1987-01-03  0.55556 1987 winter
## 4   135   29.0 -19.676 1987-01-04 -1.66667 1987 winter
## 5   126   32.0 -19.217 1987-01-05  0.00000 1987 winter
## 6   130   40.0 -17.634 1987-01-06  4.44444 1987 winter

plot(base[,c("date", "temp_C")])

plot of chunk unnamed-chunk-184

library(ggplot2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
g <- ggplot(base, aes(date, temp_C)) +
  geom_point()
g

plot of chunk unnamed-chunk-185

ggsave(file="figure.pdf", scale=3)

## Saving 21 x 21 in image

g <- g+labs(title='Temperature')
g <- g+ggtitle('Temperature')
g

plot of chunk unnamed-chunk-186

g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red"))

plot of chunk unnamed-chunk-187

g<-g+labs(
  x="Date", 
  y=expression(paste("Temperature ( ", degree ~ C, " )")), 
  title="Temperature in Chicago")
g

plot of chunk unnamed-chunk-188

g+ theme(
  axis.title.x = element_text(color="blue"),
  axis.title.y = element_text(color="blue")   
)

plot of chunk unnamed-chunk-189

g + ylim(c(-20,60))

## Warning: Removed 10 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-190

g <- g + xlim(as.Date(c("1995-01-01","1999-01-01")))
g

## Warning: Removed 3652 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-190

base2=data.frame(
  today=base$temp_C[1:5113],
  tomorrow=base$temp_C[2:5114])
ggplot(base2, aes(today, tomorrow))+     geom_point()+ coord_equal()

plot of chunk unnamed-chunk-191

g <- ggplot(base, aes(date, temp_C, color=factor(season)))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))+
  geom_point()
g+theme(legend.title=element_blank())+
  guides(colour = guide_legend(override.aes = list(size=4)))

## Warning: Removed 3652 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-192

g<-ggplot(base, aes(date, temp_C))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))
g+geom_line(color="grey")

## Warning: Removed 3652 rows containing missing values (geom_path).

plot of chunk unnamed-chunk-193

g+geom_line(aes(color="temperature"))+
  scale_colour_manual(name='', values=c('temperature'='grey'))

## Warning: Removed 3652 rows containing missing values (geom_path).

plot of chunk unnamed-chunk-194

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + 
  scale_color_manual(values=
                       c("dodgerblue4", "darkolivegreen4",
                         "darkorchid3", "goldenrod1"))

plot of chunk unnamed-chunk-195

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")

plot of chunk unnamed-chunk-196

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient(low="blue", high="red")

plot of chunk unnamed-chunk-197

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient2(low="blue", mid="white", high="red")

plot of chunk unnamed-chunk-198

ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+
  theme(panel.background = element_rect(fill = 'light yellow'))

plot of chunk unnamed-chunk-199

ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+
  facet_wrap(~year, ncol=2, scales="free")

plot of chunk unnamed-chunk-200

library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+
  theme_excel()+scale_colour_excel()+
  theme(legend.title=element_blank())

plot of chunk unnamed-chunk-201

library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like 'The Economist'")+
  theme_economist()+
  theme(legend.title=element_blank())

plot of chunk unnamed-chunk-202

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rCharts'

## Error in eval(expr, envir, enclos): impossible de trouver la fonction "nPlot"

## Error in eval(expr, envir, enclos): objet 'n1' introuvable

OpenStreetMap

library(OpenStreetMap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map <- openmap(c(lat= 51.516, lon= -.141), c(lat= 51.511, lon= -.133))
map <- openproj(map, projection = "+init=epsg:27700") 
plot(map)

plot of chunk unnamed-chunk-203

loc="http://freakonometrics.free.fr/cholera.zip"
download.file(loc,destfile="cholera.zip")
unzip("cholera.zip", exdir="./")
library(maptools,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## Checking rgeos availability: TRUE

deaths<-readShapePoints("Cholera_Deaths")

## Warning: use rgdal::readOGR or sf::st_read

head(deaths@coords)

##   coords.x1 coords.x2
## 0    529309    181031
## 1    529312    181025
## 2    529314    181020
## 3    529317    181014
## 4    529321    181008
## 5    529337    181006

plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )

plot of chunk unnamed-chunk-205

X <- deaths@coords
library(KernSmooth)

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))

## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range

library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20)

plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )
image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs)
contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE)

plot of chunk unnamed-chunk-207

Google Map

library(ggmap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
get_london <- get_map(c(-.137,51.513), zoom=17)

## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513,-0.137&zoom=17&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false

london <- ggmap(get_london)

OpenStreetMap

london

plot of chunk unnamed-chunk-209

df_deaths <- data.frame(X)
library(sp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## rgdal: version: 1.2-15, (SVN revision 691)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.1.2, released 2016/10/24
##  Path to GDAL shared files: /usr/share/gdal/2.1
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-5

coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))

OpenStreetMap

london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red")

## Warning: Removed 9 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-211

london <- london + geom_point(aes(x=coords.x1, y=coords.x2), 
data=data.frame(df_deaths@coords),col="red") +
  geom_density2d(data = data.frame(df_deaths@coords), 
aes(x = coords.x1, y=coords.x2), size = 0.3) + 
  stat_density2d(data = data.frame(df_deaths@coords), 
 aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..),
 size = 0.01, bins = 16, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
 guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE)

london

## Warning: Removed 9 rows containing non-finite values (stat_density2d).

## Warning: Removed 9 rows containing non-finite values (stat_density2d).

## Warning: Removed 9 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-213

require(leaflet)
deaths <- readShapePoints("Cholera_Deaths")

## Warning: use rgdal::readOGR or sf::st_read

df_deaths <- data.frame(deaths@coords)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
df=data.frame(df_deaths@coords)
lng=df$coords.x1
lat=df$coords.x2
m = leaflet()%>% addTiles()

m %>% fitBounds(-.141,  51.511, -.133, 51.516)

rd=.5
op=.8
clr="blue"
m = leaflet() %>% addTiles()
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)

## Error in loadNamespace(name): there is no package called 'webshot'

X=cbind(lng,lat)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))

## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range

x=kde2d$x1
y=kde2d$x2
z=kde2d$fhat
CL=contourLines(x , y , z)
m = leaflet() %>% addTiles() 
m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)

## Error in loadNamespace(name): there is no package called 'webshot'

m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)

m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)

## Error in loadNamespace(name): there is no package called 'webshot'

m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)

## Error in loadNamespace(name): there is no package called 'webshot'

m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
  addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
  addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")

## Error in loadNamespace(name): there is no package called 'webshot'

Map with googleVis

library(googleVis)
df=data.frame(state=c("AU-WA","AU-VIC","AU-NT", "AU-NSW", "AU-SA","AU-QLD","AU-TAS","NZ"),
R_users=c(323,425,154,486,201,195,87,123))
Geo <- gvisGeoChart(df,
locationvar="state",colorvar=c("R_users"),
options=list(region="AU",
dataMode="regions",resolution="provinces",
width="80%", height="80%"))

Map with googleVis

library(XML,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
year <- 2012
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
tabs[1]

## $`NULL`
##     #                    Name          Date Wind Pres Cat  
## 1   1  Tropical Storm ALBERTO     19-23 MAY   50  995   -  
## 2   2    Tropical Storm BERYL 25 MAY- 2 JUN   60  992   -  
## 3   3       Hurricane-1 CHRIS     17-24 JUN   75  974   1  
## 4   4    Tropical Storm DEBBY     23-27 JUN   55  990   -  
## 5   5     Hurricane-2 ERNESTO      1-10 AUG   85  973   2  
## 6   6 Tropical Storm FLORENCE      3- 8 AUG   50 1002   -  
## 7   7   Tropical Storm HELENE      9-19 AUG   40 1004   -  
## 8   8      Hurricane-2 GORDON     15-21 AUG   95  965   2  
## 9   9       Hurricane-1 ISAAC 20 AUG- 1 SEP   70  965   1  
## 10 10    Tropical Storm JOYCE     21-24 AUG   35 1006   -  
## 11 11        Hurricane-2 KIRK 28 AUG- 3 SEP   90  970   2  
## 12 12      Hurricane-1 LESLIE 28 AUG-12 SEP   70  968   1  
## 13 13     Hurricane-3 MICHAEL      2-12 SEP  100  964   3  
## 14 14      Hurricane-1 NADINE 10 SEP- 4 OCT   80  978   1  
## 15 15    Tropical Storm OSCAR      2- 5 OCT   45  994   -  
## 16 16    Tropical Storm PATTY     10-13 OCT   40 1005   -  
## 17 17      Hurricane-1 RAFAEL     12-26 OCT   80  966   1  
## 18 18       Hurricane-3 SANDY     21-31 OCT  100  940   3  
## 19 19     Tropical Storm TONY     21-26 OCT   45 1000   -

storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" "))
storms

##  [1] "Tropical"    "Storm"       "ALBERTO"     "Tropical"    "Storm"      
##  [6] "BERYL"       "Hurricane-1" "CHRIS"       "Tropical"    "Storm"      
## [11] "DEBBY"       "Hurricane-2" "ERNESTO"     "Tropical"    "Storm"      
## [16] "FLORENCE"    "Tropical"    "Storm"       "HELENE"      "Hurricane-2"
## [21] "GORDON"      "Hurricane-1" "ISAAC"       "Tropical"    "Storm"      
## [26] "JOYCE"       "Hurricane-2" "KIRK"        "Hurricane-1" "LESLIE"     
## [31] "Hurricane-3" "MICHAEL"     "Hurricane-1" "NADINE"      "Tropical"   
## [36] "Storm"       "OSCAR"       "Tropical"    "Storm"       "PATTY"      
## [41] "Hurricane-1" "RAFAEL"      "Hurricane-3" "SANDY"       "Tropical"   
## [46] "Storm"       "TONY"

index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx")
nstorms  <- storms[!index]
nstorms

##  [1] "ALBERTO"  "BERYL"    "CHRIS"    "DEBBY"    "ERNESTO"  "FLORENCE"
##  [7] "HELENE"   "GORDON"   "ISAAC"    "JOYCE"    "KIRK"     "LESLIE"  
## [13] "MICHAEL"  "NADINE"   "OSCAR"    "PATTY"    "RAFAEL"   "SANDY"   
## [19] "TONY"

TRACK=NULL

for(i in length(nstorms):1){
  if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN"
  if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA"
  if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN"
  if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL"
  if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA"
  if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN"
  if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE"
  if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD"
  if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH"
  if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"}

for(i in length(nstorms):1){
  loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
  track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))

  names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
  track$LAT=as.numeric(as.character(track$LAT))
  track$LON=as.numeric(as.character(track$LON))
  track$WIND=as.numeric(as.character(track$WIND))
  track$PR=as.numeric(as.character(track$PR))
  track$year=year
  track$name=nstorms[i]
  TRACK=rbind(TRACK,track)}

tail(TRACK,10)

##     ADV  LAT   LON         TIME WIND   PR           STAT year    name
## 645  11 30.4 -79.1  05/21/12Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 646  12 30.5 -78.3  05/21/18Z     35 1006 TROPICAL STORM 2012 ALBERTO
## 647  13 30.7 -77.1  05/22/00Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 648  14 31.5 -76.1  05/22/06Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 649  15 32.5 -74.7  05/22/12Z     30 1008            LOW 2012 ALBERTO
## 650  16 33.4 -73.4  05/22/18Z     30 1008            LOW 2012 ALBERTO
## 651  17 34.1 -71.9  05/23/00Z     25 1010            LOW 2012 ALBERTO
## 652  18 34.9 -70.1  05/23/06Z     25 1011            LOW 2012 ALBERTO
## 653  19 35.5 -67.9  05/23/12Z     25 1012            LOW 2012 ALBERTO
## 654  20 35.9 -66.0  05/23/18Z     25 1012            LOW 2012 ALBERTO

extract.track <- function(year=2012,p=TRUE){
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
storms <- unlist(strsplit(as.character(
tabs[[1]]$Name),split=" "))
index <- storms%in%c("Tropical","Storm",paste("Hurricane-",1:6,sep=""),"Depression","Subtropical","Extratropical","Low",paste("Storm-",1:6,sep=""),
"Xxx")
nstorms  <- storms[!index]
TRACK=NULL
for(i in length(nstorms):1){
loc=paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
track=read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
track$LAT=as.numeric(as.character(track$LAT))
track$LON=as.numeric(as.character(track$LON))
track$WIND=as.numeric(as.character(track$WIND))
track$PR=as.numeric(as.character(track$PR))
track$year=year
track$name=nstorms[i]
TRACK=rbind(TRACK,track)
if(p==TRUE){  cat(year,i,nstorms[i],nrow(track),"\n")}}
return(TRACK)}

TOTTRACK=NULL
#  for(y in 2012:1851){
#    TOTTRACK=rbind(TOTTRACK,extract.track(y))
#  }

loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata"
download.file(loc,destfile="TRACK-ATLANTIC.Rdata")
load("TRACK-ATLANTIC.Rdata")
dim(TOTTRACK)

## [1] 26712     9

library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(maps,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map("world", col="light yellow", fill=TRUE)

plot of chunk unnamed-chunk-234

map("world", col="light yellow", fill=TRUE)
for(n in unique(TOTTRACK$name)){
  lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))}

plot of chunk unnamed-chunk-235

Shapefiles

loc="http://freakonometrics.free.fr/Italy.zip"
download.file(loc,destfile="Italy.zip")
unzip("Italy.zip", exdir="./")
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE)
names(ita1)

## [1] "ID_0"      "ISO"       "NAME_0"    "ID_1"      "NAME_1"    "NL_NAME_1"
## [7] "VARNAME_1" "TYPE_1"    "ENGTYPE_1"

plot(ita1,col="light green")

plot of chunk unnamed-chunk-237

ita1@data[,"NAME_1"][1:6]

## [1] Abruzzo        Apulia         Basilicata     Calabria       Campania      
## [6] Emilia-Romagna
## 20 Levels: Abruzzo Apulia Basilicata Calabria Campania ... Veneto

pos<-which(ita1@data[,"NAME_1"]=="Sicily")
Poly_Sicily<-ita1[pos,]

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)

plot of chunk unnamed-chunk-239

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
abline(v=5:20,col="light blue")
abline(h=35:50,col="light blue")
axis(1)
axis(2)

plot of chunk unnamed-chunk-240

pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily"))
ita_south <- ita1[pos,]
ita_north <- ita1[-pos,]
plot(ita1)
plot(ita_south,col="red",add=TRUE)
plot(ita_north,col="blue",add=TRUE)

plot of chunk unnamed-chunk-241

library(xlsx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
loc="http://freakonometrics.free.fr/Italy_codes.xlsx"
download.file(loc,destfile="Italy_codes.xlsx")
data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE)
names(data_codes)[1]="NAME_1"
ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE)
pos<-which(ita2@data[,"GROUP"] == "SOUTH")

## Error in which(ita2@data[, "GROUP"] == "SOUTH"): tentative d'obtenir le slot "data" d'un objet (classe "data.frame") qui n'est pas un objet S4

library(rgeos,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)

## rgeos version: 0.3-26, (SVN revision 560)
##  GEOS runtime version: 3.5.1-CAPI-1.9.1 r4246 
##  Linking to sp version: 1.2-5 
##  Polygon checking: TRUE

ita_s<-gUnionCascaded(ita_south)
ita_n<-gUnionCascaded(ita_north)
plot(ita1)
plot(ita_s,col="red",add=TRUE)
plot(ita_n,col="blue",add=TRUE)

plot of chunk unnamed-chunk-243

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
gCentroid(Poly_Sicily,byid=TRUE)

## class       : SpatialPoints 
## features    : 1 
## extent      : 14.14668, 14.14668, 37.58842, 37.58842  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0

points(gCentroid(Poly_Sicily,byid=TRUE),pch=19,col="red")

plot of chunk unnamed-chunk-244

G <- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)

plot of chunk unnamed-chunk-245

c1 <-G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords

##          x        y
## 1 11.06947 44.03287
## 1 14.20034 41.74879

plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)

plot of chunk unnamed-chunk-247

Linear Model

\[ y_i=\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta}+\varepsilon_i=\beta_0 + [\beta_1 x_{1,i}+\cdots+ \beta_k x_{k,i}]+\varepsilon_i \]

See linear model, where $\varepsilon_i\sim\mathcal{N}(0,\sigma^2)$ i.id.

\[ (Y\vert \boldsymbol{X}=\boldsymbol{x})\sim \mathcal{N}(\boldsymbol{x}^{\text{T}}\boldsymbol{\beta}, \sigma^2) \] i.e. \[ \mathbb{E}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\boldsymbol{x}^{\text{T}}\boldsymbol{\beta} \] and homoscedastic model, \[ \text{Var}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\sigma^2 \]

Least squares (and maximum likelihood) estimator \[ \widehat{\boldsymbol{\beta}}=\text{argmin} \left\lbrace \sum_{i=1}^n (y_i-\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta})^2 \right\rbrace =(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}\boldsymbol{y}\]

Linear Model

plot(cars)

plot of chunk unnamed-chunk-248

model <- lm(dist ~ speed, data=cars)

Linear Model

summary(model)

## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

Linear Model

X=cbind(1,cars$speed)
Y=cars$dist
solve(crossprod(X,X),crossprod(X,Y))

##            [,1]
## [1,] -17.579095
## [2,]   3.932409

model$coefficients

## (Intercept)       speed 
##  -17.579095    3.932409

Linear Model

summary(model)$sigma^2*solve(crossprod(X,X))

##           [,1]       [,2]
## [1,] 45.676514 -2.6588234
## [2,] -2.658823  0.1726509

vcov(model)

##             (Intercept)      speed
## (Intercept)   45.676514 -2.6588234
## speed         -2.658823  0.1726509

n=nrow(cars)

Linear Model

confint(model)

##                  2.5 %    97.5 %
## (Intercept) -31.167850 -3.990340
## speed         3.096964  4.767853

model$coefficients[1]+qt(c(.025,.975),n-2)* summary(model)$coefficients[1,2]

## [1] -31.16785  -3.99034

Linear Model

coefficient of determination $\displaystyle{R^2 = \frac{\text{explained variance}}{\text{total variance}}}$

summary(model)$r.squared

## [1] 0.6510794

1-deviance(model)/sum((Y-mean(Y))^2)

## [1] 0.6510794

\[ \text{Var}[Y]= \text{Var}[\mathbb{E}[Y\vert X]]+ \mathbb{E}[\text{Var}[Y\vert X]] \]

see analysis of variance

Linear Model

\[ \overline{R}^2 = 1-[1-R^2]\cdot \frac{n-1}{n-(k-1)-1} \]

summary(model)$adj.r.squared

## [1] 0.6438102

Linear Model

anova(lm(dist~speed,data=cars),lm(dist~1,data=cars))

## Analysis of Variance Table
## 
## Model 1: dist ~ speed
## Model 2: dist ~ 1
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)    
## 1     48 11354                                 
## 2     49 32539 -1    -21186 89.567 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear Model

plot(cars)
abline(model,col="red")

plot of chunk unnamed-chunk-257

Linear Model

plot(cars)
abline(model,col="red")
x=seq(2,26)
y=predict(model, newdata=data.frame(speed=x))
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-258

Linear Model

y=predict(model, newdata=data.frame(speed=x), interval = "confidence")
head(y)

##         fit        lwr       upr
## 1 -9.714277 -21.733068  2.304513
## 2 -5.781869 -17.026591  5.462853
## 3 -1.849460 -12.329543  8.630624
## 4  2.082949  -7.644150 11.810048
## 5  6.015358  -2.973341 15.004056
## 6  9.947766   1.678977 18.216556

\[ \text{Var}[\widehat{Y}(\boldsymbol{x})]=\text{Var}[\boldsymbol{x}^{\text{T}}\widehat{\boldsymbol{\beta}}]=\boldsymbol{x}^{\text{T}}\text{Var}[\widehat{\boldsymbol{\beta}}]\boldsymbol{x} =\widehat{\sigma}^2 \boldsymbol{x}^{\text{T}}[\boldsymbol{X}^{\text{T}}\boldsymbol{X}]^{-1}\boldsymbol{x} \]

Linear Model

plot(cars)
polygon(c(x,rev(x)),c(y[,2],rev(y[,3])),col=rgb(0,0,1,.4),border=NA)
lines(x,y[,1],lwd=2,col="blue")

plot of chunk unnamed-chunk-260

Linear Model and Bagging

Method 1

Y=matrix(NA,1000,length(x))
for(b in 1:1000){
  idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
  modelb <- lm(dist ~ speed, data=cars[idx,])
  Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}

See bootstrap, based on pseudo-sample \[ \lbrace(\boldsymbol{x}_{i_1},y_{i_1}), \cdots,(\boldsymbol{x}_{i_n},y_{i_n}) \rbrace \] where $(i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace$.

Linear Model and Bagging

Method 1

plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))

plot of chunk unnamed-chunk-262

Linear Model and Bagging

plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)

plot of chunk unnamed-chunk-263

Linear Model and Bagging

Method 2

pred_dist=predict(model)
epsilon  =residuals(model)
Y=matrix(NA,1000,length(x))
for(b in 1:1000){
  idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
  carsb=data.frame(speed=cars$speed,
                   dist=pred_dist+epsilon[idx])
  modelb <- lm(dist ~ speed, data=carsb)
  Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}

See bootstrap, based on pseudo-sample \[ \lbrace(\boldsymbol{x}_{1},\widehat{y}_{1}+\widehat{\varepsilon}_{i_1}), \cdots,(\boldsymbol{x}_{n},\widehat{y}_{n}+\widehat{\varepsilon}_{i_n}) \rbrace \] where $(i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace$.

Linear Model and Bagging

Method 2

plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))

plot of chunk unnamed-chunk-265

Linear Model and Bagging

plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)

plot of chunk unnamed-chunk-266

Errors

plot(cars)
abline(model,col="red")
segments(cars$speed,cars$dist,cars$speed,predict(model),col="blue")

plot of chunk unnamed-chunk-267

Least Squares?

cars2=cars; cars2[,2]=cars[,2]/10
plot(cars2,ylab="dist/10")
acp=princomp(cars2)
b=acp$loadings[2,1]/acp$loadings[1,1]
a=acp$center[2]-b*acp$center[1]
abline(a,b,col="red")

plot of chunk unnamed-chunk-268

Least Squares?

plot(cars2,ylab="dist/10",xlim=c(0,30),ylim=c(-1,12))
abline(a,b,col="red")
t <- acp$loadings[,1] %*% (t(cars2)-acp$center)
X1 <- acp$center[1] +t * acp$loadings[1,1]
X2 <- acp$center[2] +t * acp$loadings[2,1]
segments(cars2$speed,cars2$dist,X1,X2,col="blue")

plot of chunk unnamed-chunk-269

Least Absolute Deviation

  f <- function(a) sum(abs(cars$dist-(a[1]+a[2]*cars$speed))) 
  opt <- optim( c(0,0), f )$par
  plot(cars)
  abline(model, col='red', lty=2)
  abline(opt[1], opt[2],col="blue")

plot of chunk unnamed-chunk-270

Quantile Regression

library(quantreg,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
abline(model, col="blue")
abline(rq(dist ~ speed,data=cars, tau=.5),col="red",lty=2)
abline(rq(dist ~ speed,data=cars, tau=.9),col="purple",lty=2)

plot of chunk unnamed-chunk-271

Diagnostic and Errors

plot(model,which=1)

plot of chunk unnamed-chunk-272

Scatterplot $(\widehat{Y}_i,\widehat{\varepsilon}_i)$

Diagnostic and Errors

plot(model,which=2)

plot of chunk unnamed-chunk-273

Scatterplot $\displaystyle{\left(\widehat{\varepsilon}_{i:n},\Phi^{-1} \left(\frac{i}{n}\right)\right)}$

Diagnostic and Errors

plot(model,which=3)

plot of chunk unnamed-chunk-274

Scatterplot $(\widehat{Y}_i,\sqrt{\vert\widehat{\varepsilon}_i\vert})$

Diagnostic and Errors

plot(model,which=4)

plot of chunk unnamed-chunk-275

Cook distance, $\displaystyle{C_i=\frac{\widehat{\varepsilon}_i^2}{p\cdot\text{MSE}}\cdot\left(\frac{H_{i,i}}{(1-H_{i,i})^2}\right)}$ with $\boldsymbol{H}=\boldsymbol{X}(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}=[H_{i,i}]$

C=cooks.distance(model)

Diagnostic and Errors

$H_{i,i}$ are leverages, and define Studentized residuals as \[ \widehat{r}_i=\frac{\widehat{\varepsilon}_i}{ \widehat{\sigma} \sqrt{1-H_{i,i}}} \]

rstudent(model)

##           1           2           3           4           5           6 
##  0.26345000  0.81607841 -0.39781154  0.81035256  0.14070334 -0.51716052 
##           7           8           9          10          11          12 
## -0.24624632  0.27983408  0.81090388 -0.57004675  0.15209173 -1.03037790 
##          13          14          15          16          17          18 
## -0.62992492 -0.36670509 -0.10509307 -0.49251696  0.02981715  0.02981715 
##          19          20          21          22          23          24 
##  0.81716230 -0.75078438 -0.09592079  1.49972043  3.02282876 -1.42097720 
##          25          26          27          28          29          30 
## -1.01227617  0.82440767 -0.87411459 -0.34752195 -1.13903469 -0.60553485 
##          31          32          33          34          35          36 
##  0.04737114 -0.73422040  0.18222855  1.52145888  2.09848208 -1.40929208 
##          37          38          39          40          41          42 
## -0.73145948  0.71330941 -1.98238877 -0.86293622 -0.59637646 -0.33247538 
##          43          44          45          46          47          48 
##  0.19208548 -0.19393283 -1.27493857 -0.45557342  1.02773460  1.09701943 
##          49          50 
##  3.18499284  0.28774529

Diagnostic and Errors

plot(model,which=5)

plot of chunk unnamed-chunk-278

Scatterplot $(H_{i,i},\widehat{r}_i)$

Diagnostic and Errors

plot(model,which=6)

plot of chunk unnamed-chunk-279

Scatterplot $(H_{i,i},C_i)$

Diagnostic and Errors

hist(residuals(model),probability=TRUE, col="light green")
lines(density(residuals(model)),lwd=2,col="red")
boxplot(residuals(model),horizontal=TRUE,add=TRUE,at=.024, 
  pars=list(boxwex=.004),col=rgb(0,0,1,.25))

plot of chunk unnamed-chunk-280

Diagnostic and Errors

sigma=summary(model)$sigma
plot(ecdf(residuals(model)/sigma))
lines(seq(-3,3,by=.1),pnorm(seq(-3,3,by=.1)),lwd=2,col="red")

plot of chunk unnamed-chunk-281

Diagnostic and Errors

Kolmogorov-Smirnov $ d=\sup_{x\in\mathbb{R}}\lbrace \vert\Phi(x)-\widehat{F}_n(x)\vert \rbrace $

ks.test(residuals(model)/sigma,"pnorm")

## Warning in ks.test(residuals(model)/sigma, "pnorm"): ties should not be present
## for the Kolmogorov-Smirnov test

## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model)/sigma
## D = 0.13067, p-value = 0.3605
## alternative hypothesis: two-sided

Diagnostic and Errors

Anderson-Darling, Cramer-von Mises,

library(nortest,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ad.test(residuals(model))

## 
##  Anderson-Darling normality test
## 
## data:  residuals(model)
## A = 0.79406, p-value = 0.0369

cvm.test(residuals(model))

## 
##  Cramer-von Mises normality test
## 
## data:  residuals(model)
## W = 0.12573, p-value = 0.0483

Model Choice

AIC $AIC = 2k - 2\log(\mathcal{L}) = 2k + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]$

BIC $BIC = { k \log(n) -2 \log(\mathcal{L}) } = k \log(n) + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]$

AIC(model)

## [1] 419.1569

AIC(model,k=log(n))

## [1] 424.8929

Testing Linear Assumptions

library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_brk <- lm(dist ~ bs(speed,degree=1,knots=(c(4,15,25))), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))

## Warning in predict.lm(model_brk, newdata = data.frame(speed = x)): prediction
## from a rank-deficient fit may be misleading

see $b$-splines, \[ y_i=\beta_0+\beta_1 x_i + \beta_2 (x_i-15)_+ + \varepsilon_i \]

Testing Linear Assumptions

plot(cars)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-286

Testing Linear Assumptions

positive=function(x,s) ifelse(x>s,x-s,0)
model_brk <- lm(dist ~ speed + positive(speed,15), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-287

Testing Linear Assumptions

plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
abline(coefficients(model_brk)[1],coefficients(model_brk)[2],col="blue",lty=2)

plot of chunk unnamed-chunk-288

summary(model_brk)$coefficients

##                      Estimate Std. Error    t value    Pr(>|t|)
## (Intercept)         -7.651916 10.6253642 -0.7201557 0.474995540
## speed                3.018648  0.8626766  3.4991649 0.001032415
## positive(speed, 15)  1.756247  1.4551278  1.2069365 0.233496132

Testing Linear Assumptions

library(strucchange,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(Fstats(dist ~ speed,data=cars,from=7/50))

plot of chunk unnamed-chunk-289

see Chow Test

Testing Linear Assumptions

\[ W_t=\frac{1}{\widehat{\sigma}\sqrt{n}}\sum_{i=1}^{ \lfloor nt \rfloor} \widehat{\varepsilon}_i \]

cusum <- efp(dist ~ speed, type = "OLS-CUSUM",data=cars)
plot(cusum,ylim=c(-2,2))

plot of chunk unnamed-chunk-290

Testing Linear Assumptions

plot(cusum, alpha = 0.05, alt.boundary = TRUE,ylim=c(-2,2))

plot of chunk unnamed-chunk-291

see CUSUM test

Model with Categorical Covariates

model_cut=lm(dist~ cut(speed, breaks=c(0,10,15,20,25)),data=cars)
y=predict(model_cut, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")

plot of chunk unnamed-chunk-292

Model with Categorical Covariates

library(rpart,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tree=rpart(dist~speed,data=cars)
y=predict(tree,newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")

plot of chunk unnamed-chunk-293

Smoothing : Polynomial Regression

model_poly=lm(dist~ poly(speed, df=3),data=cars)
y=predict(model_poly, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-294

Smoothing : Local Regression

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
bw <- dpill(cars$speed,cars$dist) 
lines( locpoly(cars$speed,cars$dist,degree=0, bandwidth=bw), col='red' )
lines( locpoly(cars$speed,cars$dist,degree=1, bandwidth=bw), col='green' )
lines( locpoly(cars$speed,cars$dist,degree=2, bandwidth=bw), col='blue' )

plot of chunk unnamed-chunk-295

Smoothing : $k$-Nearest Neighboors

library(FNN,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
knn=knn.reg(train=cars$speed,y=cars$dist,k=5)
plot(cars)
lines(cars$speed,knn$pred,col="red")

plot of chunk unnamed-chunk-296

Smoothing : $b$ Splines

library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_bs <- lm(dist ~ bs(speed), data=cars)
y=predict(model_bs, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-297

Smoothing Linear Model

Tuning parameter selection, Silverman's Rule \[ b^\star=0.9 \cdot\frac{ \min\lbrace \sigma,F^{-1}(.75)-F^{-1}(.25)\rbrace}{1.34 \cdot n^{1/5}} \]

bw.nrd0(cars$speed)

## [1] 2.150016

See kernel regression

Smoothing Linear Model

Tuning parameter selection, Cross Validation \[ b^\star=\text{argmin}\left\lbrace \sum_{i=1}^n \left(y_i - \widehat{m}_{(i)}(\boldsymbol{x}_i)\right)^2\right\rbrace \]

bw.ucv(cars$speed)

## Warning in bw.ucv(cars$speed): minimum occurred at one end of the range

## [1] 2.748934

Smoothing Linear Model

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
Nadaraya_Watson =with(cars,ksmooth(speed, dist, "normal",bandwidth=2.75))
plot(cars)
abline(model,col="red",lty=2)
lines(Nadaraya_Watson,lwd=2,col="blue")

plot of chunk unnamed-chunk-300

Smoothing Linear Model

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_loess=loess(dist~ speed, data=cars,degree=1, family="gaussian")
y=predict(model_loess, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-301

Multiple Regression

Life Expectancy (1), Homicide Rate (2), Illiteracy Rate (3)

chicago=read.table("http://freakonometrics.free.fr/chicago.txt",header=TRUE,sep=";")
model_c = lm(Fire~X_2+X_3,data=chicago)
y=function(x2,x3) predict(model_c,newdata=data.frame(X_2=x2,X_3=x3))
VX2=seq(0,80,length=26); VX3=seq(5,25,length=26)
VY=outer(VX2,VX3,y)
persp(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy",zlab="Fire",theta=20)

plot of chunk unnamed-chunk-302

Multiple Regression

VX2=seq(0,80,length=251); VX3=seq(5,25,length=251)
VY=outer(VX2,VX3,y)
image(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy")
contour(VX2,VX3,VY,add=TRUE)

plot of chunk unnamed-chunk-303

Model selection

model_c = lm(Fire~.,data=chicago)
summary(model_c)$r.squared

## [1] 0.4416723

summary(model_c)$adj.r.squared

## [1] 0.4027192

Model selection

logLik(model_c)

## 'log Lik.' -152.7678 (df=5)

AIC(model_c, k=2)               # AIC

## [1] 315.5357

AIC(model_c, k=log(nrow(cars))) # BIC

## [1] 325.0958

Penalization

\[ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right] + \color{red}{\lambda} \sum_{j=1}^k \beta_j ^2\right\rbrace \] with an explicit solution $\widehat{\boldsymbol{\beta}}=(\boldsymbol{X}^{\text{T}}\boldsymbol{X}-\color{red}{\lambda} \mathbb{I})^{-1} \boldsymbol{X}^{\text{T}}\boldsymbol{Y}$.

library(MASS,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_ridge <- lm.ridge(Fire ~ ., data=chicago, lambda=1)

see more generally Tikhonov regularization

Optimal Penalization

mse <- NULL
n=100
v <- matrix(c(0,coefficients(model_c)[-1]), nr=n, nc=4, byrow=TRUE)
kl <- c(1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 
        .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.2, 1.4, 1.6, 1.8, 2)
for (k in kl) {
  r <- matrix(NA, nr=n, nc=4)
  for (i in 1:n) {
    boot_c <- chicago[sample(1:nrow(chicago),nrow(chicago),replace=TRUE),]
    r[i,2:4] <- model_ridge <- lm.ridge(Fire ~ ., data=boot_c, lambda=k)$coef
    r[i,1] <- mean(boot_c[,"Fire"])
  }
  mse <- append(mse, apply( (r-v)^2, 2, mean )[2])
}

Optimal Penalization

plot( mse ~ kl, type='l' )

plot of chunk unnamed-chunk-308

Variable Selection

step(lm(Fire ~ .,data=chicago))

## Start:  AIC=180.16
## Fire ~ X_1 + X_2 + X_3
## 
##        Df Sum of Sq    RSS    AIC
## - X_1   1      0.60 1832.4 178.17
## <none>              1831.8 180.16
## - X_2   1    561.94 2393.7 190.73
## - X_3   1    702.09 2533.8 193.41
## 
## Step:  AIC=178.17
## Fire ~ X_2 + X_3
## 
##        Df Sum of Sq    RSS    AIC
## <none>              1832.4 178.17
## - X_2   1    620.98 2453.3 189.89
## - X_3   1   1003.70 2836.1 196.70

## 
## Call:
## lm(formula = Fire ~ X_2 + X_3, data = chicago)
## 
## Coefficients:
## (Intercept)          X_2          X_3  
##     21.4965       0.2213      -1.5248

Variable Selection

LASSO (least absolute shrinkage and selection operator) \[ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right] + \color{blue}{\lambda} \sum_{j=1}^k \vert\beta_j\vert\right\rbrace \]

library(glmnet,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fit = glmnet(x = as.matrix(chicago[,2:4]), y = chicago[,1], family = "gaussian", alpha = 1)

plot(fit, xvar="lambda", label = TRUE )

plot of chunk unnamed-chunk-311

Variable Selection

step(model)

## Start:  AIC=275.26
## dist ~ speed
## 
##         Df Sum of Sq   RSS    AIC
## <none>               11354 275.26
## - speed  1     21186 32539 325.91

## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Coefficients:
## (Intercept)        speed  
##     -17.579        3.932

Dimension Reduction

model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:3],data=chicago)
predict(model_c)[1:5]

##         1         2         3         4         5 
##  9.975499 16.985083 14.244580 13.411960 18.337225

predict(model_acp)[1:5]

##         1         2         3         4         5 
##  9.975499 16.985083 14.244580 13.411960 18.337225

Dimension Reduction

model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:2],data=chicago)
predict(model_c)[1:5]

##         1         2         3         4         5 
##  9.975499 16.985083 14.244580 13.411960 18.337225

predict(model_acp)[1:5]

##        1        2        3        4        5 
## 10.01032 17.02103 14.29953 13.43876 18.39402

Logistic Regression

\[ \mathbb{E}[Y\vert \boldsymbol{X}=\boldsymbol{x}]= p(\boldsymbol{x})=\frac{\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}{1+\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}=H(\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}) \]

logistic regression

Logistic Regression

myocarde <-read.table("http://freakonometrics.free.fr/myocarde.csv", head=TRUE,sep=";")
model_bernoulli = glm(PRONO~., data=myocarde, family=binomial)
coefficients(summary(model_bernoulli))

##                  Estimate   Std. Error     z value  Pr(>|z|)
## (Intercept) -10.187641685 11.895227082 -0.85644785 0.3917501
## FRCAR         0.138178119  0.114112163  1.21089738 0.2259347
## INCAR        -5.862429035  6.748785486 -0.86866430 0.3850308
## INSYS         0.717084018  0.561444684  1.27721223 0.2015273
## PRDIA        -0.073668171  0.291636009 -0.25260314 0.8005749
## PAPUL         0.016756506  0.341942251  0.04900391 0.9609162
## PVENT        -0.106776012  0.110550096 -0.96586087 0.3341138
## REPUL        -0.003154187  0.004890917 -0.64490712 0.5189874

Logistic Regression

X=cbind(1,as.matrix(myocarde[,1:7]))
Y=myocarde$PRONO=="Survival"
beta=as.matrix(lm(Y~0+X)$coefficients,ncol=1)
for(s in 1:9){
   pi=exp(X%*%beta[,s])/(1+exp(X%*%beta[,s]))
   gradient=t(X)%*%(Y-pi)
   omega=matrix(0,nrow(X),nrow(X));diag(omega)=(pi*(1-pi))
   Hessian=-t(X)%*%omega%*%X
   beta=cbind(beta,beta[,s]-solve(Hessian)%*%gradient)}

Logistic Regression

 beta[,1]

##      X XFRCAR XINCAR XINSYS XPRDIA XPAPUL XPVENT XREPUL 
##      0      0      0      0      0      0      0      0

 beta[,9]

##             X        XFRCAR        XINCAR        XINSYS        XPRDIA 
## -9.202735e+00  4.772571e-14 -5.755396e-13  4.540812e-14  4.996004e-16 
##        XPAPUL        XPVENT        XREPUL 
##  2.664535e-15  1.387779e-15 -2.081668e-17

# -solve(Hessian)
sqrt(-diag(solve(Hessian)))

##                     FRCAR        INCAR        INSYS        PRDIA        PAPUL 
## 180.18368217   1.87748780  93.03024470   6.52098379   6.34628288   5.47188241 
##        PVENT        REPUL 
##   3.05900360   0.05472085

Logistic Regression

GLM <- glm(PRONO ~ PVENT + REPUL, data = myocarde, family = binomial)
pred_GLM = function(p,r){
 return(predict(GLM, newdata =
 data.frame(PVENT=p,REPUL=r), type="response"))}
vp=seq(min(myocarde$PVENT),max(myocarde$PVENT),length=251)
vr=seq(min(myocarde$REPUL),max(myocarde$REPUL),length=251)
vd=outer(vp,vr,pred_GLM)
library(RColorBrewer,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
CL=colorRampPalette(brewer.pal(name="RdYlBu", 11))(100)

Logistic Regression

image(vp,vr,vd,col=CL,xlab="PVENT",ylab="REPUL")

plot of chunk unnamed-chunk-320

Logistic Regression

probabilities <- predict(GLM, myocarde, type="response")
predictions <- levels(myocarde$PRONO)[(probabilities>.5)+1] 
table(predictions, myocarde$PRONO)

##            
## predictions DECES SURVIE
##      DECES     24      4
##      SURVIE     5     38

Residuals and GLM

Pearson's residuals \[ \widehat{\varepsilon}_i=\frac{y_i - \widehat{\pi}_i}{ \sqrt{\widehat{\pi}_i\cdot[1-\widehat{\pi}_i]}} \] Deviance residuals \[ \widehat{\varepsilon}_i=\pm \left( y_i \log[\widehat{\pi}_i]+ (1-y_i) \log[1-\widehat{\pi}_i]\right) \]

E1=residuals(model_bernoulli,type="pearson")
E2=residuals(model_bernoulli,type="deviance")

Residuals and GLM

plot(1:nrow(myocarde),E1)

plot of chunk unnamed-chunk-323

Multinomial Regression

Binomial case, $Y\in\lbrace 0,1\rbrace$,

$\mathbb{P}[Y=1\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]$

$\mathbb{P}[Y=0\vert \boldsymbol{X}=\boldsymbol{x}]\propto1$

Multinomial case, $Y\in\lbrace A,B,C\rbrace$,

$\mathbb{P}[Y=\color{red}{A}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{red}{\boldsymbol{\beta}_A}]$

$\mathbb{P}[Y=\color{blue}{B}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{blue}{\boldsymbol{\beta}_B}]$

$\mathbb{P}[Y=\color{green}{C}\vert \boldsymbol{X}=\boldsymbol{x}]\propto1$

Multinomial Regression

library(VGAM,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_multi = vglm(PRONO~., data=myocarde, family="multinomial")
model_multi

## 
## Call:
## vglm(formula = PRONO ~ ., family = "multinomial", data = myocarde)
## 
## 
## Coefficients:
##  (Intercept)        FRCAR        INCAR        INSYS        PRDIA        PAPUL 
## 10.187641072 -0.138178115  5.862428927 -0.717083993  0.073668174 -0.016756516 
##        PVENT        REPUL 
##  0.106776011  0.003154187 
## 
## Degrees of Freedom: 71 Total; 63 Residual
## Residual deviance: 41.04314 
## Log-likelihood: -20.52157 
## 
## This is a multinomial logit model with 2 levels

Generalized Linear Models

Poisson and Gamma distributions in the exponential family

base=data.frame(x=c(1,2,3,4,5),y=c(1,2,4,2,6))
regNId <- glm(y~x,family=gaussian(link="identity"),data=base)
regNlog <- glm(y~x,family=gaussian(link="log"),data=base)
regPId <- glm(y~x,family=poisson(link="identity"),data=base)
regPlog <- glm(y~x,family=poisson(link="log"),data=base)
regGId <- glm(y~x,family=Gamma(link="identity"),data=base)
regGlog <- glm(y~x,family=Gamma(link="log"),data=base)

Generalized Linear Models

visuel=function(regression,titre=""){
 plot(base$x,base$y,pch=19,cex=1.5,main=titre,xlab="",ylab="")
 abs <- seq(0,7,by=.1)
 yp <- predict(regression,newdata=data.frame(x=abs),se.fit = TRUE, type="response")
 polygon(c(abs,rev(abs)),c(yp$fit+2*yp$se.fit,rev(yp$fit-2*yp$se.fit)), col="light green",border=NA)
 points(base$x,base$y,pch=19,cex=1.5)
 lines(abs,yp$fit,lwd=2,col="red")
 lines(abs,yp$fit+2*yp$se.fit,lty=2)
 lines(abs,yp$fit-2*yp$se.fit,lty=2)}

Generalized Linear Models

visuel(regNId,"Gaussienne, lien identité")

plot of chunk unnamed-chunk-327

Generalized Linear Models

visuel(regPId,"Poisson, lien identité")

plot of chunk unnamed-chunk-328

Generalized Linear Models

visuel(regGId,"Gamma, lien identité")

plot of chunk unnamed-chunk-329

Generalized Linear Models

visuel(regNlog,"Gaussienne, lien logarithmique")

plot of chunk unnamed-chunk-330

Generalized Linear Models

visuel(regPlog,"Poisson, lien logarithme")

plot of chunk unnamed-chunk-331

Generalized Linear Models

visuel(regGlog,"Gamma, lien logarithme")

plot of chunk unnamed-chunk-332