Data Science for Actuaries

Arthur Charpentier, @freakonometrics
2018

R & Packages

getOption("defaultPackages")
## [1] "datasets"  "utils"     "grDevices" "graphics"  "stats"     "methods"
(.packages(all.available=TRUE))
##   [1] "abind"                 "acepack"              
##   [3] "ada"                   "ade4"                 
##   [5] "adegenet"              "adegraphics"          
##   [7] "adephylo"              "ADGofTest"            
##   [9] "AER"                   "alabama"              
##  [11] "amap"                  "animation"            
##  [13] "ape"                   "aplpack"              
##  [15] "arm"                   "arules"               
##  [17] "arulesViz"             "ash"                  
##  [19] "assertthat"            "backports"            
##  [21] "base64enc"             "BB"                   
##  [23] "bcp"                   "bdsmatrix"            
##  [25] "BH"                    "biclust"              
##  [27] "biglm"                 "bindr"                
##  [29] "bindrcpp"              "BiocInstaller"        
##  [31] "bipartite"             "bit"                  
##  [33] "bit64"                 "bitops"               
##  [35] "BLCOP"                 "blob"                 
##  [37] "bold"                  "brew"                 
##  [39] "brglm"                 "broom"                
##  [41] "ca"                    "callr"                
##  [43] "car"                   "carData"              
##  [45] "caret"                 "cartography"          
##  [47] "catdata"               "caTools"              
##  [49] "cba"                   "cccp"                 
##  [51] "cellranger"            "checkmate"            
##  [53] "chron"                 "CircStats"            
##  [55] "classInt"              "cli"                  
##  [57] "clipr"                 "clusterGeneration"    
##  [59] "clv"                   "coda"                 
##  [61] "coin"                  "colorspace"           
##  [63] "combinat"              "commonmark"           
##  [65] "Compositional"         "config"               
##  [67] "contfrac"              "copula"               
##  [69] "corrgram"              "corrplot"             
##  [71] "covr"                  "covRobust"            
##  [73] "crayon"                "crosstalk"            
##  [75] "crul"                  "cubature"             
##  [77] "curl"                  "CVST"                 
##  [79] "cvTools"               "data.table"           
##  [81] "DBI"                   "dbplyr"               
##  [83] "ddalpha"               "debugme"              
##  [85] "deldir"                "dendextend"           
##  [87] "dendroextras"          "DEoptim"              
##  [89] "DEoptimR"              "desc"                 
##  [91] "descr"                 "deSolve"              
##  [93] "devtools"              "dfoptim"              
##  [95] "dichromat"             "digest"               
##  [97] "dimRed"                "diptest"              
##  [99] "distr"                 "distrEx"              
## [101] "DistributionUtils"     "doBy"                 
## [103] "doMC"                  "doParallel"           
## [105] "doRNG"                 "dotCall64"            
## [107] "downloader"            "dplR"                 
## [109] "dplyr"                 "DRR"                  
## [111] "DT"                    "dtplyr"               
## [113] "dtw"                   "dynlm"                
## [115] "e1071"                 "ecodist"              
## [117] "ECOSolveR"             "effects"              
## [119] "ellipse"               "elliptic"             
## [121] "emplik"                "enaR"                 
## [123] "energy"                "EnvStats"             
## [125] "eva"                   "evaluate"             
## [127] "evir"                  "expm"                 
## [129] "FactoMineR"            "fAssets"              
## [131] "fastmatch"             "fBasics"              
## [133] "fCopulae"              "fGarch"               
## [135] "fields"                "fit.models"           
## [137] "flashClust"            "flexclust"            
## [139] "flexmix"               "flows"                
## [141] "fMultivar"             "FNN"                  
## [143] "forcats"               "foreach"              
## [145] "forecast"              "formatR"              
## [147] "Formula"               "fpc"                  
## [149] "fPortfolio"            "fracdiff"             
## [151] "FRAPO"                 "fts"                  
## [153] "gam"                   "gamair"               
## [155] "gamlss"                "gamlss.data"          
## [157] "gamlss.dist"           "GB2"                  
## [159] "gbm"                   "gclus"                
## [161] "gdata"                 "GeneralizedHyperbolic"
## [163] "geometry"              "geoR"                 
## [165] "geosphere"             "GGally"               
## [167] "ggcorrplot"            "ggdendro"             
## [169] "ggmap"                 "ggnet"                
## [171] "ggplot2"               "ggplot2movies"        
## [173] "ggthemes"              "ggvis"                
## [175] "git2r"                 "glmnet"               
## [177] "glue"                  "gmodels"              
## [179] "gnm"                   "goftest"              
## [181] "googleVis"             "gower"                
## [183] "gplots"                "gridBase"             
## [185] "gridExtra"             "gsl"                  
## [187] "gss"                   "gstat"                
## [189] "gsubfn"                "gtable"               
## [191] "gtools"                "gurobi"               
## [193] "gWidgets"              "h2o"                  
## [195] "haven"                 "hdrcde"               
## [197] "hexbin"                "highr"                
## [199] "hmeasure"              "Hmisc"                
## [201] "hms"                   "htmlTable"            
## [203] "htmltools"             "htmlwidgets"          
## [205] "httpcode"              "httpuv"               
## [207] "httr"                  "hypergeo"             
## [209] "Icens"                 "igraph"               
## [211] "igraphdata"            "Imap"                 
## [213] "ineq"                  "inline"               
## [215] "interval"              "intervals"            
## [217] "IntroCompFinR"         "ipred"                
## [219] "irlba"                 "ISLR"                 
## [221] "iterators"             "jpeg"                 
## [223] "jsonlite"              "kernlab"              
## [225] "knitr"                 "ks"                   
## [227] "labeling"              "laeken"               
## [229] "LaF"                   "Lahman"               
## [231] "lasso2"                "latticeExtra"         
## [233] "lava"                  "lazyeval"             
## [235] "leaflet"               "leaflet.extras"       
## [237] "leaps"                 "LearnBayes"           
## [239] "limSolve"              "linprog"              
## [241] "lme4"                  "lmtest"               
## [243] "locfit"                "logcondens"           
## [245] "longmemo"              "lpSolve"              
## [247] "lpSolveAPI"            "lubridate"            
## [249] "magic"                 "mapproj"              
## [251] "maps"                  "maptools"             
## [253] "markdown"              "markovchain"          
## [255] "MASS"                  "matchingR"            
## [257] "matlab"                "Matrix"               
## [259] "MatrixModels"          "matrixStats"          
## [261] "maxLik"                "maxmatching"          
## [263] "mclust"                "memoise"              
## [265] "mice"                  "microbenchmark"       
## [267] "mime"                  "minqa"                
## [269] "misc3d"                "miscTools"            
## [271] "missMDA"               "mix"                  
## [273] "mixture"               "MLEcens"              
## [275] "mlogit"                "mnormt"               
## [277] "ModelMetrics"          "modelr"               
## [279] "modeltools"            "MPV"                  
## [281] "MTS"                   "multcomp"             
## [283] "multicool"             "munsell"              
## [285] "mvnormtest"            "mvoutlier"            
## [287] "mvtnorm"               "nanotime"             
## [289] "natserv"               "nbpMatching"          
## [291] "network"               "networkD3"            
## [293] "nloptr"                "NLP"                  
## [295] "NMF"                   "nortest"              
## [297] "np"                    "numDeriv"             
## [299] "nycflights13"          "odfWeave"             
## [301] "openssl"               "OpenStreetMap"        
## [303] "opera"                 "optextras"            
## [305] "optimx"                "optmatch"             
## [307] "optrees"               "osmar"                
## [309] "packrat"               "pander"               
## [311] "party"                 "pbkrtest"             
## [313] "PBSmapping"            "pcaPP"                
## [315] "PerformanceAnalytics"  "perm"                 
## [317] "permute"               "pglm"                 
## [319] "phylobase"             "pixmap"               
## [321] "pkgconfig"             "pkgmaker"             
## [323] "PKI"                   "plm"                  
## [325] "plogr"                 "plotly"               
## [327] "plotrix"               "pls"                  
## [329] "plyr"                  "pmml"                 
## [331] "png"                   "polyclip"             
## [333] "PortfolioAnalytics"    "prabclus"             
## [335] "praise"                "prettymapr"           
## [337] "prettyunits"           "pROC"                 
## [339] "processx"              "prodlim"              
## [341] "profileModel"          "progress"             
## [343] "proto"                 "proxy"                
## [345] "pscl"                  "pspline"              
## [347] "psych"                 "purrr"                
## [349] "qap"                   "qcc"                  
## [351] "quadprog"              "quantmod"             
## [353] "quantreg"              "qvcalc"               
## [355] "R.methodsS3"           "R.oo"                 
## [357] "R.utils"               "R6"                   
## [359] "rainbow"               "RandomFields"         
## [361] "RandomFieldsUtils"     "randomForest"         
## [363] "randomNames"           "raster"               
## [365] "Rcgmin"                "RColorBrewer"         
## [367] "Rcpp"                  "RcppArmadillo"        
## [369] "RcppCCTZ"              "RcppDE"               
## [371] "RcppEigen"             "RcppParallel"         
## [373] "RcppRoll"              "RcppZiggurat"         
## [375] "RCurl"                 "Rdonlp2"              
## [377] "readr"                 "readxl"               
## [379] "recipes"               "registry"             
## [381] "ReIns"                 "relimp"               
## [383] "rematch"               "rentrez"              
## [385] "reprex"                "reshape2"             
## [387] "reticulate"            "rex"                  
## [389] "Rfast"                 "rgdal"                
## [391] "rgeos"                 "rgl"                  
## [393] "RgoogleMaps"           "ritis"                
## [395] "RItools"               "rjson"                
## [397] "RJSONIO"               "rlang"                
## [399] "rmarkdown"             "Rmdanimation"         
## [401] "Rmixmod"               "RMongo"               
## [403] "RMySQL"                "rncl"                 
## [405] "rneos"                 "RNeXML"               
## [407] "rngtools"              "robCompositions"      
## [409] "robust"                "robustbase"           
## [411] "ROCR"                  "RODBC"                
## [413] "ROI"                   "ROI.plugin.alabama"   
## [415] "ROI.plugin.ecos"       "ROI.plugin.glpk"      
## [417] "ROI.plugin.ipop"       "ROI.plugin.lpsolve"   
## [419] "ROI.plugin.nloptr"     "ROI.plugin.optimx"    
## [421] "ROI.plugin.quadprog"   "ROI.plugin.scs"       
## [423] "ROI.plugin.symphony"   "rosm"                 
## [425] "rotl"                  "roxygen2"             
## [427] "rpart"                 "rpart.plot"           
## [429] "RPostgreSQL"           "rprojroot"            
## [431] "rrcov"                 "rredlist"             
## [433] "rsconnect"             "Rsolnp"               
## [435] "RSQLite"               "rstudioapi"           
## [437] "rugarch"               "RUnit"                
## [439] "rvest"                 "Rvmmin"               
## [441] "rworldmap"             "sampleSelection"      
## [443] "sand"                  "sandwich"             
## [445] "sas7bdat"              "scales"               
## [447] "scatterplot3d"         "scrapeR"              
## [449] "scs"                   "segmented"            
## [451] "selectr"               "seqinr"               
## [453] "seriation"             "setRNG"               
## [455] "sf"                    "sfsmisc"              
## [457] "sgeostat"              "shapefiles"           
## [459] "shiny"                 "SkewHyperbolic"       
## [461] "slam"                  "slidify"              
## [463] "slidifyLibraries"      "sn"                   
## [465] "sna"                   "SnowballC"            
## [467] "SOAR"                  "solrium"              
## [469] "sourcetools"           "sp"                   
## [471] "spacetime"             "spam"                 
## [473] "SparseM"               "spatstat"             
## [475] "spatstat.data"         "spatstat.utils"       
## [477] "spd"                   "spdep"                
## [479] "splancs"               "sqldf"                
## [481] "sROC"                  "stabledist"           
## [483] "startupmsg"            "statmod"              
## [485] "statnet.common"        "stringr"              
## [487] "strucchange"           "survey"               
## [489] "survival"              "svd"                  
## [491] "svUnit"                "SweaveListingUtils"   
## [493] "systemfit"             "tau"                  
## [495] "taxize"                "tensor"               
## [497] "tensorflow"            "testthat"             
## [499] "textcat"               "tfruns"               
## [501] "TH.data"               "tibble"               
## [503] "tidyr"                 "tidyselect"           
## [505] "tidyverse"             "timeDate"             
## [507] "timeSeries"            "tis"                  
## [509] "tm"                    "transport"            
## [511] "triebeard"             "trimcluster"          
## [513] "tripack"               "truncnorm"            
## [515] "truncreg"              "tseries"              
## [517] "TSP"                   "TTR"                  
## [519] "tweedie"               "ucminf"               
## [521] "udunits2"              "units"                
## [523] "urca"                  "urltools"             
## [525] "uuid"                  "V8"                   
## [527] "vars"                  "vcd"                  
## [529] "vcdExtra"              "vegan"                
## [531] "verification"          "VGAM"                 
## [533] "VIM"                   "viridis"              
## [535] "viridisLite"           "visNetwork"           
## [537] "waveslim"              "whisker"              
## [539] "WikidataR"             "WikipediR"            
## [541] "wikitaxa"              "winference"           
## [543] "withr"                 "wordcloud"            
## [545] "worrms"                "wskm"                 
## [547] "xlsx"                  "xlsxjars"             
## [549] "XML"                   "xml2"                 
## [551] "xtable"                "xts"                  
## [553] "yaml"                  "YieldCurve"           
## [555] "zipcode"               "zoo"                  
## [557] "littler"               "magrittr"             
## [559] "pkgKitten"             "RcppGSL"              
## [561] "reshape"               "Rglpk"                
## [563] "rJava"                 "Rsymphony"            
## [565] "stringi"               "base"                 
## [567] "boot"                  "class"                
## [569] "cluster"               "codetools"            
## [571] "compiler"              "datasets"             
## [573] "foreign"               "graphics"             
## [575] "grDevices"             "grid"                 
## [577] "KernSmooth"            "lattice"              
## [579] "methods"               "mgcv"                 
## [581] "nlme"                  "nnet"                 
## [583] "parallel"              "spatial"              
## [585] "splines"               "stats"                
## [587] "stats4"                "tcltk"                
## [589] "tools"                 "utils"
rm(list=ls())
options(width = 80)
library(knitr); library(htmlwidgets); library(dplyr); library(leaflet)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Utilities for Start-Up Messages (version 0.9.4)
## For more information see ?"startupmsg", NEWS("startupmsg")
## 
## Attaching package: 'sfsmisc'
## The following object is masked from 'package:dplyr':
## 
##     last
## Utilities for Sweave Together with TeX 'listings' Package (version 0.7.7)
## NOTE: Support for this package will stop soon.
## Package 'knitr' is providing the same functionality in a better way.
## Some functions from package 'base' are intentionally masked ---see SweaveListingMASK().
## Note that global options are controlled by SweaveListingoptions() ---c.f. ?"SweaveListingoptions".
## For more information see ?"SweaveListingUtils", NEWS("SweaveListingUtils")
## There is a vignette to this package; try vignette("ExampleSweaveListingUtils").
## 
## Attaching package: 'SweaveListingUtils'
## The following objects are masked from 'package:base':
## 
##     library, require
## Object Oriented Implementation of Distributions (version 2.6.2)
## Attention: Arithmetics on distribution objects are understood as operations on corresponding random variables (r.v.s); see distrARITH().
## Some functions from package 'stats' are intentionally masked ---see distrMASK().
## Note that global options are controlled by distroptions() ---c.f. ?"distroptions".
## For more information see ?"distr", NEWS("distr"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several extension packages; try vignette("distr").
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Extensions of Package 'distr' (version 2.6.1)
## Note: Packages "e1071", "moments", "fBasics" should be attached /before/ package "distrEx". See distrExMASK().Note: Extreme value distribution functionality has been moved to
##       package "RobExtremes". See distrExMOVED().
## For more information see ?"distrEx", NEWS("distrEx"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several related packages; try vignette("distr").
library(pryr,verbose=FALSE,quietly=TRUE)
## Error in library(pryr, verbose = FALSE, quietly = TRUE): there is no package called 'pryr'
library(wordcloud,verbose=FALSE,quietly=TRUE)
library(tm,verbose=FALSE,quietly=TRUE)

Using with RStudio

?rq
## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'
library(quantreg,verbose=FALSE,quietly=TRUE)
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
require(quantreg)
?rq
rq
## function (formula, tau = 0.5, data, subset, weights, na.action, 
##     method = "br", model = TRUE, contrasts = NULL, ...) 
## {
##     call <- match.call()
##     mf <- match.call(expand.dots = FALSE)
##     m <- match(c("formula", "data", "subset", "weights", "na.action"), 
##         names(mf), 0)
##     mf <- mf[c(1, m)]
##     mf$drop.unused.levels <- TRUE
##     mf[[1]] <- as.name("model.frame")
##     mf <- eval.parent(mf)
##     if (method == "model.frame") 
##         return(mf)
##     mt <- attr(mf, "terms")
##     weights <- as.vector(model.weights(mf))
##     Y <- model.response(mf)
##     if (method == "sfn") {
##         if (requireNamespace("MatrixModels") && requireNamespace("Matrix")) {
##             X <- MatrixModels::model.Matrix(mt, data, sparse = TRUE)
##             vnames <- dimnames(X)[[2]]
##             X <- as(X, "matrix.csr")
##             mf$x <- X
##         }
##     }
##     else X <- model.matrix(mt, mf, contrasts)
##     eps <- .Machine$double.eps^(2/3)
##     Rho <- function(u, tau) u * (tau - (u < 0))
##     if (length(tau) > 1) {
##         if (any(tau < 0) || any(tau > 1)) 
##             stop("invalid tau:  taus should be >= 0 and <= 1")
##         if (any(tau == 0)) 
##             tau[tau == 0] <- eps
##         if (any(tau == 1)) 
##             tau[tau == 1] <- 1 - eps
##         coef <- matrix(0, ncol(X), length(tau))
##         rho <- rep(0, length(tau))
##         fitted <- resid <- matrix(0, nrow(X), length(tau))
##         for (i in 1:length(tau)) {
##             z <- {
##                 if (length(weights)) 
##                   rq.wfit(X, Y, tau = tau[i], weights, method, 
##                     ...)
##                 else rq.fit(X, Y, tau = tau[i], method, ...)
##             }
##             coef[, i] <- z$coefficients
##             resid[, i] <- z$residuals
##             rho[i] <- sum(Rho(z$residuals, tau[i]))
##             fitted[, i] <- Y - z$residuals
##         }
##         taulabs <- paste("tau=", format(round(tau, 3)))
##         dimnames(coef) <- list(dimnames(X)[[2]], taulabs)
##         dimnames(resid) <- list(dimnames(X)[[1]], taulabs)
##         fit <- z
##         fit$coefficients <- coef
##         fit$residuals <- resid
##         fit$fitted.values <- fitted
##         if (method == "lasso") 
##             class(fit) <- c("lassorqs", "rqs")
##         else if (method == "scad") 
##             class(fit) <- c("scadrqs", "rqs")
##         else class(fit) <- "rqs"
##     }
##     else {
##         process <- (tau < 0 || tau > 1)
##         if (tau == 0) 
##             tau <- eps
##         if (tau == 1) 
##             tau <- 1 - eps
##         fit <- {
##             if (length(weights)) 
##                 rq.wfit(X, Y, tau = tau, weights, method, ...)
##             else rq.fit(X, Y, tau = tau, method, ...)
##         }
##         if (process) 
##             rho <- list(x = fit$sol[1, ], y = fit$sol[3, ])
##         else {
##             if (length(dim(fit$residuals))) 
##                 dimnames(fit$residuals) <- list(dimnames(X)[[1]], 
##                   NULL)
##             rho <- sum(Rho(fit$residuals, tau))
##         }
##         if (method == "lasso") 
##             class(fit) <- c("lassorq", "rq")
##         else if (method == "scad") 
##             class(fit) <- c("scadrq", "rq")
##         else class(fit) <- ifelse(process, "rq.process", "rq")
##     }
##     fit$na.action <- attr(mf, "na.action")
##     fit$formula <- formula
##     fit$terms <- mt
##     fit$xlevels <- .getXlevels(mt, mf)
##     fit$call <- call
##     fit$tau <- tau
##     fit$weights <- weights
##     fit$residuals <- drop(fit$residuals)
##     fit$rho <- rho
##     fit$method <- method
##     fit$fitted.values <- drop(fit$fitted.values)
##     attr(fit, "na.message") <- attr(m, "na.message")
##     if (model) 
##         fit$model <- mf
##     fit
## }
## <environment: namespace:quantreg>
detach(package:quantreg, unload=TRUE)
?rq
## No documentation for 'rq' in specified packages and libraries:
## you could try '??rq'
apropos("norm")
##  [1] ".__T__norm:base" "dlnorm"          "dnorm"           "norm"           
##  [5] "norm"            "normal_print"    "normalizePath"   "plnorm"         
##  [9] "pnorm"           "qlnorm"          "qnorm"           "qqnorm"         
## [13] "rlnorm"          "rnorm"
?pnorm

S3 vs S4

person3 <- function(name,age,weight,height){
  characteristics<-list(name=name,age=age,weight=weight,height=height)
  class(characteristics)<-"person3"
  return(characteristics)}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)
JohnDoe3$age
## [1] 28
BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)}
BMI3(JohnDoe3)
## [1] 22.94409
JohnDoe3
## $name
## [1] "John"
## 
## $age
## [1] 28
## 
## $weight
## [1] 76
## 
## $height
## [1] 182
## 
## attr(,"class")
## [1] "person3"
reg3 <- lm(dist~speed,data=cars)
reg3$coefficients
## (Intercept)       speed 
##  -17.579095    3.932409
coef(reg3)
## (Intercept)       speed 
##  -17.579095    3.932409
summary(reg3)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
setClass("person4", representation(name="character",
age="numeric", weight="numeric", height="numeric"),where=.GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)
JohnDoe4
## An object of class "person4"
## Slot "name":
## [1] "John"
## 
## Slot "age":
## [1] 28
## 
## Slot "weight":
## [1] 76
## 
## Slot "height":
## [1] 182
JohnDoe4@age
## [1] 28
setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")),
           where=.GlobalEnv)
## [1] "BMI4"
setMethod("BMI4", "person4",where=.GlobalEnv,
          function(object){return(object@weight*1e4/object@height^2)})
## [1] "BMI4"
BMI4(JohnDoe4)
## [1] 22.94409
library(VGAM,verbose=FALSE,quietly=TRUE)
reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients
## (Intercept)       speed 
##  -17.579095    3.932409
coef(reg4)
## (Intercept)       speed 
##  -17.579095    3.932409
pnorm(3.65102,mean=5,sd=2)
## [1] 0.2499999
library(distr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Error in library(distr, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'distr'
X <- Norm(mean=5,sd=2)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "Norm"
mean(X)
## Error in mean(X): objet 'X' introuvable
p(X)(3.65102)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "p"
qnorm(0.25,mean=5,sd=2)
## [1] 3.65102
library(distrEx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Error in library(distrEx, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'distrEx'
distr::q(X)
## Error in loadNamespace(name): there is no package called 'distr'
distr::q(X)(0.25)
## Error in loadNamespace(name): there is no package called 'distr'
X1 <- Norm(mean=5,sd=2)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "Norm"
X2 <- Norm(mean=2,sd=1)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "Norm"
S <- X1+X2
## Error in eval(expr, envir, enclos): objet 'X1' introuvable
distr::q(S)(.5)
## Error in loadNamespace(name): there is no package called 'distr'
plot(S)
## Error in plot(S): objet 'S' introuvable

R objects : Numbers

set.seed(1)
U <- runif(20)
U[1:4]
## [1] 0.2655087 0.3721239 0.5728534 0.9082078
U[1] == 0.2655087 
## [1] FALSE
options(digits = 3)
U[1:4]
## [1] 0.266 0.372 0.573 0.908
options(digits = 22)
U[1:4]
## [1] 0.2655086631420999765396 0.3721238996367901563644 0.5728533633518964052200
## [4] 0.9082077899947762489319
options(digits = 3)
x <- exp(1)
y <- x
x <- 2
y
## [1] 2.72
class(x)
## [1] "numeric"
0/0
## [1] NaN
1/0
## [1] Inf
.Machine$double.xmax
## [1] 1.8e+308
2e+307<Inf
## [1] TRUE
2e+308<Inf
## [1] FALSE
(3/10-1/10)
## [1] 0.2
(3/10-1/10)==(7/10-5/10)
## [1] FALSE
(3/10-1/10)-(7/10-5/10)
## [1] 2.78e-17
all.equal((3/10-1/10),(7/10-5/10))
## [1] TRUE
sqrt(2)^2 == 2
## [1] FALSE
.Machine$double.eps
## [1] 2.22e-16
.Machine$double.xmin
## [1] 2.23e-308
x <- rnorm(8)
names(x) <- letters[1:8]
x
##       a       b       c       d       e       f       g       h 
##  1.5118  0.3898 -0.6212 -2.2147  1.1249 -0.0449 -0.0162  0.9438
x[2:4]
##      b      c      d 
##  0.390 -0.621 -2.215
x[c("b","c","d")]
##      b      c      d 
##  0.390 -0.621 -2.215
min(x[x>0])
## [1] 0.39
x[-(3:8)]
##    a    b 
## 1.51 0.39
# x[-3:8]
x[c(TRUE,FALSE)]
##       a       c       e       g 
##  1.5118 -0.6212  1.1249 -0.0162
x[c(TRUE,NA,FALSE,TRUE)]
##      a   <NA>      d      e   <NA>      h 
##  1.512     NA -2.215  1.125     NA  0.944
x[2]
##    b 
## 0.39
x[[2]]
## [1] 0.39
for(i in 1:2){
  nom_var <- paste0("x", i)
  assign(nom_var, rpois(5,7))
}
x1
## [1] 9 4 8 6 9
x2
## [1] 8 9 7 7 9
x <- runif(4)
x
## [1] 0.0233 0.4772 0.7323 0.6927
x
## [1] 0.0233 0.4772 0.7323 0.6927
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Error in library(pryr, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'pryr'
x %<a-% runif(1)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "%<a-%"
x
## [1] 0.0233 0.4772 0.7323 0.6927
x
## [1] 0.0233 0.4772 0.7323 0.6927
x <- 1:2
x
## [1] 1 2
rm("x")
x
## Error in eval(expr, envir, enclos): objet 'x' introuvable

R objects : Integers

(x_num=c(1,6,10))
## [1]  1  6 10
(x_int=c(1L,6L,10L))
## [1]  1  6 10
object.size(x_num)
## 72 bytes
object.size(x_int)
## 56 bytes
typeof(x_num)
## [1] "double"
typeof(x_int)
## [1] "integer"
is.integer(x_num)
## [1] FALSE
is.integer(x_int)
## [1] TRUE
str(x_num)
##  num [1:3] 1 6 10
str(x_int)
##  int [1:3] 1 6 10
c(1,c(2,c(3,c(4,5))))
## [1] 1 2 3 4 5

R objects : Matrices

M<-1:24
dim(M)=c(6,4)
M
##      [,1] [,2] [,3] [,4]
## [1,]    1    7   13   19
## [2,]    2    8   14   20
## [3,]    3    9   15   21
## [4,]    4   10   16   22
## [5,]    5   11   17   23
## [6,]    6   12   18   24
str(M)
##  int [1:6, 1:4] 1 2 3 4 5 6 7 8 9 10 ...
colnames(M)=letters[1:4]
rownames(M)=LETTERS[10:15]
M
##   a  b  c  d
## J 1  7 13 19
## K 2  8 14 20
## L 3  9 15 21
## M 4 10 16 22
## N 5 11 17 23
## O 6 12 18 24
M["K",]
##  a  b  c  d 
##  2  8 14 20
M[c("K","N"),]
##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23
M[c(2,5),]
##   a  b  c  d
## K 2  8 14 20
## N 5 11 17 23
M[c("K",5),]
## Error in M[c("K", 5), ]: indice hors limites

On Windows machines

rm(list=ls())
memory.size()
## Warning: 'memory.size()' is Windows-specific
## [1] Inf
memory.limit()
## Warning: 'memory.limit()' is Windows-specific
## [1] Inf
memory.limit(size=8000)
## Warning: 'memory.limit()' is Windows-specific
## [1] Inf
memory.limit(size=4000)
## Warning: 'memory.limit()' is Windows-specific
## [1] Inf
x3 <- 1:1e3
x4 <- 1:1e4
x5 <- 1:1e5
x6 <- 1:1e6
object.size(x3)
## 4040 bytes
object.size(x4)
## 40040 bytes
object.size(x6)
## 4000040 bytes
z1 <- matrix(0,1e+06,3) 
z2 <- matrix(as.integer(0),1e+06,3) 
object.size(z1)
## 24000200 bytes
print(object.size(z1),units="Mb")
## 22.9 Mb
object.size(z2)
## 12000200 bytes
print(object.size(z2),units="Mb")
## 11.4 Mb
mydat_r <- matrix(rnorm(3*5e6,0,2),5e6,3)
colnames(mydat_r) <- c("first","second","third")
mydat_rdf <- as.data.frame(mydat_r)
print(object.size(mydat_rdf ),units="Mb")
## 114.4 Mb
library(biglm)
## Loading required package: DBI
z1 <- matrix(as.integer(5), 300, 400)
object.size(z1)
## 480200 bytes
library(bigmemory)
## Error in library(bigmemory): there is no package called 'bigmemory'
z2 <- big.matrix(300, 400, type="integer", init=5)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "big.matrix"
object.size(z2)
## 12000200 bytes
rm(list=ls())

R objects : Factors

x <- c("A", "A", "B", "B", "C")
x
## [1] "A" "A" "B" "B" "C"
factor(x)
## [1] A A B B C
## Levels: A B C
relevel(factor(x),"B")
## [1] A A B B C
## Levels: B A C
model.matrix(~0+x)
##   xA xB xC
## 1  1  0  0
## 2  1  0  0
## 3  0  1  0
## 4  0  1  0
## 5  0  0  1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$x
## [1] "contr.treatment"
n <- 10; nn <- 100
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
tapply(x,group,mean)
##    0    1    2    3    4    5    6    7    8    9   10 
## 1.10 1.40 1.75 2.06 2.25 2.60 2.59 2.76 3.04 3.06 3.47
aggregate(x,by=list(group),mean)
##    Group.1    x
## 1        0 1.10
## 2        1 1.40
## 3        2 1.75
## 4        3 2.06
## 5        4 2.25
## 6        5 2.60
## 7        6 2.59
## 8        7 2.76
## 9        8 3.04
## 10       9 3.06
## 11      10 3.47
xg <- split(x, group)
sapply(xg, length)
##   0   1   2   3   4   5   6   7   8   9  10 
##  55  99  92 102 101 102 101 102  93 108  45
sapply(xg, mean)
##    0    1    2    3    4    5    6    7    8    9   10 
## 1.10 1.40 1.75 2.06 2.25 2.60 2.59 2.76 3.04 3.06 3.47
library(pbapply,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Error in library(pbapply, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'pbapply'
n <- 10; nn <- 1e6
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
xg <- split(x, group)
pbsapply(xg, mean)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "pbsapply"
x <- factor(c("b","a","b"))
levels(x)
## [1] "a" "b"
x[3]="c"
## Warning in `[<-.factor`(`*tmp*`, 3, value = "c"): invalid factor level, NA
## generated
x
## [1] b    a    <NA>
## Levels: a b
factor(x,levels=c("b","a"))
## [1] b    a    <NA>
## Levels: b a
x[1]
## [1] b
## Levels: a b
x[1,drop=TRUE]
## [1] b
## Levels: b
U <- runif(20)
cut(U,breaks=2)
##  [1] (0.494,0.971]  (0.494,0.971]  (0.494,0.971]  (0.0169,0.494] (0.494,0.971] 
##  [6] (0.494,0.971]  (0.0169,0.494] (0.494,0.971]  (0.0169,0.494] (0.494,0.971] 
## [11] (0.0169,0.494] (0.0169,0.494] (0.494,0.971]  (0.0169,0.494] (0.0169,0.494]
## [16] (0.494,0.971]  (0.494,0.971]  (0.494,0.971]  (0.0169,0.494] (0.0169,0.494]
## Levels: (0.0169,0.494] (0.494,0.971]
cut(U,breaks=2,labels=c("small","large"))
##  [1] large large large small large large small large small large small small
## [13] large small small large large large small small
## Levels: small large
cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large"))
##  [1] large  large  medium small  large  large  small  large  small  large 
## [11] small  medium medium small  small  medium medium medium medium medium
## Levels: small medium large
table(cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large")))
## 
##  small medium  large 
##      6      8      6

Short Quizz

What would 1:3*1:5 return ?

  1. 1 2 3
  2. 1 2 3 2 4 6 3 6 9 4 8 12 5 10 15
  3. 1 4 9 4 10
  4. 1 2 3 4 5 2 4 6 8 10 3 6 9 12 15

a*b returns the componentwise product of a and b

1:3*1:5 is read (1:3)*(1:5) which returns the componentwise product of 1:3 - or c(1,2,3) - and 1:5 - or c(1,2,3,4,5).

Short Quizz

What would intersect(seq(4,28,by=7),seq (31,3,by=-2)) return ?

  1. 11 25
  2. 25 11
  3. 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31
  4. FALSE

See seq(4,28,by=7) %in% seq(3,31,by=2)

seq(4,28,by=7)[seq(4,28,by=7) %in% seq(3,31,by=2)].

Short Quizz

Consider function dg <- function(M) diag(M[,ncol(M):1]).

Define M <- matrix(1:25,5,5). What would dg(M) return ?

  1. 5 9 13 17 21
  2. 21 17 13 9 5
  3. 1 2 3 4 5
  4. 1 7 13 19 25

See dg return the second diagonal of the matrix (watch the order).

dgreturn the second diagonal of the matrix, starting from the upper right corner (21) and ending with the lower left corner (5).

Short Quizz

Define M <- matrix(1:5 ,3 ,3)

What would M[,2] return ?

  1. 4 5
  2. 4 5 NA
  3. 4 5 NULL
  4. 4 5 1

Observe that M <- matrix(c(1:5,1:4),3,3).

M <- matrix(c(1:5,1:4),3,3).

Short Quizz

Compute \(\sum_{i=10}^{20}(i^2+4/i)\)

  1. 2588.075
  2. 2581.925
  3. 2855.609
  4. 373.2841

Use sum(i^2-4/i).

Set i=10:20 and compute sum(i^2+4/i).

Short Quizz

Solve \[ \left\{ 3x+2y-z=1 \\ 2x-2y+4z=-2 \\ -x+y/2-z=0 \right. \]

  1. -1 2 0
  2. 1 -2 0
  3. 0 1 2
  4. there is no unique solution

Use solve(A,b).

Set A <- matrix(c(3,2,-1,2,-2,.5,-1,4,-1),3,3) and b <- c(1,-2,0) and then compute solve(A,b).

Short Quizz

Compute \(\displaystyle{\sum_{i=1}^{10}\sum_{j=i}^{10} \frac{i^2}{5+i\cdot j}}\)

  1. 32.21865
  2. 27.71865
  3. 2.5
  4. 6.5

Use a double loop for(i in 1:10) for(j in i:10).

Set s <- 0 and then compute for(i in 1:10) for(j in i:10) s <- s+i^2/(5*i*j).

Short Quizz

What sqrt(7)^2 == 7

  1. TRUE
  2. FALSE
  3. ``
  4. NULL

Compute sqrt(7)^2 - 7.

Use all.equal(sqrt(7)^2, 7) to avoid numerical issue like that one.

Short Quizz

Find roots of polynomial \(x^2+x=1\)

  1. ``
  2. ``
  3. ``
  4. NULL

Compute sqrt(7)^2 - 7.

Use Re(polyroot(c(1,1,-1))) or uniroot(f,c(-10,0))$root and uniroot(f,c(0,10))$root

R objects : Strings and Characters

"Be carefull of 'quotes'"
'Be carefull of "quotes"'
cities <- c("New York, NY", "Los Angeles, CA", "Boston, MA")
substr(cities, nchar(cities)-1, nchar(cities))
## [1] "NY" "CA" "MA"
unlist(strsplit(cities, ", "))[seq(2,6,by=2)]
## [1] "NY" "CA" "MA"
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tweet = "Emerging #climate change, environment and sustainability concerns for #actuaries Apr 9 #Toronto. Register TODAY http://bit.ly/CIAClimateForum"
hash="#[a-zA-Z]{1,}"
str_extract(tweet,hash)
## [1] "#climate"
str_extract_all(tweet,hash)
## [[1]]
## [1] "#climate"   "#actuaries" "#Toronto"
str_locate_all(tweet,hash)
## [[1]]
##      start end
## [1,]    10  17
## [2,]    71  80
## [3,]    88  95
urls="http://([\\da-z\\.-]+)\\.([a-z\\.]{2,6})/[a-zA-Z0-9]{1,}"
str_extract_all(tweet,urls)
## [[1]]
## [1] "http://bit.ly/CIAClimateForum"
email="^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$"
grep(pattern = email, x = "charpentier.arthur@uqam.ca")
## [1] 1
grepl(pattern = email, x = "charpentier.arthur@uqam.ca")
## [1] TRUE
str_detect(pattern = email, string=c("charpentier.arthur@uqam.ca","@freakonometrics"))
## [1]  TRUE FALSE
ex_sentence = "This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult"
library(tm,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ex_corpus <- Corpus(VectorSource(ex_sentence))
ex_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult
grep("hard", ex_sentence)
## integer(0)
grep("difficult", ex_sentence)
## [1] 1
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
word(ex_sentence,4)
## [1] "simple"
word(ex_sentence,1:20)
##  [1] "This"      "is"        "1"         "simple"    "sentence," "just"     
##  [7] "to"        "play"      "with,"     "then"      "we'll"     "play"     
## [13] "with"      "4,"        "and"       "that"      "will"      "be"       
## [19] "more"      "difficult"
ex_words <- strsplit(ex_sentence, split=" ")[[1]]
ex_words
##  [1] "This"      "is"        "1"         "simple"    "sentence," "just"     
##  [7] "to"        "play"      "with,"     "then"      "we'll"     "play"     
## [13] "with"      "4,"        "and"       "that"      "will"      "be"       
## [19] "more"      "difficult"
grep(pattern="w",ex_words,value=TRUE)
## [1] "with," "we'll" "with"  "will"
str_count(ex_words,"w")
##  [1] 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0
grep(pattern="[ai]",ex_words,value=TRUE)
##  [1] "This"      "is"        "simple"    "play"      "with,"     "play"     
##  [7] "with"      "and"       "that"      "will"      "difficult"
grep(pattern="[[:punct:]]",ex_words,value=TRUE)
## [1] "sentence," "with,"     "we'll"     "4,"
require(wordcloud)
## Loading required package: wordcloud
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'wordcloud'
wordcloud(ex_corpus)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"
cols<-colorRampPalette(c("lightgrey","blue"))(40)
wordcloud(words = ex_corpus, max.words = 40, random.order=FALSE, scale = c(5, 0.5), colors=cols)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"
tdm <- TermDocumentMatrix(ex_corpus)
tdm
## <<TermDocumentMatrix (terms: 12, documents: 1)>>
## Non-/sparse entries: 12/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 12, documents: 1)>>
## Non-/sparse entries: 12/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##            Docs
## Terms       1
##   and       1
##   difficult 1
##   just      1
##   more      1
##   play      2
##   sentence  1
##   simple    1
##   that      1
##   then      1
##   with      2
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult
fix_contractions <- function(doc) {
  doc <- gsub("won't", "will not", doc)
  doc <- gsub("n't", " not", doc)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'s", "", doc) 
  return(doc)
}
ex_corpus <- tm_map(ex_corpus, fix_contractions)
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] This is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult
ex_corpus <- tm_map(ex_corpus, tolower)
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is 1 simple sentence, just to play with, then we will play with 4, and that will be more difficult
ex_corpus <- tm_map(ex_corpus, removeNumbers)
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is  simple sentence, just to play with, then we will play with , and that will be more difficult
ex_corpus <- tm_map(ex_corpus,gsub, pattern= "[[:punct:]]", replacement = " ")
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult
stopwords("en")[sample(1:length(stopwords("en")),size=10)]
##  [1] "very"   "when"   "who"    "what's" "they'd" "i'm"    "am"     "she's" 
##  [9] "then"   "here"
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is  simple sentence  just to play with  then we will play with   and that will be more difficult
library(SnowballC,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ex_corpus <- tm_map(ex_corpus, stemDocument)
inspect(ex_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] this is simpl sentenc just to play with then we will play with and that will be more difficult
wordcloud(ex_corpus[[1]])
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"
wordcloud(words = ex_corpus[[1]], max.words = 40, random.order=FALSE, scale = c(5, 0.5),colors=cols)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "wordcloud"

Short Quizz

What would grep("ab",c("abc","b","a","ba","cab")) return?

  1. TRUE
  2. 1 5
  3. abc cab
  4. TRUE FALSE FALSE FALSE TRUE

Use ?grep.

only two elements contain ab in that order.

R objects : Strings and Characters

some.dates <- as.Date(c("16/10/12","19/11/12"),format="%d/%m/%y")
diff(some.dates)
## Time difference of 34 days
sequence.date <- seq(from=some.dates[1],to=some.dates[2],by=7)
format(sequence.date,"%b")
## [1] "oct." "oct." "oct." "nov." "nov."
months(sequence.date)
## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"
weekdays(some.dates)
## [1] "mardi" "lundi"
substr(as.POSIXct(some.dates), 1, 4)
## [1] "2012" "2012"
strftime(some.dates,"%Y")
## [1] "2012" "2012"
Sys.setlocale("LC_TIME", "fr_FR")
## Warning in Sys.setlocale("LC_TIME", "fr_FR"): la requête OS pour spécifier la
## localisation à "fr_FR" n'a pas pu être honorée
## [1] ""
months(sequence.date)
## [1] "octobre"  "octobre"  "octobre"  "novembre" "novembre"

R objects : Lists

x <- list(1:5,c(1,2,3,4,5),a="test",
b=c(TRUE,FALSE),rpois(5,8))
x
## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[5]]
## [1]  6  7 13  6  4
str(x)
## List of 5
##  $  : int [1:5] 1 2 3 4 5
##  $  : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 6 7 13 6 4
x <- list(
  list(1:5,c(1,2,3,4,5)),a="test",
  b=c(TRUE,FALSE),rpois(5,8))
x
## [[1]]
## [[1]][[1]]
## [1] 1 2 3 4 5
## 
## [[1]][[2]]
## [1] 1 2 3 4 5
## 
## 
## $a
## [1] "test"
## 
## $b
## [1]  TRUE FALSE
## 
## [[4]]
## [1]  5 10 10  7  8
names(x)
## [1] ""  "a" "b" ""
str(x)
## List of 4
##  $  :List of 2
##   ..$ : int [1:5] 1 2 3 4 5
##   ..$ : num [1:5] 1 2 3 4 5
##  $ a: chr "test"
##  $ b: logi [1:2] TRUE FALSE
##  $  : int [1:5] 5 10 10 7 8
is.list(x)
## [1] TRUE
is.recursive(x)
## [1] TRUE
x[[3]]
## [1]  TRUE FALSE
x[["b"]]
## [1]  TRUE FALSE
x[[1]]
## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] 1 2 3 4 5
x[[1]][[1]]
## [1] 1 2 3 4 5
f <- function(x) { return(x*(1-x)) }
optim.f <- optimize(f, interval=c(0, 1), maximum=TRUE)
str(optim.f)
## List of 2
##  $ maximum  : num 0.5
##  $ objective: num 0.25
list(abc=1)$a
## [1] 1
list(abc=1,a=2)$a
## [1] 2
list(bac=1)$a
## NULL
list(abc=1,b=2)$a
## [1] 1

R objects : Functions

factorial
## function (x) 
## gamma(x + 1)
## <bytecode: 0x5583a3a67628>
## <environment: namespace:base>
gamma
## function (x)  .Primitive("gamma")
?gamma

If Condition

set.seed(1)
u <- runif(1)
if(u>.5) {("greater than 50%")} else {("smaller than 50%")}
## [1] "smaller than 50%"
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
## [1] "smaller than 50%"
u
## [1] 0.266
u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}
## Warning in if (u > 0.5) {: la condition a une longueur > 1 et seul le premier
## élément est utilisé
## [1] "smaller than 50%"
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
## [1] "smaller than 50%" "greater than 50%" "greater than 50%"
u                
## [1] 0.372 0.573 0.908

Loops

v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))

sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))
##    user  system elapsed 
##    9.91    0.08    9.99
sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])

sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))
##    user  system elapsed 
##   0.104   0.000   0.105
system.time(Vectorize(sqrt)(v_x))
##    user  system elapsed 
##       0       0       0
sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))
##    user  system elapsed 
##   0.028   0.000   0.028
library(parallel,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
(allcores <- detectCores(all.tests=TRUE))
## [1] 4
sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))
##    user  system elapsed 
##   0.060   0.040   0.047
f<-function(x) dlnorm(x)
integrate(f,0,Inf)
## 1 with absolute error < 2.5e-07
integrate(f,0,1e5)
## 1.82e-05 with absolute error < 3.6e-05
integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value
## [1] 1

Apply

rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)
set.seed(1)
rN.Poisson(5)
## [1] 4 4 5 8 3
rX.Exponential(3)
## [1] 0.229 0.129 1.447
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for(i in 1:n){
    N <- rN(1)
    if(N>0){V[i] <- sum(rX(N))}
  }
  return(V)}
rcpd1(3)
## [1] 1.52 3.88 0.79
rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for (i in 1:n) V[i] <- sum(rX(rN(1)))
  return(V)}
rcpd1(3)
## [1] 0.893 3.368 3.023
rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  return(as.numeric(xtabs(X ~ I)))}
rcpd2(3)
## [1] 2.29 2.06 5.46
rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  V <- tapply(X,I,sum)
  V[is.na(V)] <- 0
  return(as.numeric(V))}
rcpd3(3)
## [1] 2.22 6.03 4.33
rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(rN(n), function(x) sum(rX(x))))}
rcpd4(3)
## [1] 2.83 4.09 2.99
rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(Vectorize(rX)(rN(n)),sum))}
rcpd5(3)
## [1] 1.81 3.69 3.60
rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(unlist(lapply(lapply(t(rN(n)),rX),sum)))}
rcpd6(3)
## [1] 0.291 1.344 2.948
n <- 100
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
options(digits=1)
microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n))
## Unit: microseconds
##      expr min  lq mean median   uq  max neval cld
##  rcpd1(n) 548 580  734    621  698 5570   100  b 
##  rcpd2(n) 766 835  950    878 1010 1992   100   c
##  rcpd3(n) 337 373  412    386  414  797   100 a  
##  rcpd4(n) 350 371  430    392  428  980   100 a  
##  rcpd5(n) 399 439  492    460  502  942   100 a  
##  rcpd6(n) 320 337  394    361  401  760   100 a
options(digits=5)

Vectorization

f <- function(x,m=0,s=1){
  H<-function(t) 1-pnorm(t,m,s)
  integral<-integrate(H,lower=x,upper=Inf)$value
  res<-H(x)/integral
  return(res)
}
f(0)
## [1] 1.2533
f(0:1) 
## Warning in if (is.finite(lower)) {: la condition a une longueur > 1 et seul le
## premier élément est utilisé
## [1] 1.25331 0.39769
y <- rep(NA,2)
x <- 0:1
for(i in 1:2) y[i] <- f(x[i])
y
## [1] 1.2533 1.9043
sapply(x,"f")
## [1] 1.2533 1.9043
Vectorize(f)(x)
## [1] 1.2533 1.9043
binorm <- function(x1,x2,r=0){
  exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2))
}
u <- seq(-2,2)
binorm(u,u)
## [1] 0.002915 0.058550 0.159155 0.058550 0.002915
outer(u,u,binorm)
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.002915 0.013064 0.021539 0.013064 0.002915
## [2,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [3,] 0.021539 0.096532 0.159155 0.096532 0.021539
## [4,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [5,] 0.002915 0.013064 0.021539 0.013064 0.002915
(uv<-expand.grid(u,u))
##    Var1 Var2
## 1    -2   -2
## 2    -1   -2
## 3     0   -2
## 4     1   -2
## 5     2   -2
## 6    -2   -1
## 7    -1   -1
## 8     0   -1
## 9     1   -1
## 10    2   -1
## 11   -2    0
## 12   -1    0
## 13    0    0
## 14    1    0
## 15    2    0
## 16   -2    1
## 17   -1    1
## 18    0    1
## 19    1    1
## 20    2    1
## 21   -2    2
## 22   -1    2
## 23    0    2
## 24    1    2
## 25    2    2
matrix(binorm(uv$Var1,uv$Var2),5,5)
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.002915 0.013064 0.021539 0.013064 0.002915
## [2,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [3,] 0.021539 0.096532 0.159155 0.096532 0.021539
## [4,] 0.013064 0.058550 0.096532 0.058550 0.013064
## [5,] 0.002915 0.013064 0.021539 0.013064 0.002915
u <- seq(-2,2,length=31)
persp(u,u,outer(u,u,binorm),theta=25)

plot of chunk unnamed-chunk-93

Computational Efficiency

library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
n <- 1000
A<-matrix(seq(1,n^2),n,n)
B<-matrix(seq(1,n^2),n,n)
C<-1:n
microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1)
## Unit: nanoseconds
##             expr       min        lq       mean    median         uq        max
##  (A %*% B) %*% C 741482231 828932318 9.0671e+08 881234170 9.5726e+08 1593058647
##  A %*% (B %*% C)   4658177   5269338 8.5710e+06   6380956 9.0802e+06  105423967
##     replications         5        21 2.7901e+02       246 3.1850e+02       2706
##  neval cld
##    100   b
##    100  a 
##    100  a
m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
group <- rep(c("A","B"), each = n / 2)

system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat)
##    user  system elapsed 
##   0.716   0.000   0.713
system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat)
##    user  system elapsed 
##   0.188   0.000   0.189
t.test_1 <- function(x, grp){
  t.test(x[ group == "A"], x[ group == "B"])$stat
}
system.time(t1 <- apply(X, 1, t.test_1, grp = grp))
##    user  system elapsed 
##   0.212   0.000   0.213
t.test_2 <- function(x, grp) {
  t_stat <- function(x) {
    m <- mean(x)
    n <- length(x)
    var <- sum((x - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(x[grp == "A"])
  g2 <- t_stat(x[grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time(apply(X, 1, t.test_2, grp = group))
##    user  system elapsed 
##   0.028   0.000   0.027
t.test_3 <- function(X, grp){
  t_stat <- function(X) {
    m <- rowMeans(X)
    n <- ncol(X)
    var <- rowSums((X - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(X[, grp == "A"])
  g2 <- t_stat(X[, grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time( t.test_3(X, group))
##    user  system elapsed 
##   0.000   0.000   0.001
"%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s
100 %pm% 10
## [1]  83.551 116.449
'second<-' <- function(x, value) {
  x[2] <- value
  return(x)
}
x <- 1:10
second(x) <- 5
x
##  [1]  1  5  3  4  5  6  7  8  9 10
library(pryr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Error in library(pryr, verbose = FALSE, quietly = TRUE, warn.conflicts = FALSE): there is no package called 'pryr'
x <- 1:10
address(x)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "address"
second(x) <- 6
address(x)
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "address"
ls(globalenv())
## [1] "BMI4"
find("x")
## character(0)
find("pi")
## [1] "package:base"
environment(sd)
## <environment: namespace:stats>
f <- function(x) x^2
f
## function(x) x^2
## <environment: 0x558395eef418>
formals(f)
## $x
body(f)
## x^2
e <- new.env()
e$d <- 1
e$f <- 1:5
e$g <- 1:5
ls(e)
## [1] "d" "f" "g"
str(e)
## <environment: 0x5583a5be7960>
identical(globalenv(),e)
## [1] FALSE
search()
##  [1] ".GlobalEnv"             "package:microbenchmark" "package:parallel"      
##  [4] "package:SnowballC"      "package:stringr"        "package:biglm"         
##  [7] "package:DBI"            "package:VGAM"           "package:splines"       
## [10] "package:stats4"         "package:SparseM"        "package:tm"            
## [13] "package:NLP"            "package:leaflet"        "package:dplyr"         
## [16] "package:htmlwidgets"    "package:knitr"          "package:stats"         
## [19] "package:graphics"       "package:grDevices"      "package:utils"         
## [22] "package:datasets"       "package:methods"        "Autoloads"             
## [25] "package:base"
f1 <- function(x1){
  f2 <- function(x2){
    f3 <- function(x3){
      x1+x2+x3
    }
    f3(3)
  } 
  f2(2)
}
f1(1)
## [1] 6
f <- function(x) log(x)
f("x")
## Error in log(x): argument non numérique pour une fonction mathématique
try(f("x"))
inherits(try(f("x")), "try-error")
## [1] TRUE
inherits(try(f("x"), silent=TRUE), "try-error")
## [1] TRUE
a=0
try(a<-f("x"), silent=TRUE)
a
## [1] 0
try(a<-f(x), silent=TRUE)
a
##  [1] 0.0000 1.7918 1.0986 1.3863 1.6094 1.7918 1.9459 2.0794 2.1972 2.3026
x=1:10
g=function(f) f(x)
g(mean)
## [1] 5.5
fibonacci <- function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
}
fibonacci(20)
## [1] 10946
system.time(fibonacci(30))
##    user  system elapsed 
##   2.056   0.004   2.094
library(memoise,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fibonacci <- memoise(function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
})
system.time(fibonacci(30))
##    user  system elapsed 
##   0.016   0.000   0.017
set.seed(1)
x<-rexp(6)
sum(x)
## [1] 5.5534
.Primitive("sum")(x)
## [1] 5.5534
library(Rcpp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
cppFunction('double sum_C(NumericVector x) {
            int n = x.size();
            double total = 0;
            for(int i = 0; i < n; ++i) {
            total += x[i];
            }
            return total;
            }')
sum_C(x)
## [1] 5.5534
library(microbenchmark,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
x <- runif(1e6)
microbenchmark(
  sum(x),
  .Primitive("sum")(x),
  sum_C(x))
## Unit: microseconds
##                  expr     min      lq   mean median     uq    max neval cld
##                sum(x)  968.60  987.35 1034.0 1021.8 1061.2 1216.9   100  a 
##  .Primitive("sum")(x)  971.29  987.96 1029.3 1010.9 1045.5 1211.6   100  a 
##              sum_C(x) 1247.59 1265.43 1310.7 1300.8 1323.9 1543.4   100   b
names_list <- function(...){
  names(list(...))
}
names_list(a=5,b=7)
## [1] "a" "b"
power <- function(exponent) {
  function(x) {
    x ^ exponent
  }
}
square <- power(2)
square(4)
## [1] 16
cube <- power(3)
cube(4)
## [1] 64
cube
## function(x) {
##     x ^ exponent
##   }
## <environment: 0x5583a5a86520>
total <- 20
pb <- txtProgressBar(min = 0, max = total, style = 3)
## 
  |                                                                            
  |                                                                      |   0%
for(i in 1:(10*total)){
  Sys.sleep(0.01)
  if(i %%10 == 0) setTxtProgressBar(pb, i%/%10)
}
## 
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |======================================================================| 100%
close(pb)

R objects : Data Frames

df <- data.frame(x=1:3,y=letters[1:3])
str(df)
## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: Factor w/ 3 levels "a","b","c": 1 2 3
typeof(df)
## [1] "list"
class(df)
## [1] "data.frame"
cbind(df,z=9:7)
##   x y z
## 1 1 a 9
## 2 2 b 8
## 3 3 c 7
df$z<-5:3
df
##   x y z
## 1 1 a 5
## 2 2 b 4
## 3 3 c 3
df[1]
##   x
## 1 1
## 2 2
## 3 3
df[,1,drop=FALSE]
##   x
## 1 1
## 2 2
## 3 3
df[,1,drop=TRUE]
## [1] 1 2 3
df[[1]]
## [1] 1 2 3
df$x
## [1] 1 2 3
df[,"x"]
## [1] 1 2 3
df[["x"]]
## [1] 1 2 3
set.seed(1)
df[sample(nrow(df)),]
##   x y z
## 1 1 a 5
## 3 3 c 3
## 2 2 b 4
set.seed(1)
df[sample(nrow(df),nrow(df)*2,replace=TRUE),]
##     x y z
## 1   1 a 5
## 2   2 b 4
## 2.1 2 b 4
## 3   3 c 3
## 1.1 1 a 5
## 3.1 3 c 3
subset(df, x>1)
##   x y z
## 2 2 b 4
## 3 3 c 3
set.seed(123)
df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50,
replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE)))
tail(df)
##            Y X1 X2
## 45  1.207962  D  3
## 46 -1.123109  A  2
## 47 -0.402885  A  2
## 48 -0.466655  A  2
## 49  0.779965  A  1
## 50 -0.083369  C  2
reg <- lm(Y~X1+X2,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1B X1C X1D X22 X23
## 45           1   0   0   1   0   1
## 46           1   0   0   0   1   0
## 47           1   0   0   0   1   0
## 48           1   0   0   0   1   0
## 49           1   0   0   0   0   0
## 50           1   0   1   0   1   0
reg <- lm(Y~(X1=="A")+X2,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1 == "A"TRUE X22 X23
## 45           1             0   0   1
## 46           1             1   1   0
## 47           1             1   1   0
## 48           1             1   1   0
## 49           1             1   0   0
## 50           1             0   1   0
reg <- lm(Y~X1*X2,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1B X1C X1D X22 X23 X1B:X22 X1C:X22 X1D:X22 X1B:X23 X1C:X23
## 45           1   0   0   1   0   1       0       0       0       0       0
## 46           1   0   0   0   1   0       0       0       0       0       0
## 47           1   0   0   0   1   0       0       0       0       0       0
## 48           1   0   0   0   1   0       0       0       0       0       0
## 49           1   0   0   0   0   0       0       0       0       0       0
## 50           1   0   1   0   1   0       0       1       0       0       0
##    X1D:X23
## 45       1
## 46       0
## 47       0
## 48       0
## 49       0
## 50       0
reg <- lm(Y~X1+X2%in%X1,data=df)
model.matrix(reg)[45:50,]
##    (Intercept) X1B X1C X1D X1A:X22 X1B:X22 X1C:X22 X1D:X22 X1A:X23 X1B:X23
## 45           1   0   0   1       0       0       0       0       0       0
## 46           1   0   0   0       1       0       0       0       0       0
## 47           1   0   0   0       1       0       0       0       0       0
## 48           1   0   0   0       1       0       0       0       0       0
## 49           1   0   0   0       0       0       0       0       0       0
## 50           1   0   1   0       0       0       1       0       0       0
##    X1C:X23 X1D:X23
## 45       0       1
## 46       0       0
## 47       0       0
## 48       0       0
## 49       0       0
## 50       0       0
download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData")
load("superheroes.RData")
superheroes
##       name alignment gender         publisher
## 1  Magneto       bad   male            Marvel
## 2    Storm      good female            Marvel
## 3 Mystique       bad female            Marvel
## 4   Batman      good   male                DC
## 5    Joker       bad   male                DC
## 6 Catwoman       bad female                DC
## 7  Hellboy      good   male Dark Horse Comics
publishers
##   publisher yr_founded
## 1        DC       1934
## 2    Marvel       1939
## 3     Image       1992
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
inner_join(superheroes, publishers)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##       name alignment gender publisher yr_founded
## 1  Magneto       bad   male    Marvel       1939
## 2    Storm      good female    Marvel       1939
## 3 Mystique       bad female    Marvel       1939
## 4   Batman      good   male        DC       1934
## 5    Joker       bad   male        DC       1934
## 6 Catwoman       bad female        DC       1934
inner_join(publishers, superheroes)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female
merge(superheroes, publishers, all = TRUE)
##           publisher     name alignment gender yr_founded
## 1 Dark Horse Comics  Hellboy      good   male         NA
## 2                DC   Batman      good   male       1934
## 3                DC    Joker       bad   male       1934
## 4                DC Catwoman       bad female       1934
## 5            Marvel  Magneto       bad   male       1939
## 6            Marvel    Storm      good female       1939
## 7            Marvel Mystique       bad female       1939
## 8             Image     <NA>      <NA>   <NA>       1992
semi_join(superheroes, publishers)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##       name alignment gender publisher
## 1  Magneto       bad   male    Marvel
## 2    Storm      good female    Marvel
## 3 Mystique       bad female    Marvel
## 4   Batman      good   male        DC
## 5    Joker       bad   male        DC
## 6 Catwoman       bad female        DC
semi_join(publishers, superheroes)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##   publisher yr_founded
## 1        DC       1934
## 2    Marvel       1939
left_join(superheroes, publishers)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##       name alignment gender         publisher yr_founded
## 1  Magneto       bad   male            Marvel       1939
## 2    Storm      good female            Marvel       1939
## 3 Mystique       bad female            Marvel       1939
## 4   Batman      good   male                DC       1934
## 5    Joker       bad   male                DC       1934
## 6 Catwoman       bad female                DC       1934
## 7  Hellboy      good   male Dark Horse Comics         NA
left_join(publishers, superheroes)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##   publisher yr_founded     name alignment gender
## 1        DC       1934   Batman      good   male
## 2        DC       1934    Joker       bad   male
## 3        DC       1934 Catwoman       bad female
## 4    Marvel       1939  Magneto       bad   male
## 5    Marvel       1939    Storm      good female
## 6    Marvel       1939 Mystique       bad female
## 7     Image       1992     <NA>      <NA>   <NA>
anti_join(superheroes, publishers)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##      name alignment gender         publisher
## 1 Hellboy      good   male Dark Horse Comics
anti_join(publishers, superheroes)
## Joining, by = "publisher"
## Warning: Column `publisher` joining factors with different levels, coercing to
## character vector
##   publisher yr_founded
## 1     Image       1992
download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt",
              "gapminderDataFiveYear.txt")
gdf <- read.delim("gapminderDataFiveYear.txt")
head(gdf,4)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801    779.45
## 2 Afghanistan 1957  9240934      Asia  30.332    820.85
## 3 Afghanistan 1962 10267083      Asia  31.997    853.10
## 4 Afghanistan 1967 11537966      Asia  34.020    836.20
str(gdf)
## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gtbl <- tbl_df(gdf)
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
gdfdt <- data.table(gdf)
gdfdt
##           country year      pop continent lifeExp gdpPercap
##    1: Afghanistan 1952  8425333      Asia  28.801    779.45
##    2: Afghanistan 1957  9240934      Asia  30.332    820.85
##    3: Afghanistan 1962 10267083      Asia  31.997    853.10
##    4: Afghanistan 1967 11537966      Asia  34.020    836.20
##    5: Afghanistan 1972 13079460      Asia  36.088    739.98
##   ---                                                      
## 1700:    Zimbabwe 1987  9216418    Africa  62.351    706.16
## 1701:    Zimbabwe 1992 10704340    Africa  60.377    693.42
## 1702:    Zimbabwe 1997 11404948    Africa  46.809    792.45
## 1703:    Zimbabwe 2002 11926563    Africa  39.989    672.04
## 1704:    Zimbabwe 2007 12311143    Africa  43.487    469.71
subset(gdf, lifeExp < 30)
##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia  28.801    779.45
## 1293      Rwanda 1992 7290203    Africa  23.599    737.07
gdf[lifeExp < 30,]
## Error in `[.data.frame`(gdf, lifeExp < 30, ): objet 'lifeExp' introuvable
gdf[gdf$lifeExp < 30,]
##          country year     pop continent lifeExp gdpPercap
## 1    Afghanistan 1952 8425333      Asia  28.801    779.45
## 1293      Rwanda 1992 7290203    Africa  23.599    737.07
filter(gtbl, lifeExp < 30)
## # A tibble: 2 x 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07
subset(gdfdt, lifeExp < 30)
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
gdfdt[lifeExp < 30,]
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
gdf[gdf$country == "Italy", c("year", "lifeExp")]
##     year lifeExp
## 769 1952  65.940
## 770 1957  67.810
## 771 1962  69.240
## 772 1967  71.060
## 773 1972  72.190
## 774 1977  73.480
## 775 1982  74.980
## 776 1987  76.420
## 777 1992  77.440
## 778 1997  78.820
## 779 2002  80.240
## 780 2007  80.546
gtbl %>% 
  filter(country == "Italy") %>%
  select(year, lifeExp)
## # A tibble: 12 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952  65.940
##  2  1957  67.810
##  3  1962  69.240
##  4  1967  71.060
##  5  1972  72.190
##  6  1977  73.480
##  7  1982  74.980
##  8  1987  76.420
##  9  1992  77.440
## 10  1997  78.820
## 11  2002  80.240
## 12  2007  80.546
gdfdt[country == "Italy",cbind(year, lifeExp)]
##       year lifeExp
##  [1,] 1952  65.940
##  [2,] 1957  67.810
##  [3,] 1962  69.240
##  [4,] 1967  71.060
##  [5,] 1972  72.190
##  [6,] 1977  73.480
##  [7,] 1982  74.980
##  [8,] 1987  76.420
##  [9,] 1992  77.440
## [10,] 1997  78.820
## [11,] 2002  80.240
## [12,] 2007  80.546
setkey(gdfdt,country)
gdfdt["Italy",cbind(year, lifeExp)]
##       year lifeExp
##  [1,] 1952  65.940
##  [2,] 1957  67.810
##  [3,] 1962  69.240
##  [4,] 1967  71.060
##  [5,] 1972  72.190
##  [6,] 1977  73.480
##  [7,] 1982  74.980
##  [8,] 1987  76.420
##  [9,] 1992  77.440
## [10,] 1997  78.820
## [11,] 2002  80.240
## [12,] 2007  80.546

Data frames (and other format)

library(dplyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ldf <- tbl_df(gdf)
glimpse(ldf)
## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afgh...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 199...
## $ pop       <dbl> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, As...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 4...
## $ gdpPercap <dbl> 779.45, 820.85, 853.10, 836.20, 739.98, 786.11, 978.01, 8...
select(ldf,lifeExp)
## # A tibble: 1,704 x 1
##    lifeExp
##      <dbl>
##  1  28.801
##  2  30.332
##  3  31.997
##  4  34.020
##  5  36.088
##  6  38.438
##  7  39.854
##  8  40.822
##  9  41.674
## 10  41.763
## # ... with 1,694 more rows
select(ldf,lifeExp)$lifeExp[1:10]
##  [1] 28.801 30.332 31.997 34.020 36.088 38.438 39.854 40.822 41.674 41.763
ldf[4:5,]
## # A tibble: 2 x 6
##       country  year      pop continent lifeExp gdpPercap
##        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1967 11537966      Asia  34.020    836.20
## 2 Afghanistan  1972 13079460      Asia  36.088    739.98
slice(ldf,4:5)
## # A tibble: 2 x 6
##       country  year      pop continent lifeExp gdpPercap
##        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1967 11537966      Asia  34.020    836.20
## 2 Afghanistan  1972 13079460      Asia  36.088    739.98
filter(ldf,lifeExp<30 | gdpPercap<300)
## # A tibble: 6 x 6
##            country  year      pop continent lifeExp gdpPercap
##             <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
## 1      Afghanistan  1952  8425333      Asia  28.801    779.45
## 2 Congo, Dem. Rep.  2002 55379852    Africa  44.966    241.17
## 3 Congo, Dem. Rep.  2007 64606759    Africa  46.462    277.55
## 4    Guinea-Bissau  1952   580653    Africa  32.500    299.85
## 5          Lesotho  1952   748747    Africa  42.138    298.85
## 6           Rwanda  1992  7290203    Africa  23.599    737.07
rename(ldf,population = pop)
## # A tibble: 1,704 x 6
##        country  year population continent lifeExp gdpPercap
##         <fctr> <int>      <dbl>    <fctr>   <dbl>     <dbl>
##  1 Afghanistan  1952    8425333      Asia  28.801    779.45
##  2 Afghanistan  1957    9240934      Asia  30.332    820.85
##  3 Afghanistan  1962   10267083      Asia  31.997    853.10
##  4 Afghanistan  1967   11537966      Asia  34.020    836.20
##  5 Afghanistan  1972   13079460      Asia  36.088    739.98
##  6 Afghanistan  1977   14880372      Asia  38.438    786.11
##  7 Afghanistan  1982   12881816      Asia  39.854    978.01
##  8 Afghanistan  1987   13867957      Asia  40.822    852.40
##  9 Afghanistan  1992   16317921      Asia  41.674    649.34
## 10 Afghanistan  1997   22227415      Asia  41.763    635.34
## # ... with 1,694 more rows
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dt <- data.table(gdf)
is.data.frame(dt)
## [1] TRUE
object.size(gdf)
## 72184 bytes
object.size(ldf)
## 72304 bytes
object.size(dt)
## 72880 bytes
df[df$lifeExp<30,]
## [1] Y  X1 X2
## <0 rows> (or 0-length row.names)
subset(df, lifeExp < 30)
## Error in eval(expr, envir, enclos): objet 'lifeExp' introuvable
filter(ldf, lifeExp < 30)
## # A tibble: 2 x 6
##       country  year     pop continent lifeExp gdpPercap
##        <fctr> <int>   <dbl>    <fctr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801    779.45
## 2      Rwanda  1992 7290203    Africa  23.599    737.07
subset(dt, lifeExp < 30)
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
dt[lifeExp<30]
##        country year     pop continent lifeExp gdpPercap
## 1: Afghanistan 1952 8425333      Asia  28.801    779.45
## 2:      Rwanda 1992 7290203    Africa  23.599    737.07
setkey(dt, country)
dt["Italy"]
##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe  65.940    4931.4
##  2:   Italy 1957 49182000    Europe  67.810    6248.7
##  3:   Italy 1962 50843200    Europe  69.240    8243.6
##  4:   Italy 1967 52667100    Europe  71.060   10022.4
##  5:   Italy 1972 54365564    Europe  72.190   12269.3
##  6:   Italy 1977 56059245    Europe  73.480   14256.0
##  7:   Italy 1982 56535636    Europe  74.980   16537.5
##  8:   Italy 1987 56729703    Europe  76.420   19207.2
##  9:   Italy 1992 56840847    Europe  77.440   22013.6
## 10:   Italy 1997 57479469    Europe  78.820   24675.0
## 11:   Italy 2002 57926999    Europe  80.240   27968.1
## 12:   Italy 2007 58147733    Europe  80.546   28569.7
dt["Italy",mult="first"]
##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1952 47666000    Europe   65.94    4931.4
dt["Italy",max(lifeExp)]
## [1] 80.546
max(df$lifeExp[df$country=="Italy"])
## Warning in max(df$lifeExp[df$country == "Italy"]): aucun argument pour max ; -
## Inf est renvoyé
## [1] -Inf
setkey(dt, country, lifeExp) 
dt["Italy"]
##     country year      pop continent lifeExp gdpPercap
##  1:   Italy 1952 47666000    Europe  65.940    4931.4
##  2:   Italy 1957 49182000    Europe  67.810    6248.7
##  3:   Italy 1962 50843200    Europe  69.240    8243.6
##  4:   Italy 1967 52667100    Europe  71.060   10022.4
##  5:   Italy 1972 54365564    Europe  72.190   12269.3
##  6:   Italy 1977 56059245    Europe  73.480   14256.0
##  7:   Italy 1982 56535636    Europe  74.980   16537.5
##  8:   Italy 1987 56729703    Europe  76.420   19207.2
##  9:   Italy 1992 56840847    Europe  77.440   22013.6
## 10:   Italy 1997 57479469    Europe  78.820   24675.0
## 11:   Italy 2002 57926999    Europe  80.240   27968.1
## 12:   Italy 2007 58147733    Europe  80.546   28569.7
dt[.("Italy",77.44)]
##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe   77.44     22014
dt[.("Italy",77)]
##    country year pop continent lifeExp gdpPercap
## 1:   Italy   NA  NA        NA      77        NA
dt[.("Italy",77),roll="nearest"]
##    country year      pop continent lifeExp gdpPercap
## 1:   Italy 1992 56840847    Europe      77     22014
df[df$country == "Italy", c("year", "lifeExp")]
## Error in `[.data.frame`(df, df$country == "Italy", c("year", "lifeExp")): undefined columns selected
ldf %>%
  filter(country == "Italy") %>%
  select(year, lifeExp)
## # A tibble: 12 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952  65.940
##  2  1957  67.810
##  3  1962  69.240
##  4  1967  71.060
##  5  1972  72.190
##  6  1977  73.480
##  7  1982  74.980
##  8  1987  76.420
##  9  1992  77.440
## 10  1997  78.820
## 11  2002  80.240
## 12  2007  80.546
small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")]
## Error in `[.data.frame`(df, df$country %in% c("France", "Italy", "Spain"), : undefined columns selected
library(tidyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
str(small_df)
## Error in str(small_df): objet 'small_df' introuvable
table_df=spread(small_df,"country","lifeExp")
## Error in spread(small_df, "country", "lifeExp"): objet 'small_df' introuvable
table_df
## Error in eval(expr, envir, enclos): objet 'table_df' introuvable
data_df=gather(table_df,"country","lifeExp",2:4)
## Error in gather(table_df, "country", "lifeExp", 2:4): objet 'table_df' introuvable
data_df
## Error in eval(expr, envir, enclos): objet 'data_df' introuvable
library(reshape2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
dcast(small_df, formula = country~year,
      value.var="lifeExp")
## Error in match(x, table, nomatch = 0L): objet 'small_df' introuvable
table_df= dcast(small_df, formula = year~country,
                value.var="lifeExp")
## Error in match(x, table, nomatch = 0L): objet 'small_df' introuvable
table_df
## Error in eval(expr, envir, enclos): objet 'table_df' introuvable
melt(table_df, id.vars=c("year"),
     value.name = "lifeExp",
     variable.name="country")
## Error in melt(table_df, id.vars = c("year"), value.name = "lifeExp", variable.name = "country"): objet 'table_df' introuvable
aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country))
## Error in aggregate(small_df$lifeExp, FUN = max, by = list(small_df$country)): objet 'small_df' introuvable
tapply(small_df$lifeExp,factor(small_df$country),max)
## Error in factor(small_df$country): objet 'small_df' introuvable
small_ldf = filter(ldf,country %in% c("France","Italy","Spain"))
group_by(small_ldf,country) %>% summarize(max(lifeExp))
## # A tibble: 3 x 2
##   country `max(lifeExp)`
##    <fctr>          <dbl>
## 1  France         80.657
## 2   Italy         80.546
## 3   Spain         80.941
filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))
## # A tibble: 3 x 2
##   country `max(lifeExp)`
##    <fctr>          <dbl>
## 1  France         80.657
## 2   Italy         80.546
## 3   Spain         80.941
filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp)) 
## # A tibble: 3 x 2
##   country `max(lifeExp)`
##    <fctr>          <dbl>
## 1  France         80.657
## 2   Italy         80.546
## 3   Spain         80.941
plot(df$gdpPercap,df$lifeExp)
## Warning in min(x): aucun argument trouvé pour min ; Inf est renvoyé
## Warning in max(x): aucun argument pour max ; -Inf est renvoyé
## Warning in min(x): aucun argument trouvé pour min ; Inf est renvoyé
## Warning in max(x): aucun argument pour max ; -Inf est renvoyé
## Error in plot.window(...): 'xlim' nécessite des valeurs finies

plot of chunk unnamed-chunk-166

select(ldf,gdpPercap,lifeExp) %>% plot()

plot of chunk unnamed-chunk-167

setkey(dt, country)
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
##    country     V1
## 1:  France 80.657
## 2:   Italy 80.546
## 3:   Spain 80.941
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
##    country     V1
## 1:  France 80.657
## 2:   Italy 80.546
## 3:   Spain 80.941
head(df[with(df,order(-lifeExp)),])
## Error in order(-lifeExp): objet 'lifeExp' introuvable
head(dt[order(-lifeExp)])
##             country year       pop continent lifeExp gdpPercap
## 1:            Japan 2007 127467972      Asia  82.603     31656
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725
## 3:            Japan 2002 127065841      Asia  82.000     28605
## 4:          Iceland 2007    301931    Europe  81.757     36181
## 5:      Switzerland 2007   7554661    Europe  81.701     37506
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209
head(setorder(dt,-lifeExp))
##             country year       pop continent lifeExp gdpPercap
## 1:            Japan 2007 127467972      Asia  82.603     31656
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725
## 3:            Japan 2002 127065841      Asia  82.000     28605
## 4:          Iceland 2007    301931    Europe  81.757     36181
## 5:      Switzerland 2007   7554661    Europe  81.701     37506
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209
arrange(ldf,-lifeExp)
## # A tibble: 1,704 x 6
##             country  year       pop continent lifeExp gdpPercap
##              <fctr> <int>     <dbl>    <fctr>   <dbl>     <dbl>
##  1            Japan  2007 127467972      Asia  82.603     31656
##  2 Hong Kong, China  2007   6980412      Asia  82.208     39725
##  3            Japan  2002 127065841      Asia  82.000     28605
##  4          Iceland  2007    301931    Europe  81.757     36181
##  5      Switzerland  2007   7554661    Europe  81.701     37506
##  6 Hong Kong, China  2002   6762476      Asia  81.495     30209
##  7        Australia  2007  20434176   Oceania  81.235     34435
##  8            Spain  2007  40448191    Europe  80.941     28821
##  9           Sweden  2007   9031088    Europe  80.884     33860
## 10           Israel  2007   6426679      Asia  80.745     25523
## # ... with 1,694 more rows
df$gdp=df$pop*df$gdpPercap
## Error in `$<-.data.frame`(`*tmp*`, "gdp", value = numeric(0)): replacement has 0 rows, data has 50
df = within(df,gdp = pop*gdpPercap)
## Error in eval(expr, envir, enclos): argument manquant, sans valeur associée par défaut
df = within(df,gdp <- pop*gdpPercap)
## Error in eval(expr, envir, enclos): objet 'pop' introuvable
dt[,gdp:=pop*gdpPercap]
head(dt)
##             country year       pop continent lifeExp gdpPercap        gdp
## 1:            Japan 2007 127467972      Asia  82.603     31656 4.0351e+12
## 2: Hong Kong, China 2007   6980412      Asia  82.208     39725 2.7730e+11
## 3:            Japan 2002 127065841      Asia  82.000     28605 3.6347e+12
## 4:          Iceland 2007    301931    Europe  81.757     36181 1.0924e+10
## 5:      Switzerland 2007   7554661    Europe  81.701     37506 2.8335e+11
## 6: Hong Kong, China 2002   6762476      Asia  81.495     30209 2.0429e+11
mutate(ldf,gdp=pop*gdpPercap)
## # A tibble: 1,704 x 7
##        country  year      pop continent lifeExp gdpPercap        gdp
##         <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>      <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801    779.45 6.5671e+09
##  2 Afghanistan  1957  9240934      Asia  30.332    820.85 7.5854e+09
##  3 Afghanistan  1962 10267083      Asia  31.997    853.10 8.7589e+09
##  4 Afghanistan  1967 11537966      Asia  34.020    836.20 9.6480e+09
##  5 Afghanistan  1972 13079460      Asia  36.088    739.98 9.6786e+09
##  6 Afghanistan  1977 14880372      Asia  38.438    786.11 1.1698e+10
##  7 Afghanistan  1982 12881816      Asia  39.854    978.01 1.2599e+10
##  8 Afghanistan  1987 13867957      Asia  40.822    852.40 1.1821e+10
##  9 Afghanistan  1992 16317921      Asia  41.674    649.34 1.0596e+10
## 10 Afghanistan  1997 22227415      Asia  41.763    635.34 1.4122e+10
## # ... with 1,694 more rows
countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE)
countries = countries[-1,]
head(countries)
##       Country Interational.Direct.Dial..IDD..Code ISO.3166 Car.Code
## 2 Afghanistan                                  93       AF         
## 3     Albania                                 355       AL         
## 4     Algeria                                 213       DZ         
## 5     Andorra                              33 628       AD         
## 6      Angola                                 244       AO         
## 7    Anguilla                               1 809       AI         
##         Capital              Currency ISO.Currency Digraph  ICAO   Language
## 2         Kabul               Afghani          AFA      AF   YA-     Pashto
## 3        Tirana                   Lek          ALL      AL   ZA-   Albanian
## 4       Algiers                 Dinar          DZD      DZ   7T-     Arabic
## 5 And. La Vella          French Franc          FRF      AD   C3-    Catalan
## 6        Luanda           New Irwanza          AON      AO   D2- Portuguese
## 7               East Caribbean Dollar          XCD      AI VP-LA           
##   ISO.639
## 2      PS
## 3      SQ
## 4      AR
## 5      CA
## 6      PT
## 7
df2 = countries[,c("Country","Capital")]
names(df2)=c("countries","capital")
df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country))
## Error in aggregate.data.frame(as.data.frame(x), ...): no rows to aggregate
names(df1)=c("countries","lifeExp")
## Error in names(df1) = c("countries", "lifeExp"): objet 'df1' introuvable
dim(df1)
## Error in eval(expr, envir, enclos): objet 'df1' introuvable
dim(df2)
## [1] 263   2
tail(merge(df1,df2))
## Error in merge(df1, df2): objet 'df1' introuvable
tail(merge(df1,df2,all.x=TRUE))
## Error in merge(df1, df2, all.x = TRUE): objet 'df1' introuvable
tail(merge(df1,df2,all.y=TRUE))
## Error in merge(df1, df2, all.y = TRUE): objet 'df1' introuvable
ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) 
names(ldf1)=c("countries","lifeExp")
ldf2 <- tbl_df(df2)
inner_join(ldf1,ldf2)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing to
## character vector
## # A tibble: 127 x 3
##      countries lifeExp      capital
##          <chr>   <dbl>       <fctr>
##  1 Afghanistan  43.828        Kabul
##  2     Albania  76.423       Tirana
##  3     Algeria  72.301      Algiers
##  4      Angola  42.731       Luanda
##  5   Argentina  75.320 Buenos Aires
##  6   Australia  81.235     Canberra
##  7     Austria  79.829       Vienna
##  8     Bahrain  75.635       Manama
##  9  Bangladesh  64.062        Dacca
## 10     Belgium  79.441     Brussels
## # ... with 117 more rows
left_join(ldf1,ldf2)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing to
## character vector
## # A tibble: 142 x 3
##      countries lifeExp      capital
##          <chr>   <dbl>       <fctr>
##  1 Afghanistan  43.828        Kabul
##  2     Albania  76.423       Tirana
##  3     Algeria  72.301      Algiers
##  4      Angola  42.731       Luanda
##  5   Argentina  75.320 Buenos Aires
##  6   Australia  81.235     Canberra
##  7     Austria  79.829       Vienna
##  8     Bahrain  75.635       Manama
##  9  Bangladesh  64.062        Dacca
## 10     Belgium  79.441     Brussels
## # ... with 132 more rows
ldf1 %>% inner_join(ldf2)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing to
## character vector
## # A tibble: 127 x 3
##      countries lifeExp      capital
##          <chr>   <dbl>       <fctr>
##  1 Afghanistan  43.828        Kabul
##  2     Albania  76.423       Tirana
##  3     Algeria  72.301      Algiers
##  4      Angola  42.731       Luanda
##  5   Argentina  75.320 Buenos Aires
##  6   Australia  81.235     Canberra
##  7     Austria  79.829       Vienna
##  8     Bahrain  75.635       Manama
##  9  Bangladesh  64.062        Dacca
## 10     Belgium  79.441     Brussels
## # ... with 117 more rows
dt1 = dt[,max(lifeExp),by="country"]
setnames(dt1,c("countries","lifeExp"))
dt2 <- data.table(df2)
setkey(dt1,"countries")
setkey(dt2,"countries")
dt1[dt2,nomatch=0]
##        countries lifeExp      capital
##   1: Afghanistan  43.828        Kabul
##   2:     Albania  76.423       Tirana
##   3:     Algeria  72.301      Algiers
##   4:      Angola  42.731       Luanda
##   5:   Argentina  75.320 Buenos Aires
##  ---                                 
## 123:     Uruguay  76.384   Montevideo
## 124:   Venezuela  73.747      Caracas
## 125:     Vietnam  74.249        Hanoi
## 126:      Zambia  51.821       Lusaka
## 127:    Zimbabwe  62.351       Harare
dt1[dt2]
##        countries lifeExp       capital
##   1: Afghanistan  43.828         Kabul
##   2:     Albania  76.423        Tirana
##   3:     Algeria  72.301       Algiers
##   4:     Andorra      NA And. La Vella
##   5:      Angola  42.731        Luanda
##  ---                                  
## 259:       Yemen      NA        Sana'a
## 260:  Yugoslavia      NA      Belgrade
## 261:       Zaire      NA      Kinshasa
## 262:      Zambia  51.821        Lusaka
## 263:    Zimbabwe  62.351        Harare
dt2[dt1]
##               countries      capital lifeExp
##   1:        Afghanistan        Kabul  43.828
##   2:            Albania       Tirana  76.423
##   3:            Algeria      Algiers  72.301
##   4:             Angola       Luanda  42.731
##   5:          Argentina Buenos Aires  75.320
##  ---                                        
## 138:            Vietnam        Hanoi  74.249
## 139: West Bank and Gaza           NA  73.422
## 140:        Yemen, Rep.           NA  62.698
## 141:             Zambia       Lusaka  51.821
## 142:           Zimbabwe       Harare  62.351
library(stringr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(lubridate,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(plyr,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
library(data.table,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(LaF,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
ssn <- laf_open_csv("temp",sep="\t", 
                    column_types=c("character","integer","numeric","character","numeric","numeric"),
                    column_names=names(df)[1:6])
object.size(ssn)
## 2872 bytes
str(ssn)
## Formal class 'laf' [package "LaF"] with 8 slots
##   ..@ file_id      : int 0
##   ..@ filename     : chr "temp"
##   ..@ file_type    : chr "csv"
##   ..@ column_names : chr [1:6] "Y" "X1" "X2" "NA." ...
##   ..@ column_types : int [1:6] 3 1 0 3 0 0
##   ..@ column_widths: int(0) 
##   ..@ levels       : list()
##   ..@ options      :List of 4
##   .. ..$ sep : chr "\t"
##   .. ..$ dec : chr "."
##   .. ..$ skip: int 0
##   .. ..$ trim: logi FALSE
nrow(ssn)
## NULL
go_through <- which(ssn[1:(nrow(ssn)-1),'country'] != ssn[2:nrow(ssn),'country'])+1
## Error in 1:(nrow(ssn) - 1): l'argument est de longueur nulle
if(go_through[length(go_through)] != nrow(ssn)) go_through <- c(go_through, nrow(ssn))
## Error in eval(expr, envir, enclos): objet 'go_through' introuvable
go_through <- cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))]-1, go_through[length(go_through)]))
## Error in cbind(go_through[-length(go_through)], c(go_through[-c(1, length(go_through))] - : objet 'go_through' introuvable
worst_life_exp <- function(s){
  data <- ssn[go_through[s,1]:go_through[s,2],c("country","year","lifeExp")]
  data[which.min(data$lifeExp),]
}
worst_life_exp(10)
## Error in ssn[go_through[s, 1]:go_through[s, 2], c("country", "year", "lifeExp")]: objet 'go_through' introuvable

Graphs with ggplot2

library(gamair,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
data(chicago)
head(chicago)
##   death pm10median pm25median o3median so2median    time tmpd
## 1   130   -7.43354         NA  -19.592   1.92804 -2556.5 31.5
## 2   150         NA         NA  -19.039  -0.98556 -2555.5 33.0
## 3   101   -0.82653         NA  -20.217  -1.89142 -2554.5 33.0
## 4   135    5.56646         NA  -19.676   6.13934 -2553.5 29.0
## 5   126         NA         NA  -19.217   2.27846 -2552.5 32.0
## 6   130    6.56646         NA  -17.634   9.85858 -2551.5 40.0
base=data.frame(death=chicago$death,
                temp_F=chicago$tmpd,
                o3=chicago$o3median,
                date=seq(as.Date("1987-01-01"),
                         as.Date("2000-12-31"),by=1))
base$temp_C <- (base$temp_F-32)/1.8
base$year <- substring(base$date,1,4)
date2season <- function(date){
  m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m"))
  d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d"))
  s <- NA
  if(m %in% c(1,2) | ((m==12)&(d>=21)) |  ((m==3)&(d<21))) s <- "winter"
  if(m %in% c(4,5) | ((m==3)&(d>=21)) |  ((m==6)&(d<21))) s <- "spring"
  if(m %in% c(7,8) | ((m==6)&(d>=21)) |  ((m==9)&(d<21))) s <- "summer"
  if(m %in% c(10,11) | ((m==9)&(d>=21)) |  ((m==12)&(d<21))) s <- "autumn"
  return(s)}
base$season <- sapply(base$date,date2season)
head(base)
##   death temp_F      o3       date   temp_C year season
## 1   130   31.5 -19.592 1987-01-01 -0.27778 1987 winter
## 2   150   33.0 -19.039 1987-01-02  0.55556 1987 winter
## 3   101   33.0 -20.217 1987-01-03  0.55556 1987 winter
## 4   135   29.0 -19.676 1987-01-04 -1.66667 1987 winter
## 5   126   32.0 -19.217 1987-01-05  0.00000 1987 winter
## 6   130   40.0 -17.634 1987-01-06  4.44444 1987 winter
plot(base[,c("date", "temp_C")])

plot of chunk unnamed-chunk-184

library(ggplot2,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
g <- ggplot(base, aes(date, temp_C)) +
  geom_point()
g

plot of chunk unnamed-chunk-185

ggsave(file="figure.pdf", scale=3)
## Saving 21 x 21 in image
g <- g+labs(title='Temperature')
g <- g+ggtitle('Temperature')
g

plot of chunk unnamed-chunk-186

g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red"))

plot of chunk unnamed-chunk-187

g<-g+labs(
  x="Date", 
  y=expression(paste("Temperature ( ", degree ~ C, " )")), 
  title="Temperature in Chicago")
g

plot of chunk unnamed-chunk-188

g+ theme(
  axis.title.x = element_text(color="blue"),
  axis.title.y = element_text(color="blue")   
)

plot of chunk unnamed-chunk-189

g + ylim(c(-20,60))
## Warning: Removed 10 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-190

g <- g + xlim(as.Date(c("1995-01-01","1999-01-01")))
g
## Warning: Removed 3652 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-190

base2=data.frame(
  today=base$temp_C[1:5113],
  tomorrow=base$temp_C[2:5114])
ggplot(base2, aes(today, tomorrow))+     geom_point()+ coord_equal()

plot of chunk unnamed-chunk-191

g <- ggplot(base, aes(date, temp_C, color=factor(season)))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))+
  geom_point()
g+theme(legend.title=element_blank())+
  guides(colour = guide_legend(override.aes = list(size=4)))
## Warning: Removed 3652 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-192

g<-ggplot(base, aes(date, temp_C))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))
g+geom_line(color="grey")
## Warning: Removed 3652 rows containing missing values (geom_path).

plot of chunk unnamed-chunk-193

g+geom_line(aes(color="temperature"))+
  scale_colour_manual(name='', values=c('temperature'='grey'))
## Warning: Removed 3652 rows containing missing values (geom_path).

plot of chunk unnamed-chunk-194

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + 
  scale_color_manual(values=
                       c("dodgerblue4", "darkolivegreen4",
                         "darkorchid3", "goldenrod1"))

plot of chunk unnamed-chunk-195

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")

plot of chunk unnamed-chunk-196

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient(low="blue", high="red")

plot of chunk unnamed-chunk-197

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient2(low="blue", mid="white", high="red")

plot of chunk unnamed-chunk-198

ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+
  theme(panel.background = element_rect(fill = 'light yellow'))

plot of chunk unnamed-chunk-199

ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+
  facet_wrap(~year, ncol=2, scales="free")

plot of chunk unnamed-chunk-200

library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+
  theme_excel()+scale_colour_excel()+
  theme(legend.title=element_blank())

plot of chunk unnamed-chunk-201

library(ggthemes)
ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point()+labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like 'The Economist'")+
  theme_economist()+
  theme(legend.title=element_blank())

plot of chunk unnamed-chunk-202

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rCharts'
## Error in eval(expr, envir, enclos): impossible de trouver la fonction "nPlot"
## Error in eval(expr, envir, enclos): objet 'n1' introuvable

OpenStreetMap

library(OpenStreetMap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map <- openmap(c(lat= 51.516, lon= -.141), c(lat= 51.511, lon= -.133))
map <- openproj(map, projection = "+init=epsg:27700") 
plot(map)

plot of chunk unnamed-chunk-203

loc="http://freakonometrics.free.fr/cholera.zip"
download.file(loc,destfile="cholera.zip")
unzip("cholera.zip", exdir="./")
library(maptools,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## Checking rgeos availability: TRUE
deaths<-readShapePoints("Cholera_Deaths")
## Warning: use rgdal::readOGR or sf::st_read
head(deaths@coords)
##   coords.x1 coords.x2
## 0    529309    181031
## 1    529312    181025
## 2    529314    181020
## 3    529317    181014
## 4    529321    181008
## 5    529337    181006
plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )

plot of chunk unnamed-chunk-205

X <- deaths@coords
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range
library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20)
plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )
image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs)
contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE)

plot of chunk unnamed-chunk-207

Google Map

library(ggmap,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
get_london <- get_map(c(-.137,51.513), zoom=17)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513,-0.137&zoom=17&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
london <- ggmap(get_london)

OpenStreetMap

london

plot of chunk unnamed-chunk-209

df_deaths <- data.frame(X)
library(sp,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## rgdal: version: 1.2-15, (SVN revision 691)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.1.2, released 2016/10/24
##  Path to GDAL shared files: /usr/share/gdal/2.1
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-5
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))

OpenStreetMap

london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red")
## Warning: Removed 9 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-211

london <- london + geom_point(aes(x=coords.x1, y=coords.x2), 
data=data.frame(df_deaths@coords),col="red") +
  geom_density2d(data = data.frame(df_deaths@coords), 
aes(x = coords.x1, y=coords.x2), size = 0.3) + 
  stat_density2d(data = data.frame(df_deaths@coords), 
 aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..),
 size = 0.01, bins = 16, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
 guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE)
london
## Warning: Removed 9 rows containing non-finite values (stat_density2d).

## Warning: Removed 9 rows containing non-finite values (stat_density2d).
## Warning: Removed 9 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-213

require(leaflet)
deaths <- readShapePoints("Cholera_Deaths")
## Warning: use rgdal::readOGR or sf::st_read
df_deaths <- data.frame(deaths@coords)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
df=data.frame(df_deaths@coords)
lng=df$coords.x1
lat=df$coords.x2
m = leaflet()%>% addTiles() 
m %>% fitBounds(-.141,  51.511, -.133, 51.516)
rd=.5
op=.8
clr="blue"
m = leaflet() %>% addTiles()
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
## Error in loadNamespace(name): there is no package called 'webshot'
X=cbind(lng,lat)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range
x=kde2d$x1
y=kde2d$x2
z=kde2d$fhat
CL=contourLines(x , y , z)
m = leaflet() %>% addTiles() 
m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
## Error in loadNamespace(name): there is no package called 'webshot'
m %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
## Error in loadNamespace(name): there is no package called 'webshot'
m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)
## Error in loadNamespace(name): there is no package called 'webshot'
m = leaflet() %>% addTiles() 
m %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
  addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
  addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")
## Error in loadNamespace(name): there is no package called 'webshot'

Map with googleVis

library(googleVis)
df=data.frame(state=c("AU-WA","AU-VIC","AU-NT", "AU-NSW", "AU-SA","AU-QLD","AU-TAS","NZ"),
R_users=c(323,425,154,486,201,195,87,123))
Geo <- gvisGeoChart(df,
locationvar="state",colorvar=c("R_users"),
options=list(region="AU",
dataMode="regions",resolution="provinces",
width="80%", height="80%"))

Map with googleVis

library(XML,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
year <- 2012
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
tabs[1]
## $`NULL`
##     #                    Name          Date Wind Pres Cat  
## 1   1  Tropical Storm ALBERTO     19-23 MAY   50  995   -  
## 2   2    Tropical Storm BERYL 25 MAY- 2 JUN   60  992   -  
## 3   3       Hurricane-1 CHRIS     17-24 JUN   75  974   1  
## 4   4    Tropical Storm DEBBY     23-27 JUN   55  990   -  
## 5   5     Hurricane-2 ERNESTO      1-10 AUG   85  973   2  
## 6   6 Tropical Storm FLORENCE      3- 8 AUG   50 1002   -  
## 7   7   Tropical Storm HELENE      9-19 AUG   40 1004   -  
## 8   8      Hurricane-2 GORDON     15-21 AUG   95  965   2  
## 9   9       Hurricane-1 ISAAC 20 AUG- 1 SEP   70  965   1  
## 10 10    Tropical Storm JOYCE     21-24 AUG   35 1006   -  
## 11 11        Hurricane-2 KIRK 28 AUG- 3 SEP   90  970   2  
## 12 12      Hurricane-1 LESLIE 28 AUG-12 SEP   70  968   1  
## 13 13     Hurricane-3 MICHAEL      2-12 SEP  100  964   3  
## 14 14      Hurricane-1 NADINE 10 SEP- 4 OCT   80  978   1  
## 15 15    Tropical Storm OSCAR      2- 5 OCT   45  994   -  
## 16 16    Tropical Storm PATTY     10-13 OCT   40 1005   -  
## 17 17      Hurricane-1 RAFAEL     12-26 OCT   80  966   1  
## 18 18       Hurricane-3 SANDY     21-31 OCT  100  940   3  
## 19 19     Tropical Storm TONY     21-26 OCT   45 1000   -  
storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" "))
storms
##  [1] "Tropical"    "Storm"       "ALBERTO"     "Tropical"    "Storm"      
##  [6] "BERYL"       "Hurricane-1" "CHRIS"       "Tropical"    "Storm"      
## [11] "DEBBY"       "Hurricane-2" "ERNESTO"     "Tropical"    "Storm"      
## [16] "FLORENCE"    "Tropical"    "Storm"       "HELENE"      "Hurricane-2"
## [21] "GORDON"      "Hurricane-1" "ISAAC"       "Tropical"    "Storm"      
## [26] "JOYCE"       "Hurricane-2" "KIRK"        "Hurricane-1" "LESLIE"     
## [31] "Hurricane-3" "MICHAEL"     "Hurricane-1" "NADINE"      "Tropical"   
## [36] "Storm"       "OSCAR"       "Tropical"    "Storm"       "PATTY"      
## [41] "Hurricane-1" "RAFAEL"      "Hurricane-3" "SANDY"       "Tropical"   
## [46] "Storm"       "TONY"
index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx")
nstorms  <- storms[!index]
nstorms
##  [1] "ALBERTO"  "BERYL"    "CHRIS"    "DEBBY"    "ERNESTO"  "FLORENCE"
##  [7] "HELENE"   "GORDON"   "ISAAC"    "JOYCE"    "KIRK"     "LESLIE"  
## [13] "MICHAEL"  "NADINE"   "OSCAR"    "PATTY"    "RAFAEL"   "SANDY"   
## [19] "TONY"
TRACK=NULL
for(i in length(nstorms):1){
  if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN"
  if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA"
  if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN"
  if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL"
  if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA"
  if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN"
  if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE"
  if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD"
  if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH"
  if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"}
for(i in length(nstorms):1){
  loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
  track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))

  names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
  track$LAT=as.numeric(as.character(track$LAT))
  track$LON=as.numeric(as.character(track$LON))
  track$WIND=as.numeric(as.character(track$WIND))
  track$PR=as.numeric(as.character(track$PR))
  track$year=year
  track$name=nstorms[i]
  TRACK=rbind(TRACK,track)}

tail(TRACK,10)
##     ADV  LAT   LON         TIME WIND   PR           STAT year    name
## 645  11 30.4 -79.1  05/21/12Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 646  12 30.5 -78.3  05/21/18Z     35 1006 TROPICAL STORM 2012 ALBERTO
## 647  13 30.7 -77.1  05/22/00Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 648  14 31.5 -76.1  05/22/06Z     35 1007 TROPICAL STORM 2012 ALBERTO
## 649  15 32.5 -74.7  05/22/12Z     30 1008            LOW 2012 ALBERTO
## 650  16 33.4 -73.4  05/22/18Z     30 1008            LOW 2012 ALBERTO
## 651  17 34.1 -71.9  05/23/00Z     25 1010            LOW 2012 ALBERTO
## 652  18 34.9 -70.1  05/23/06Z     25 1011            LOW 2012 ALBERTO
## 653  19 35.5 -67.9  05/23/12Z     25 1012            LOW 2012 ALBERTO
## 654  20 35.9 -66.0  05/23/18Z     25 1012            LOW 2012 ALBERTO
extract.track <- function(year=2012,p=TRUE){
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
storms <- unlist(strsplit(as.character(
tabs[[1]]$Name),split=" "))
index <- storms%in%c("Tropical","Storm",paste("Hurricane-",1:6,sep=""),"Depression","Subtropical","Extratropical","Low",paste("Storm-",1:6,sep=""),
"Xxx")
nstorms  <- storms[!index]
TRACK=NULL
for(i in length(nstorms):1){
loc=paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
track=read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
track$LAT=as.numeric(as.character(track$LAT))
track$LON=as.numeric(as.character(track$LON))
track$WIND=as.numeric(as.character(track$WIND))
track$PR=as.numeric(as.character(track$PR))
track$year=year
track$name=nstorms[i]
TRACK=rbind(TRACK,track)
if(p==TRUE){  cat(year,i,nstorms[i],nrow(track),"\n")}}
return(TRACK)}
TOTTRACK=NULL
#  for(y in 2012:1851){
#    TOTTRACK=rbind(TOTTRACK,extract.track(y))
#  }

loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata"
download.file(loc,destfile="TRACK-ATLANTIC.Rdata")
load("TRACK-ATLANTIC.Rdata")
dim(TOTTRACK)
## [1] 26712     9
library(grDevices,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
library(maps,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
map("world", col="light yellow", fill=TRUE)

plot of chunk unnamed-chunk-234

map("world", col="light yellow", fill=TRUE)
for(n in unique(TOTTRACK$name)){
  lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))}

plot of chunk unnamed-chunk-235

Shapefiles

loc="http://freakonometrics.free.fr/Italy.zip"
download.file(loc,destfile="Italy.zip")
unzip("Italy.zip", exdir="./")
library(rgdal,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE)
names(ita1)
## [1] "ID_0"      "ISO"       "NAME_0"    "ID_1"      "NAME_1"    "NL_NAME_1"
## [7] "VARNAME_1" "TYPE_1"    "ENGTYPE_1"
plot(ita1,col="light green")

plot of chunk unnamed-chunk-237

ita1@data[,"NAME_1"][1:6]
## [1] Abruzzo        Apulia         Basilicata     Calabria       Campania      
## [6] Emilia-Romagna
## 20 Levels: Abruzzo Apulia Basilicata Calabria Campania ... Veneto
pos<-which(ita1@data[,"NAME_1"]=="Sicily")
Poly_Sicily<-ita1[pos,]
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)

plot of chunk unnamed-chunk-239

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
abline(v=5:20,col="light blue")
abline(h=35:50,col="light blue")
axis(1)
axis(2)

plot of chunk unnamed-chunk-240

pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily"))
ita_south <- ita1[pos,]
ita_north <- ita1[-pos,]
plot(ita1)
plot(ita_south,col="red",add=TRUE)
plot(ita_north,col="blue",add=TRUE)

plot of chunk unnamed-chunk-241

library(xlsx,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
loc="http://freakonometrics.free.fr/Italy_codes.xlsx"
download.file(loc,destfile="Italy_codes.xlsx")
data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE)
names(data_codes)[1]="NAME_1"
ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE)
pos<-which(ita2@data[,"GROUP"] == "SOUTH")
## Error in which(ita2@data[, "GROUP"] == "SOUTH"): tentative d'obtenir le slot "data" d'un objet (classe "data.frame") qui n'est pas un objet S4
library(rgeos,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
## rgeos version: 0.3-26, (SVN revision 560)
##  GEOS runtime version: 3.5.1-CAPI-1.9.1 r4246 
##  Linking to sp version: 1.2-5 
##  Polygon checking: TRUE
ita_s<-gUnionCascaded(ita_south)
ita_n<-gUnionCascaded(ita_north)
plot(ita1)
plot(ita_s,col="red",add=TRUE)
plot(ita_n,col="blue",add=TRUE)

plot of chunk unnamed-chunk-243

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
gCentroid(Poly_Sicily,byid=TRUE)
## class       : SpatialPoints 
## features    : 1 
## extent      : 14.14668, 14.14668, 37.58842, 37.58842  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
points(gCentroid(Poly_Sicily,byid=TRUE),pch=19,col="red")

plot of chunk unnamed-chunk-244

G <- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)

plot of chunk unnamed-chunk-245

c1 <-G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords
##          x        y
## 1 11.06947 44.03287
## 1 14.20034 41.74879
plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)

plot of chunk unnamed-chunk-247

Linear Model

\[ y_i=\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta}+\varepsilon_i=\beta_0 + [\beta_1 x_{1,i}+\cdots+ \beta_k x_{k,i}]+\varepsilon_i \]

See linear model, where \(\varepsilon_i\sim\mathcal{N}(0,\sigma^2)\) i.id.

\[ (Y\vert \boldsymbol{X}=\boldsymbol{x})\sim \mathcal{N}(\boldsymbol{x}^{\text{T}}\boldsymbol{\beta}, \sigma^2) \] i.e. \[ \mathbb{E}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\boldsymbol{x}^{\text{T}}\boldsymbol{\beta} \] and homoscedastic model, \[ \text{Var}[Y\vert\boldsymbol{X}=\boldsymbol{x}]=\sigma^2 \]

Least squares (and maximum likelihood) estimator \[ \widehat{\boldsymbol{\beta}}=\text{argmin} \left\lbrace \sum_{i=1}^n (y_i-\boldsymbol{x}_i^{\text{T}}\boldsymbol{\beta})^2 \right\rbrace =(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}\boldsymbol{y}\]

Linear Model

plot(cars)

plot of chunk unnamed-chunk-248

model <- lm(dist ~ speed, data=cars)

Linear Model

summary(model)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

Linear Model

X=cbind(1,cars$speed)
Y=cars$dist
solve(crossprod(X,X),crossprod(X,Y))
##            [,1]
## [1,] -17.579095
## [2,]   3.932409
model$coefficients
## (Intercept)       speed 
##  -17.579095    3.932409

Linear Model

summary(model)$sigma^2*solve(crossprod(X,X))
##           [,1]       [,2]
## [1,] 45.676514 -2.6588234
## [2,] -2.658823  0.1726509
vcov(model)
##             (Intercept)      speed
## (Intercept)   45.676514 -2.6588234
## speed         -2.658823  0.1726509
n=nrow(cars)

Linear Model

confint(model)
##                  2.5 %    97.5 %
## (Intercept) -31.167850 -3.990340
## speed         3.096964  4.767853
model$coefficients[1]+qt(c(.025,.975),n-2)* summary(model)$coefficients[1,2]
## [1] -31.16785  -3.99034

Linear Model

coefficient of determination \(\displaystyle{R^2 = \frac{\text{explained variance}}{\text{total variance}}}\)

summary(model)$r.squared
## [1] 0.6510794
1-deviance(model)/sum((Y-mean(Y))^2)
## [1] 0.6510794

\[ \text{Var}[Y]= \text{Var}[\mathbb{E}[Y\vert X]]+ \mathbb{E}[\text{Var}[Y\vert X]] \]

see analysis of variance

Linear Model

\[ \overline{R}^2 = 1-[1-R^2]\cdot \frac{n-1}{n-(k-1)-1} \]

summary(model)$adj.r.squared
## [1] 0.6438102

Linear Model

anova(lm(dist~speed,data=cars),lm(dist~1,data=cars))
## Analysis of Variance Table
## 
## Model 1: dist ~ speed
## Model 2: dist ~ 1
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)    
## 1     48 11354                                 
## 2     49 32539 -1    -21186 89.567 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear Model

plot(cars)
abline(model,col="red")

plot of chunk unnamed-chunk-257

Linear Model

plot(cars)
abline(model,col="red")
x=seq(2,26)
y=predict(model, newdata=data.frame(speed=x))
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-258

Linear Model

y=predict(model, newdata=data.frame(speed=x), interval = "confidence")
head(y)
##         fit        lwr       upr
## 1 -9.714277 -21.733068  2.304513
## 2 -5.781869 -17.026591  5.462853
## 3 -1.849460 -12.329543  8.630624
## 4  2.082949  -7.644150 11.810048
## 5  6.015358  -2.973341 15.004056
## 6  9.947766   1.678977 18.216556

\[ \text{Var}[\widehat{Y}(\boldsymbol{x})]=\text{Var}[\boldsymbol{x}^{\text{T}}\widehat{\boldsymbol{\beta}}]=\boldsymbol{x}^{\text{T}}\text{Var}[\widehat{\boldsymbol{\beta}}]\boldsymbol{x} =\widehat{\sigma}^2 \boldsymbol{x}^{\text{T}}[\boldsymbol{X}^{\text{T}}\boldsymbol{X}]^{-1}\boldsymbol{x} \]

Linear Model

plot(cars)
polygon(c(x,rev(x)),c(y[,2],rev(y[,3])),col=rgb(0,0,1,.4),border=NA)
lines(x,y[,1],lwd=2,col="blue")

plot of chunk unnamed-chunk-260

Linear Model and Bagging

Method 1

Y=matrix(NA,1000,length(x))
for(b in 1:1000){
  idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
  modelb <- lm(dist ~ speed, data=cars[idx,])
  Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}

See bootstrap, based on pseudo-sample \[ \lbrace(\boldsymbol{x}_{i_1},y_{i_1}), \cdots,(\boldsymbol{x}_{i_n},y_{i_n}) \rbrace \] where \((i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace\).

Linear Model and Bagging

Method 1

plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))

plot of chunk unnamed-chunk-262

Linear Model and Bagging

plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)

plot of chunk unnamed-chunk-263

Linear Model and Bagging

Method 2

pred_dist=predict(model)
epsilon  =residuals(model)
Y=matrix(NA,1000,length(x))
for(b in 1:1000){
  idx <- sample(1:nrow(cars),size=nrow(cars),replace=TRUE)
  carsb=data.frame(speed=cars$speed,
                   dist=pred_dist+epsilon[idx])
  modelb <- lm(dist ~ speed, data=carsb)
  Y[b,] <- predict(modelb, newdata=data.frame(speed=x))
}

See bootstrap, based on pseudo-sample \[ \lbrace(\boldsymbol{x}_{1},\widehat{y}_{1}+\widehat{\varepsilon}_{i_1}), \cdots,(\boldsymbol{x}_{n},\widehat{y}_{n}+\widehat{\varepsilon}_{i_n}) \rbrace \] where \((i_1,\cdots,i_n)\in\lbrace 1,2,\cdots,n\rbrace\).

Linear Model and Bagging

Method 2

plot(cars)
for(b in 1:100) lines(x,Y[b,],col=rgb(0,0,1,.4))

plot of chunk unnamed-chunk-265

Linear Model and Bagging

plot(cars)
lines(x,apply(Y,2,mean),col="blue",lwd=2)
lines(x,apply(Y,2,function(x) quantile(x,.025)),col="red",lty=2)
lines(x,apply(Y,2,function(x) quantile(x,.975)),col="red",lty=2)

plot of chunk unnamed-chunk-266

Errors

plot(cars)
abline(model,col="red")
segments(cars$speed,cars$dist,cars$speed,predict(model),col="blue")

plot of chunk unnamed-chunk-267

Least Squares?

cars2=cars; cars2[,2]=cars[,2]/10
plot(cars2,ylab="dist/10")
acp=princomp(cars2)
b=acp$loadings[2,1]/acp$loadings[1,1]
a=acp$center[2]-b*acp$center[1]
abline(a,b,col="red")

plot of chunk unnamed-chunk-268

Least Squares?

plot(cars2,ylab="dist/10",xlim=c(0,30),ylim=c(-1,12))
abline(a,b,col="red")
t <- acp$loadings[,1] %*% (t(cars2)-acp$center)
X1 <- acp$center[1] +t * acp$loadings[1,1]
X2 <- acp$center[2] +t * acp$loadings[2,1]
segments(cars2$speed,cars2$dist,X1,X2,col="blue")

plot of chunk unnamed-chunk-269

Least Absolute Deviation

  f <- function(a) sum(abs(cars$dist-(a[1]+a[2]*cars$speed))) 
  opt <- optim( c(0,0), f )$par
  plot(cars)
  abline(model, col='red', lty=2)
  abline(opt[1], opt[2],col="blue")

plot of chunk unnamed-chunk-270

Quantile Regression

library(quantreg,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
abline(model, col="blue")
abline(rq(dist ~ speed,data=cars, tau=.5),col="red",lty=2)
abline(rq(dist ~ speed,data=cars, tau=.9),col="purple",lty=2)

plot of chunk unnamed-chunk-271

Diagnostic and Errors

plot(model,which=1)

plot of chunk unnamed-chunk-272

Scatterplot \((\widehat{Y}_i,\widehat{\varepsilon}_i)\)

Diagnostic and Errors

plot(model,which=2)

plot of chunk unnamed-chunk-273

Scatterplot \(\displaystyle{\left(\widehat{\varepsilon}_{i:n},\Phi^{-1} \left(\frac{i}{n}\right)\right)}\)

Diagnostic and Errors

plot(model,which=3)

plot of chunk unnamed-chunk-274

Scatterplot \((\widehat{Y}_i,\sqrt{\vert\widehat{\varepsilon}_i\vert})\)

Diagnostic and Errors

plot(model,which=4)

plot of chunk unnamed-chunk-275

Cook distance, \(\displaystyle{C_i=\frac{\widehat{\varepsilon}_i^2}{p\cdot\text{MSE}}\cdot\left(\frac{H_{i,i}}{(1-H_{i,i})^2}\right)}\) with \(\boldsymbol{H}=\boldsymbol{X}(\boldsymbol{X}^{\text{T}}\boldsymbol{X})^{-1}\boldsymbol{X}^{\text{T}}=[H_{i,i}]\)

C=cooks.distance(model)

Diagnostic and Errors

\(H_{i,i}\) are leverages, and define Studentized residuals as \[ \widehat{r}_i=\frac{\widehat{\varepsilon}_i}{ \widehat{\sigma} \sqrt{1-H_{i,i}}} \]

rstudent(model)
##           1           2           3           4           5           6 
##  0.26345000  0.81607841 -0.39781154  0.81035256  0.14070334 -0.51716052 
##           7           8           9          10          11          12 
## -0.24624632  0.27983408  0.81090388 -0.57004675  0.15209173 -1.03037790 
##          13          14          15          16          17          18 
## -0.62992492 -0.36670509 -0.10509307 -0.49251696  0.02981715  0.02981715 
##          19          20          21          22          23          24 
##  0.81716230 -0.75078438 -0.09592079  1.49972043  3.02282876 -1.42097720 
##          25          26          27          28          29          30 
## -1.01227617  0.82440767 -0.87411459 -0.34752195 -1.13903469 -0.60553485 
##          31          32          33          34          35          36 
##  0.04737114 -0.73422040  0.18222855  1.52145888  2.09848208 -1.40929208 
##          37          38          39          40          41          42 
## -0.73145948  0.71330941 -1.98238877 -0.86293622 -0.59637646 -0.33247538 
##          43          44          45          46          47          48 
##  0.19208548 -0.19393283 -1.27493857 -0.45557342  1.02773460  1.09701943 
##          49          50 
##  3.18499284  0.28774529

Diagnostic and Errors

plot(model,which=5)

plot of chunk unnamed-chunk-278

Scatterplot \((H_{i,i},\widehat{r}_i)\)

Diagnostic and Errors

plot(model,which=6)

plot of chunk unnamed-chunk-279

Scatterplot \((H_{i,i},C_i)\)

Diagnostic and Errors

hist(residuals(model),probability=TRUE, col="light green")
lines(density(residuals(model)),lwd=2,col="red")
boxplot(residuals(model),horizontal=TRUE,add=TRUE,at=.024, 
  pars=list(boxwex=.004),col=rgb(0,0,1,.25))

plot of chunk unnamed-chunk-280

Diagnostic and Errors

sigma=summary(model)$sigma
plot(ecdf(residuals(model)/sigma))
lines(seq(-3,3,by=.1),pnorm(seq(-3,3,by=.1)),lwd=2,col="red")

plot of chunk unnamed-chunk-281

Diagnostic and Errors

Kolmogorov-Smirnov $ d=\sup_{x\in\mathbb{R}}\lbrace \vert\Phi(x)-\widehat{F}_n(x)\vert \rbrace $

ks.test(residuals(model)/sigma,"pnorm")
## Warning in ks.test(residuals(model)/sigma, "pnorm"): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model)/sigma
## D = 0.13067, p-value = 0.3605
## alternative hypothesis: two-sided

Diagnostic and Errors

Anderson-Darling, Cramer-von Mises,

library(nortest,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
ad.test(residuals(model))
## 
##  Anderson-Darling normality test
## 
## data:  residuals(model)
## A = 0.79406, p-value = 0.0369
cvm.test(residuals(model))
## 
##  Cramer-von Mises normality test
## 
## data:  residuals(model)
## W = 0.12573, p-value = 0.0483

Model Choice

AIC \(AIC = 2k - 2\log(\mathcal{L}) = 2k + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]\)

BIC \(BIC = { k \log(n) -2 \log(\mathcal{L}) } = k \log(n) + n\left[\log\left(2\pi \frac{1}{n}\sum_{i=1}^n \widehat{\varepsilon}_i^2 \right) + 1\right]\)

AIC(model)
## [1] 419.1569
AIC(model,k=log(n))
## [1] 424.8929

Testing Linear Assumptions

library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_brk <- lm(dist ~ bs(speed,degree=1,knots=(c(4,15,25))), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
## Warning in predict.lm(model_brk, newdata = data.frame(speed = x)): prediction
## from a rank-deficient fit may be misleading

see \(b\)-splines, \[ y_i=\beta_0+\beta_1 x_i + \beta_2 (x_i-15)_+ + \varepsilon_i \]

Testing Linear Assumptions

plot(cars)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-286

Testing Linear Assumptions

positive=function(x,s) ifelse(x>s,x-s,0)
model_brk <- lm(dist ~ speed + positive(speed,15), data=cars)
x=seq(4,25,by=.1)
y=predict(model_brk, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-287

Testing Linear Assumptions

plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")
abline(coefficients(model_brk)[1],coefficients(model_brk)[2],col="blue",lty=2)

plot of chunk unnamed-chunk-288

summary(model_brk)$coefficients
##                      Estimate Std. Error    t value    Pr(>|t|)
## (Intercept)         -7.651916 10.6253642 -0.7201557 0.474995540
## speed                3.018648  0.8626766  3.4991649 0.001032415
## positive(speed, 15)  1.756247  1.4551278  1.2069365 0.233496132

Testing Linear Assumptions

library(strucchange,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(Fstats(dist ~ speed,data=cars,from=7/50))

plot of chunk unnamed-chunk-289

see Chow Test

Testing Linear Assumptions

\[ W_t=\frac{1}{\widehat{\sigma}\sqrt{n}}\sum_{i=1}^{ \lfloor nt \rfloor} \widehat{\varepsilon}_i \]

cusum <- efp(dist ~ speed, type = "OLS-CUSUM",data=cars)
plot(cusum,ylim=c(-2,2))

plot of chunk unnamed-chunk-290

Testing Linear Assumptions

plot(cusum, alpha = 0.05, alt.boundary = TRUE,ylim=c(-2,2))

plot of chunk unnamed-chunk-291

see CUSUM test

Model with Categorical Covariates

model_cut=lm(dist~ cut(speed, breaks=c(0,10,15,20,25)),data=cars)
y=predict(model_cut, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")

plot of chunk unnamed-chunk-292

Model with Categorical Covariates

library(rpart,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
tree=rpart(dist~speed,data=cars)
y=predict(tree,newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue",type="s")

plot of chunk unnamed-chunk-293

Smoothing : Polynomial Regression

model_poly=lm(dist~ poly(speed, df=3),data=cars)
y=predict(model_poly, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-294

Smoothing : Local Regression

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
plot(cars)
bw <- dpill(cars$speed,cars$dist) 
lines( locpoly(cars$speed,cars$dist,degree=0, bandwidth=bw), col='red' )
lines( locpoly(cars$speed,cars$dist,degree=1, bandwidth=bw), col='green' )
lines( locpoly(cars$speed,cars$dist,degree=2, bandwidth=bw), col='blue' )

plot of chunk unnamed-chunk-295

Smoothing : \(k\)-Nearest Neighboors

library(FNN,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
knn=knn.reg(train=cars$speed,y=cars$dist,k=5)
plot(cars)
lines(cars$speed,knn$pred,col="red")

plot of chunk unnamed-chunk-296

Smoothing : \(b\) Splines

library(splines,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_bs <- lm(dist ~ bs(speed), data=cars)
y=predict(model_bs, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-297

Smoothing Linear Model

Tuning parameter selection, Silverman's Rule \[ b^\star=0.9 \cdot\frac{ \min\lbrace \sigma,F^{-1}(.75)-F^{-1}(.25)\rbrace}{1.34 \cdot n^{1/5}} \]

bw.nrd0(cars$speed)
## [1] 2.150016

See kernel regression

Smoothing Linear Model

Tuning parameter selection, Cross Validation \[ b^\star=\text{argmin}\left\lbrace \sum_{i=1}^n \left(y_i - \widehat{m}_{(i)}(\boldsymbol{x}_i)\right)^2\right\rbrace \]

bw.ucv(cars$speed)
## Warning in bw.ucv(cars$speed): minimum occurred at one end of the range
## [1] 2.748934

Smoothing Linear Model

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
Nadaraya_Watson =with(cars,ksmooth(speed, dist, "normal",bandwidth=2.75))
plot(cars)
abline(model,col="red",lty=2)
lines(Nadaraya_Watson,lwd=2,col="blue")

plot of chunk unnamed-chunk-300

Smoothing Linear Model

library(KernSmooth,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_loess=loess(dist~ speed, data=cars,degree=1, family="gaussian")
y=predict(model_loess, newdata=data.frame(speed=x))
plot(cars)
abline(model,col="red",lty=2)
lines(x,y,lwd=2,col="blue")

plot of chunk unnamed-chunk-301

Multiple Regression

Life Expectancy (1), Homicide Rate (2), Illiteracy Rate (3)

chicago=read.table("http://freakonometrics.free.fr/chicago.txt",header=TRUE,sep=";")
model_c = lm(Fire~X_2+X_3,data=chicago)
y=function(x2,x3) predict(model_c,newdata=data.frame(X_2=x2,X_3=x3))
VX2=seq(0,80,length=26); VX3=seq(5,25,length=26)
VY=outer(VX2,VX3,y)
persp(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy",zlab="Fire",theta=20)

plot of chunk unnamed-chunk-302

Multiple Regression

VX2=seq(0,80,length=251); VX3=seq(5,25,length=251)
VY=outer(VX2,VX3,y)
image(VX2,VX3,VY,xlab="Homicide",ylab="Illiteracy")
contour(VX2,VX3,VY,add=TRUE)

plot of chunk unnamed-chunk-303

Model selection

model_c = lm(Fire~.,data=chicago)
summary(model_c)$r.squared
## [1] 0.4416723
summary(model_c)$adj.r.squared
## [1] 0.4027192

Model selection

logLik(model_c)
## 'log Lik.' -152.7678 (df=5)
AIC(model_c, k=2)               # AIC
## [1] 315.5357
AIC(model_c, k=log(nrow(cars))) # BIC
## [1] 325.0958

Penalization

\[ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right] + \color{red}{\lambda} \sum_{j=1}^k \beta_j ^2\right\rbrace \] with an explicit solution \(\widehat{\boldsymbol{\beta}}=(\boldsymbol{X}^{\text{T}}\boldsymbol{X}-\color{red}{\lambda} \mathbb{I})^{-1} \boldsymbol{X}^{\text{T}}\boldsymbol{Y}\).

library(MASS,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_ridge <- lm.ridge(Fire ~ ., data=chicago, lambda=1)

see more generally Tikhonov regularization

Optimal Penalization

mse <- NULL
n=100
v <- matrix(c(0,coefficients(model_c)[-1]), nr=n, nc=4, byrow=TRUE)
kl <- c(1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 
        .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.2, 1.4, 1.6, 1.8, 2)
for (k in kl) {
  r <- matrix(NA, nr=n, nc=4)
  for (i in 1:n) {
    boot_c <- chicago[sample(1:nrow(chicago),nrow(chicago),replace=TRUE),]
    r[i,2:4] <- model_ridge <- lm.ridge(Fire ~ ., data=boot_c, lambda=k)$coef
    r[i,1] <- mean(boot_c[,"Fire"])
  }
  mse <- append(mse, apply( (r-v)^2, 2, mean )[2])
}

Optimal Penalization

plot( mse ~ kl, type='l' )  

plot of chunk unnamed-chunk-308

Variable Selection

step(lm(Fire ~ .,data=chicago))
## Start:  AIC=180.16
## Fire ~ X_1 + X_2 + X_3
## 
##        Df Sum of Sq    RSS    AIC
## - X_1   1      0.60 1832.4 178.17
## <none>              1831.8 180.16
## - X_2   1    561.94 2393.7 190.73
## - X_3   1    702.09 2533.8 193.41
## 
## Step:  AIC=178.17
## Fire ~ X_2 + X_3
## 
##        Df Sum of Sq    RSS    AIC
## <none>              1832.4 178.17
## - X_2   1    620.98 2453.3 189.89
## - X_3   1   1003.70 2836.1 196.70
## 
## Call:
## lm(formula = Fire ~ X_2 + X_3, data = chicago)
## 
## Coefficients:
## (Intercept)          X_2          X_3  
##     21.4965       0.2213      -1.5248

Variable Selection

LASSO (least absolute shrinkage and selection operator) \[ \widehat{\boldsymbol{\beta}} = \text{argmin}\left\lbrace \sum_{i=1}^n \left[y_i- \left(\beta_0+\sum_{j=1}^k \beta_j x_{j,i} \right)\right] + \color{blue}{\lambda} \sum_{j=1}^k \vert\beta_j\vert\right\rbrace \]

library(glmnet,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
fit = glmnet(x = as.matrix(chicago[,2:4]), y = chicago[,1], family = "gaussian", alpha = 1)
plot(fit, xvar="lambda", label = TRUE )

plot of chunk unnamed-chunk-311

Variable Selection

step(model)
## Start:  AIC=275.26
## dist ~ speed
## 
##         Df Sum of Sq   RSS    AIC
## <none>               11354 275.26
## - speed  1     21186 32539 325.91
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Coefficients:
## (Intercept)        speed  
##     -17.579        3.932

Dimension Reduction

model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:3],data=chicago)
predict(model_c)[1:5]
##         1         2         3         4         5 
##  9.975499 16.985083 14.244580 13.411960 18.337225
predict(model_acp)[1:5]
##         1         2         3         4         5 
##  9.975499 16.985083 14.244580 13.411960 18.337225

Dimension Reduction

model_acp=lm(Fire ~ princomp(cbind(X_1,X_2,X_3))$scores[,1:2],data=chicago)
predict(model_c)[1:5]
##         1         2         3         4         5 
##  9.975499 16.985083 14.244580 13.411960 18.337225
predict(model_acp)[1:5]
##        1        2        3        4        5 
## 10.01032 17.02103 14.29953 13.43876 18.39402

Logistic Regression

\[ \mathbb{E}[Y\vert \boldsymbol{X}=\boldsymbol{x}]= p(\boldsymbol{x})=\frac{\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}{1+\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]}=H(\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}) \]

logistic regression

Logistic Regression

myocarde <-read.table("http://freakonometrics.free.fr/myocarde.csv", head=TRUE,sep=";")
model_bernoulli = glm(PRONO~., data=myocarde, family=binomial)
coefficients(summary(model_bernoulli))
##                  Estimate   Std. Error     z value  Pr(>|z|)
## (Intercept) -10.187641685 11.895227082 -0.85644785 0.3917501
## FRCAR         0.138178119  0.114112163  1.21089738 0.2259347
## INCAR        -5.862429035  6.748785486 -0.86866430 0.3850308
## INSYS         0.717084018  0.561444684  1.27721223 0.2015273
## PRDIA        -0.073668171  0.291636009 -0.25260314 0.8005749
## PAPUL         0.016756506  0.341942251  0.04900391 0.9609162
## PVENT        -0.106776012  0.110550096 -0.96586087 0.3341138
## REPUL        -0.003154187  0.004890917 -0.64490712 0.5189874

Logistic Regression

X=cbind(1,as.matrix(myocarde[,1:7]))
Y=myocarde$PRONO=="Survival"
beta=as.matrix(lm(Y~0+X)$coefficients,ncol=1)
for(s in 1:9){
   pi=exp(X%*%beta[,s])/(1+exp(X%*%beta[,s]))
   gradient=t(X)%*%(Y-pi)
   omega=matrix(0,nrow(X),nrow(X));diag(omega)=(pi*(1-pi))
   Hessian=-t(X)%*%omega%*%X
   beta=cbind(beta,beta[,s]-solve(Hessian)%*%gradient)}

Logistic Regression

 beta[,1]
##      X XFRCAR XINCAR XINSYS XPRDIA XPAPUL XPVENT XREPUL 
##      0      0      0      0      0      0      0      0
 beta[,9]
##             X        XFRCAR        XINCAR        XINSYS        XPRDIA 
## -9.202735e+00  4.772571e-14 -5.755396e-13  4.540812e-14  4.996004e-16 
##        XPAPUL        XPVENT        XREPUL 
##  2.664535e-15  1.387779e-15 -2.081668e-17
# -solve(Hessian)
sqrt(-diag(solve(Hessian)))
##                     FRCAR        INCAR        INSYS        PRDIA        PAPUL 
## 180.18368217   1.87748780  93.03024470   6.52098379   6.34628288   5.47188241 
##        PVENT        REPUL 
##   3.05900360   0.05472085

Logistic Regression

GLM <- glm(PRONO ~ PVENT + REPUL, data = myocarde, family = binomial)
pred_GLM = function(p,r){
 return(predict(GLM, newdata =
 data.frame(PVENT=p,REPUL=r), type="response"))}
vp=seq(min(myocarde$PVENT),max(myocarde$PVENT),length=251)
vr=seq(min(myocarde$REPUL),max(myocarde$REPUL),length=251)
vd=outer(vp,vr,pred_GLM)
library(RColorBrewer,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
CL=colorRampPalette(brewer.pal(name="RdYlBu", 11))(100)

Logistic Regression

image(vp,vr,vd,col=CL,xlab="PVENT",ylab="REPUL")

plot of chunk unnamed-chunk-320

Logistic Regression

probabilities <- predict(GLM, myocarde, type="response")
predictions <- levels(myocarde$PRONO)[(probabilities>.5)+1] 
table(predictions, myocarde$PRONO)
##            
## predictions DECES SURVIE
##      DECES     24      4
##      SURVIE     5     38

Residuals and GLM

Pearson's residuals \[ \widehat{\varepsilon}_i=\frac{y_i - \widehat{\pi}_i}{ \sqrt{\widehat{\pi}_i\cdot[1-\widehat{\pi}_i]}} \] Deviance residuals \[ \widehat{\varepsilon}_i=\pm \left( y_i \log[\widehat{\pi}_i]+ (1-y_i) \log[1-\widehat{\pi}_i]\right) \]

E1=residuals(model_bernoulli,type="pearson")
E2=residuals(model_bernoulli,type="deviance")

Residuals and GLM

plot(1:nrow(myocarde),E1)

plot of chunk unnamed-chunk-323

Multinomial Regression

Binomial case, \(Y\in\lbrace 0,1\rbrace\),

\(\mathbb{P}[Y=1\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\boldsymbol{\beta}]\)

\(\mathbb{P}[Y=0\vert \boldsymbol{X}=\boldsymbol{x}]\propto1\)

Multinomial case, \(Y\in\lbrace A,B,C\rbrace\),

\(\mathbb{P}[Y=\color{red}{A}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{red}{\boldsymbol{\beta}_A}]\)

\(\mathbb{P}[Y=\color{blue}{B}\vert \boldsymbol{X}=\boldsymbol{x}]\propto\exp[\boldsymbol{x}^{\text{ T}}\color{blue}{\boldsymbol{\beta}_B}]\)

\(\mathbb{P}[Y=\color{green}{C}\vert \boldsymbol{X}=\boldsymbol{x}]\propto1\)

Multinomial Regression

library(VGAM,verbose=FALSE,quietly=TRUE,warn.conflicts=FALSE)
model_multi = vglm(PRONO~., data=myocarde, family="multinomial")
model_multi
## 
## Call:
## vglm(formula = PRONO ~ ., family = "multinomial", data = myocarde)
## 
## 
## Coefficients:
##  (Intercept)        FRCAR        INCAR        INSYS        PRDIA        PAPUL 
## 10.187641072 -0.138178115  5.862428927 -0.717083993  0.073668174 -0.016756516 
##        PVENT        REPUL 
##  0.106776011  0.003154187 
## 
## Degrees of Freedom: 71 Total; 63 Residual
## Residual deviance: 41.04314 
## Log-likelihood: -20.52157 
## 
## This is a multinomial logit model with 2 levels

Generalized Linear Models

Poisson and Gamma distributions in the exponential family

base=data.frame(x=c(1,2,3,4,5),y=c(1,2,4,2,6))
regNId <- glm(y~x,family=gaussian(link="identity"),data=base)
regNlog <- glm(y~x,family=gaussian(link="log"),data=base)
regPId <- glm(y~x,family=poisson(link="identity"),data=base)
regPlog <- glm(y~x,family=poisson(link="log"),data=base)
regGId <- glm(y~x,family=Gamma(link="identity"),data=base)
regGlog <- glm(y~x,family=Gamma(link="log"),data=base)

Generalized Linear Models

visuel=function(regression,titre=""){
 plot(base$x,base$y,pch=19,cex=1.5,main=titre,xlab="",ylab="")
 abs <- seq(0,7,by=.1)
 yp <- predict(regression,newdata=data.frame(x=abs),se.fit = TRUE, type="response")
 polygon(c(abs,rev(abs)),c(yp$fit+2*yp$se.fit,rev(yp$fit-2*yp$se.fit)), col="light green",border=NA)
 points(base$x,base$y,pch=19,cex=1.5)
 lines(abs,yp$fit,lwd=2,col="red")
 lines(abs,yp$fit+2*yp$se.fit,lty=2)
 lines(abs,yp$fit-2*yp$se.fit,lty=2)}

Generalized Linear Models

visuel(regNId,"Gaussienne, lien identité")

plot of chunk unnamed-chunk-327

Generalized Linear Models

visuel(regPId,"Poisson, lien identité")

plot of chunk unnamed-chunk-328

Generalized Linear Models

visuel(regGId,"Gamma, lien identité")

plot of chunk unnamed-chunk-329

Generalized Linear Models

visuel(regNlog,"Gaussienne, lien logarithmique")

plot of chunk unnamed-chunk-330

Generalized Linear Models

visuel(regPlog,"Poisson, lien logarithme")

plot of chunk unnamed-chunk-331

Generalized Linear Models

visuel(regGlog,"Gamma, lien logarithme")

plot of chunk unnamed-chunk-332