##### ARTHUR CHARPENTIER ##### DATA SCIENCE POUR L'ACTUARIAT, INSTITUT DES ACTUAIRES ##### DATA MINING & R date: March, 16, 2015 mel: charpentier.arthur@uqam.ca ##### R http://freakonometrics.hypotheses.org http://adv-r.had.co.nz/ http://cran.r-project.org/manuals.html http://stackoverflow.com/questions/tagged/r http://www.r-bloggers.com/ http://cran.r-project.org/ ##### R : PACKAGES getOption("defaultPackages") (.packages(all.available=TRUE)) install.packages("quantreg", dependencies=TRUE) ?rq library(quantreg) ?rq rq detach(package:quantreg, unload=TRUE) ?rq apropos("norm") ##### R : S3 vs S4 person3 <- function(name,age,weight,height){ characteristics<-list(name=name,age=age,weight=weight,height=height) class(characteristics)<-"person3" return(characteristics)} JohnDoe3 <- person3(name="John",age=28, weight=76, height=182) JohnDoe3 JohnDoe3$age BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)} BMI3(JohnDoe3) reg3 <- lm(dist~speed,data=cars) reg3$coefficients coef(reg3) summary setClass("person4", representation(name="character", age="numeric", weight="numeric", height="numeric")) JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182) JohnDoe4 JohnDoe4@age setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4"))) setMethod("BMI4", "person4", function(object){return(object@weight*1e4/object@height^2)}) BMI4(JohnDoe4) library(VGAM) reg4 <- vglm(dist~speed,data=cars,family=gaussianff) reg4@coefficients coef(reg4) qnorm(0.25,mean=5,sd=2) library(distr) library(distrEx) X <- Norm(mean=5,sd=2) q(X) q(X)(0.25) mean(X) X1 <- Norm(mean=5,sd=2) X2 <- Norm(mean=2,sd=1) S <- X1+X2 plot(S) q(S)(.5) ##### R objects : NUMBERS set.seed(1) U <- runif(20) U[1:4] options(digits = 3) U[1:4] options(digits = 22) U[1:4] options(digits = 3) x <- exp(1) y <- x x <- 2 y class(x) 0/0 1/0 .Machine$double.xmax 2e+3070]) x[-(3:8)] x[-3:8] x[c(TRUE,FALSE)] x[c(TRUE,NA,FALSE,TRUE)] x[2] x[[2]] for(i in 1:2){ nom_var <- paste0("x", i) assign(nom_var, rpois(5,7)) } x1 x2 x <- runif(4) x x library(pryr) x %.5) {("greater than 50%")} else {("smaller than 50%")} ifelse(u>.5,("greater than 50%"),("smaller than 50%")) u u <- runif(3) if(u>.5) {print("greater than 50%")} else {("smaller than 50%")} ifelse(u>.5,("greater than 50%"),("smaller than 50%")) u ## LOOPS v_x <- runif(1e5) sqrt_x <- NULL for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)) sqrt_x <- NULL system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))) sqrt_x <- numeric(length(v_x)) for(x in seq_along(v_x)) sqrt_x[i] <- sqrt(x[i]) sqrt_x <- numeric(length(v_x)) system.time(for(x in seq_along(v_x)) sqrt_x[i] <- sqrt(x[i])) system.time(Vectorize(sqrt)(v_x)) sqrt_x <- sapply(v_x,sqrt) sqrt_x <- unlist(lapply(v_x,sqrt)) system.time(unlist(lapply(v_x,sqrt))) library(parallel) (allcores <- detectCores(all.tests=TRUE)) sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4)) system.time(unlist(mclapply(v_x,sqrt,mc.cores=4))) f<-function(x) dlnorm(x) integrate(f,0,Inf) integrate(f,0,1e5) integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value ## APPLY N<-rpois(20,5) dim(N)=c(4,5) rN.Poisson <- function(n) rpois(n,5) rX.Exponential <- function(n) rexp(n,2) rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ V <- rep(0,n) for(i in 1:n){ N <- rN(1) if(N>0){V[i] <- sum(rX(N))} } return(V)} rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ V <- rep(0,n) for (i in 1:n) V[i] <- sum(rX(rN(1))) return(V)} rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ N <- rN(n) X <- rX(sum(N)) I <- factor(rep(1:n,N),levels=1:n) return(as.numeric(xtabs(X ~ I)))} rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ N <- rN(n) X <- rX(sum(N)) I <- factor(rep(1:n,N),levels=1:n) V <- tapply(X,I,sum) V[is.na(V)] <- 0 return(as.numeric(V))} rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(sapply(rN(n), function(x) sum(rX(x))))} rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(sapply(Vectorize(rX)(rN(n)),sum))} rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(unlist(lapply(lapply(t(rN(n)),rX),sum)))} n <- 100 library(microbenchmark) options(digits=1) microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n)) ## VECTORIZATION f <- function(x,m=0,s=1){ H<-function(t) 1-pnorm(t,m,s) integral<-integrate(H,lower=x,upper=Inf)$value res<-H(x)/integral return(res) } f(0:1) y <- rep(NA,2) x <- 0:1 for(i in 1:2) y[i] <- f(x[i]) y sapply(x,"f") Vectorize(f)(x) binorm <- function(x1,x2,r=0){ exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2)) } u <- seq(-2,2) binorm(u,u) outer(u,u,binorm) (uv<-expand.grid(u,u)) matrix(binorm(uv$Var1,uv$Var2),5,5) u <- seq(-2,2,length=31) persp(u,u,outer(u,u,binorm)) library(rgl) ## COMPUTATIONAL EFFICIENCY library(microbenchmark) n <- 1000 A<-matrix(seq(1,n^2),n,n) B<-matrix(seq(1,n^2),n,n) C<-1:n microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1) m <- 1000 n <- 50 X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m) group <- rep(c("A","B"), each = n / 2) system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat) system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat) t.test_1 <- function(x, grp){ t.test(x[ group == "A"], x[ group == "B"])$stat } system.time(t1 <- apply(X, 1, t.test_1, grp = grp)) t.test_2 <- function(x, grp) { t_stat <- function(x) { m <- mean(x) n <- length(x) var <- sum((x - m) ^ 2) / (n - 1) list(m = m, n = n, var = var) } g1 <- t_stat(x[grp == "A"]) g2 <- t_stat(x[grp == "B"]) se_total <- sqrt(g1$var / g1$n + g2$var / g2$n) (g1$m - g2$m) / se_total } system.time(apply(X, 1, t.test_2, grp = group)) t.test_3 <- function(X, grp){ t_stat <- function(X) { m <- rowMeans(X) n <- ncol(X) var <- rowSums((X - m) ^ 2) / (n - 1) list(m = m, n = n, var = var) } g1 <- t_stat(X[, grp == "A"]) g2 <- t_stat(X[, grp == "B"]) se_total <- sqrt(g1$var / g1$n + g2$var / g2$n) (g1$m - g2$m) / se_total } system.time( t.test_3(X, group)) "%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s 100 %pm% 10 'second<-' <- function(x, value) { x[2] <- value return(x) } x <- 1:10 second(x) <- 5 x library(pryr) x <- 1:10 address(x) second(x) <- 6 address(x) ls(globalenv()) find("x") find("pi") environment(sd) e <- new.env() e$d <- 1 e$f <- 1:5 e$g <- 1:5 ls(e) str(e) identical(globalenv(),e) search() f1 <- function(x1){ f2 <- function(x2){ f3 <- function(x3){ x1+x2+x3 } f3(3) } f2(2) } f1(1) f <- function(x) log(x) f("x") try(f("x")) inherits(try(f("x")), "try-error") inherits(try(f("x"), silent=TRUE), "try-error") a=0 try(a<-f("x"), silent=TRUE) a try(a<-f(x), silent=TRUE) a x=1:10 g=function(f) f(x) g(mean) fibonacci <- function(n){ if(n<2) return(1) fibonacci(n-2)+fibonacci(n-1) } fibonacci(20) system.time(fibonacci(30)) library(memoise) fibonacci <- memoise(function(n){ if(n<2) return(1) fibonacci(n-2)+fibonacci(n-1) }) system.time(fibonacci(30)) f <- function(x) x^2 f formals(f) body(f) set.seed(1) x<-rexp(6) sum(x) .Primitive("sum")(x) library(Rcpp) cppFunction('double sum_C(NumericVector x) { int n = x.size(); double total = 0; for(int i = 0; i < n; ++i) { total += x[i]; } return total; }') sum_C(x) library(rPython) sum_P=function(z) python.call("sum",z) sum_P(x) library(microbenchmark) x <- runif(1e6) microbenchmark( sum(x), .Primitive("sum")(x), sum_C(x)) x<-10 f=function() x<-5 f f() x f=function(){ x<-5 return(x) } f() x f=function(){ x<<-5 return(x) } f() x f=function(){ z<-5 return(z) } f() z f=function(x){ z<-x+5 return(z) } f(2) z x=function(y) y/2 x x <- 5 x(x) x=function(y) y/2 y=function(){ x <- 10 x(x) } y() names_list <- function(...){ names(list(...)) } names_list(a=5,b=7) power <- function(exponent) { function(x) { x ^ exponent } } square <- power(2) square(4) cube <- power(3) cube(4) cube total <- 20 pb <- txtProgressBar(min = 0, max = total, style = 3) for(i in 1:(10*total)){ Sys.sleep(0.01) if(i %%10 == 0) setTxtProgressBar(pb, i%/%10) } close(pb) library(tcltk) total <- 20 pb <- tkProgressBar(title = "progress bar", min = 0,max = total, width = 300) for(i in 1:(10*total)){ Sys.sleep(0.01) if(i %%10 == 0) setTkProgressBar(pb, i, label=paste( round(i%/%10/total*100, 0),"% done")) } close(pb) ##### R objects : DATA FRAMES df <- data.frame(x=1:3,y=letters[1:3]) str(df) typeof(df) class(df) cbind(df,z=9:7) df$z<-5:3 df df[1] df[,1,drop=FALSE] df[,1,drop=TRUE] df[[1]] df$x df[,"x"] df[["x"]] set.seed(1) df[sample(nrow(df)),] set.seed(1) df[sample(nrow(df),nrow(df)*2,replace=TRUE),] subset(df, x>1) set.seed(123) df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50, replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE))) tail(df) reg <- lm(Y~X1+X2,data=df) model.matrix(reg)[45:50,] reg <- lm(Y~(X1=="A")+X2,data=df) model.matrix(reg)[45:50,] reg <- lm(Y~X1*X2,data=df) model.matrix(reg)[45:50,] reg <- lm(Y~X1+X2%in%X1,data=df) model.matrix(reg)[45:50,] # http://freakonometrics.free.fr/superheroes.RData getwd() load("superheroes.RData") superheroes publishers library(dplyr) inner_join(superheroes, publishers) inner_join(publishers, superheroes) merge(superheroes, publishers, all = TRUE) semi_join(superheroes, publishers) semi_join(publishers, superheroes) left_join(superheroes, publishers) left_join(publishers, superheroes) anti_join(superheroes, publishers) anti_join(publishers, superheroes) # http://freakonometrics.free.fr/gapminderDataFiveYear.txt gdf <- read.delim("gapminderDataFiveYear.txt") head(gdf,4) str(gdf) library(dplyr) gtbl <- tbl_df(gdf) library(data.table) gdfdt <- data.table(gdf) subset(gdf, lifeExp < 30) gdf[lifeExp < 30,] gdf[gdf$lifeExp < 30,] filter(gtbl, lifeExp < 30) subset(gdfdt, lifeExp < 30) gdfdt[lifeExp < 30,] gdf[gdf$country == "Italy", c("year", "lifeExp")] gtbl %>% filter(country == "Italy") %>% select(year, lifeExp) gdfdt[country == "Italy",cbind(year, lifeExp)] setkey(gdfdt,country) gdfdt["Italy",cbind(year, lifeExp)] ##### WEB SCRAPPING WITH R library(RCurl) library(XML) library(stringr) library(maptools) library(rgdal) library(maps) library(TeachingDemos) tb <- getForm("http://www.dastelefonbuch.de/", .params = c(kw = "Petersen", cmd = "search", ao1 = "1", reccount = "2000")) setwd("/home/arthur/Téléchargements/") write(tb, file = "phonebook/phonebook_feuerstein.html") tb_parse <- htmlParse("phonebook/phonebook_feuerstein.html", encoding = "UTF-8") xpath <- "//ul/li/a[contains(text(), 'Privat')]" num_results <- xpathSApply(tb_parse, xpath, xmlValue) num_results num_results <- as.numeric(str_extract(num_results, "[[:digit:]]+")) num_results xpath <- "//div[@class='name']/a[@title]" surnames <- xpathSApply(tb_parse, xpath, xmlValue) surnames[1:3] xpath <- "//span[@itemprop='postal-code']" zipcodes <- xpathSApply(tb_parse, xpath, xmlValue) zipcodes[1:3] xpath <- "//span[@itemprop='postal-code']/ancestor::div[@class='popupMenu']/preceding-sibling::div[@class='name']" names_vec <- xpathSApply(tb_parse, xpath, xmlValue) xpath <- "//div[@class='name']/following-sibling::div[@class='popupMenu']//span[@itemprop='postal-code']" zipcodes_vec <- xpathSApply(tb_parse, xpath, xmlValue) names_vec <- str_replace_all(names_vec, "(\\n|\\t|\\r| {2,})", "") zipcodes_vec <- as.numeric(zipcodes_vec) entries_df <- data.frame(plz = zipcodes_vec, name = names_vec) head(entries_df) download.file("http://fa-technik.adfc.de/code/opengeodb/PLZ.tab", destfile = "geo_germany/plz_de.txt") plz_df <- read.delim("geo_germany/plz_de.txt", stringsAsFactors = FALSE, encoding = "UTF-8") plz_df[1:3, ] places_geo <- merge(entries_df, plz_df, by = "plz", all.x = TRUE) places_geo[1:3, ] download.file("http://biogeo.ucdavis.edu/data/gadm2/shp/DEU_adm.zip", destfile = "geo_germany/ger_shape.zip") unzip("geo_germany/ger_shape.zip", exdir = "geo_germany") dir("geo_germany") projection <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84") map_germany <- readShapePoly(str_c(getwd(), "/geo_germany/DEU_adm0.shp"), proj4string = projection) map_germany_laender <- readShapePoly(str_c(getwd(),"/geo_germany/DEU_adm1.shp"), proj4string=projection) SP=cbind(places_geo$lon, places_geo$lat) SP=SP[-which(is.na(SP[,1])),] coords <- SpatialPoints(SP) proj4string(coords) <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84") data("world.cities") cities_ger <- subset(world.cities, country.etc == "Germany" & (world.cities$pop > 450000 | world.cities$name %in% c("Mannheim", "Jena"))) coords_cities <- SpatialPoints(cbind(cities_ger$long,cities_ger$lat)) plot(map_germany) plot(map_germany_laender, add = T) points(coords$coords.x1, coords$coords.x2, pch = 20, col = "purple") points(coords_cities, col = "black", , bg = "grey", pch = 23) text(cities_ger$long, cities_ger$lat, labels = cities_ger$name, pos = 4)