# Actuary Data Science # Arthur Charpentier - @freakonometrics # R & Packages getOption("defaultPackages") (.packages(all.available=TRUE)) ?rq require(quantreg) # install.packages("quantreg") library(quantreg, verbose=FALSE) ?rq rq detach(package:quantreg, unload=TRUE) ?rq apropos("norm") # S3 vs S4 Classes person3 <- function(name,age,weight,height){ characteristics<-list(name=name,age=age,weight=weight,height=height) class(characteristics)<-"person3" return(characteristics)} JohnDoe3 <- person3(name="John",age=28, weight=76, height=182) JohnDoe3 JohnDoe3$age BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)} BMI3(JohnDoe3) reg3 <- lm(dist~speed,data=cars) reg3$coefficients coef(reg3) summary(reg3) setClass("person4", representation(name="character", age="numeric", weight="numeric", height="numeric"), where = .GlobalEnv) JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182) JohnDoe4 eval(JohnDoe4,.GlobalEnv) JohnDoe4@age setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")), where = .GlobalEnv) setMethod("BMI4", "person4", function(object){return(object@weight*1e4/object@height^2)}, where = .GlobalEnv) BMI4(JohnDoe4) # install.packages("VGAM") library(VGAM) reg4 <- vglm(dist~speed,data=cars,family=gaussianff) reg4@coefficients coef(reg4) qnorm(0.25,mean=5,sd=2) # install.packages("distr") # install.packages("distrEx") library(distr, verbose=FALSE) library(distrEx, verbose=FALSE) X <- Norm(mean=5,sd=2) q(X) q(X)(0.25) mean(X) X1 <- Norm(mean=5,sd=2) X2 <- Norm(mean=2,sd=1) S <- X1+X2 q(S)(.5) plot(S) # R objects : Numbers set.seed(1) U <- runif(20) U[1:4] options(digits = 3) U[1:4] options(digits = 22) U[1:4] options(digits = 3) x <- exp(1) y <- x x <- 2 y class(x) 0/0 1/0 .Machine$double.xmax 2e+3070]) x[-(3:8)] # x[-3:8] x[c(TRUE,FALSE)] x[c(TRUE,NA,FALSE,TRUE)] x[2] x[[2]] for(i in 1:2){ nom_var <- paste0("x", i) assign(nom_var, rpois(5,7)) } x1 x2 x <- runif(4) x x set.seed(1) x <- runif(4) x # install.packages("pryr") library(pryr) x %.5) {("greater than 50%")} else {("smaller than 50%")} ifelse(u>.5,("greater than 50%"),("smaller than 50%")) u u <- runif(3) if(u>.5) {print("greater than 50%")} else {("smaller than 50%")} ifelse(u>.5,("greater than 50%"),("smaller than 50%")) u # For Loops v_x <- runif(1e5) sqrt_x <- NULL for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)) sqrt_x <- NULL system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))) sqrt_x <- numeric(length(v_x)) for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]) sqrt_x <- numeric(length(v_x)) system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])) # Apply system.time(Vectorize(sqrt)(v_x)) sqrt_x <- sapply(v_x,sqrt) sqrt_x <- unlist(lapply(v_x,sqrt)) system.time(unlist(lapply(v_x,sqrt))) # Parallel Apply # install.packages("parallel") library(parallel) (allcores <- detectCores(all.tests=TRUE)) sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4)) system.time(unlist(mclapply(v_x,sqrt,mc.cores=4))) # Functions f<-function(x) dlnorm(x) integrate(f,0,Inf) integrate(f,0,1e5) integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value # Compound Poisson rN.Poisson <- function(n) rpois(n,5) rX.Exponential <- function(n) rexp(n,2) rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ V <- rep(0,n) for(i in 1:n){ N <- rN(1) if(N>0){V[i] <- sum(rX(N))} } return(V)} rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ V <- rep(0,n) for (i in 1:n) V[i] <- sum(rX(rN(1))) return(V)} rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ N <- rN(n) X <- rX(sum(N)) I <- factor(rep(1:n,N),levels=1:n) return(as.numeric(xtabs(X ~ I)))} rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ N <- rN(n) X <- rX(sum(N)) I <- factor(rep(1:n,N),levels=1:n) V <- tapply(X,I,sum) V[is.na(V)] <- 0 return(as.numeric(V))} rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(sapply(rN(n), function(x) sum(rX(x))))} rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(sapply(Vectorize(rX)(rN(n)),sum))} rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){ return(unlist(lapply(lapply(t(rN(n)),rX),sum)))} n <- 100 # install.packages("microbenchmark") library(microbenchmark) options(digits=1) microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n)) # Vectorization f <- function(x,m=0,s=1){ H<-function(t) 1-pnorm(t,m,s) integral<-integrate(H,lower=x,upper=Inf)$value res<-H(x)/integral return(res) } f(0:1) y <- rep(NA,2) x <- 0:1 for(i in 1:2) y[i] <- f(x[i]) y sapply(x,"f") Vectorize(f)(x) binorm <- function(x1,x2,r=0){ exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2)) } u <- seq(-2,2) binorm(u,u) outer(u,u,binorm) (uv<-expand.grid(u,u)) matrix(binorm(uv$Var1,uv$Var2),5,5) u <- seq(-2,2,length=31) persp(u,u,outer(u,u,binorm)) # Computational Efficiency library(microbenchmark) n <- 1000 A<-matrix(seq(1,n^2),n,n) B<-matrix(seq(1,n^2),n,n) C<-1:n microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1) m <- 1000 n <- 50 X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m) group <- rep(c("A","B"), each = n / 2) system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat) system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat) t.test_1 <- function(x, grp){ t.test(x[ group == "A"], x[ group == "B"])$stat } system.time(t1 <- apply(X, 1, t.test_1, grp = grp)) t.test_2 <- function(x, grp) { t_stat <- function(x) { m <- mean(x) n <- length(x) var <- sum((x - m) ^ 2) / (n - 1) list(m = m, n = n, var = var) } g1 <- t_stat(x[grp == "A"]) g2 <- t_stat(x[grp == "B"]) se_total <- sqrt(g1$var / g1$n + g2$var / g2$n) (g1$m - g2$m) / se_total } system.time(apply(X, 1, t.test_2, grp = group)) t.test_3 <- function(X, grp){ t_stat <- function(X) { m <- rowMeans(X) n <- ncol(X) var <- rowSums((X - m) ^ 2) / (n - 1) list(m = m, n = n, var = var) } g1 <- t_stat(X[, grp == "A"]) g2 <- t_stat(X[, grp == "B"]) se_total <- sqrt(g1$var / g1$n + g2$var / g2$n) (g1$m - g2$m) / se_total } system.time( t.test_3(X, group)) "%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s 100 %pm% 10 'second<-' <- function(x, value) { x[2] <- value return(x) } x <- 1:10 second(x) <- 5 x # Memory Issues library(pryr) x <- 1:10 address(x) second(x) <- 6 address(x) ls(globalenv()) find("x") find("pi") environment(sd) e <- new.env() e$d <- 1 e$f <- 1:5 e$g <- 1:5 ls(e) str(e) identical(globalenv(),e) search() # Memory Issues f1 <- function(x1){ f2 <- function(x2){ f3 <- function(x3){ x1+x2+x3 } f3(3) } f2(2) } f1(1) f <- function(x) log(x) f("x") try(f("x")) inherits(try(f("x")), "try-error") inherits(try(f("x"), silent=TRUE), "try-error") a=0 try(a<-f("x"), silent=TRUE) a try(a<-f(x), silent=TRUE) a x=1:10 g=function(f) f(x) g(mean) fibonacci <- function(n){ if(n<2) return(1) fibonacci(n-2)+fibonacci(n-1) } fibonacci(20) system.time(fibonacci(30)) # install.packages("memoise") library(memoise) fibonacci <- memoise(function(n){ if(n<2) return(1) fibonacci(n-2)+fibonacci(n-1) }) system.time(fibonacci(30)) f <- function(x) x^2 f formals(f) body(f) set.seed(1) x<-rexp(6) sum(x) .Primitive("sum")(x) # install.packages("Rcpp") library(Rcpp) cppFunction('double sum_C(NumericVector x) { int n = x.size(); double total = 0; for(int i = 0; i < n; ++i) { total += x[i]; } return total; }') sum_C(x) # install.packages("rPython") library(rPython) sum_P=function(z) python.call("sum",z) sum_P(x) library(microbenchmark) x <- runif(1e6) microbenchmark( sum(x), .Primitive("sum")(x), sum_C(x)) x<-10 f=function() x<-5 f f() x f=function(){ x<-5 return(x) } f() x f=function(){ x<<-5 return(x) } f() x f=function(){ z<-5 return(z) } f() z f=function(x){ z<-x+5 return(z) } f(2) z x=function(y) y/2 x x <- 5 x(x) x=function(y) y/2 y=function(){ x <- 10 x(x) } y() names_list <- function(...){ names(list(...)) } names_list(a=5,b=7) power <- function(exponent) { function(x) { x ^ exponent } } square <- power(2) square(4) cube <- power(3) cube(4) cube total <- 20 pb <- txtProgressBar(min = 0, max = total, style = 3) for(i in 1:(10*total)){ Sys.sleep(0.01) if(i %%10 == 0) setTxtProgressBar(pb, i%/%10) } close(pb) # R objects : Data Frames df <- data.frame(x=1:3,y=letters[1:3]) str(df) typeof(df) class(df) cbind(df,z=9:7) df$z<-5:3 df df[1] df[,1,drop=FALSE] df[,1,drop=TRUE] df[[1]] df$x df[,"x"] df[["x"]] set.seed(1) df[sample(nrow(df)),] set.seed(1) df[sample(nrow(df),nrow(df)*2,replace=TRUE),] subset(df, x>1) set.seed(123) df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50, replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE))) tail(df) reg <- lm(Y~X1+X2,data=df) model.matrix(reg)[45:50,] reg <- lm(Y~(X1=="A")+X2,data=df) model.matrix(reg)[45:50,] reg <- lm(Y~X1*X2,data=df) model.matrix(reg)[45:50,] reg <- lm(Y~X1+X2%in%X1,data=df) model.matrix(reg)[45:50,] download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData") load("superheroes.RData") superheroes publishers # install.packages("dplyr") library(dplyr, verbose=FALSE) inner_join(superheroes, publishers) inner_join(publishers, superheroes) merge(superheroes, publishers, all = TRUE) semi_join(superheroes, publishers) semi_join(publishers, superheroes) left_join(superheroes, publishers) left_join(publishers, superheroes) anti_join(superheroes, publishers) anti_join(publishers, superheroes) download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt","gapminderDataFiveYear.txt") gdf <- read.delim("gapminderDataFiveYear.txt") head(gdf,4) str(gdf) library(dplyr) gtbl <- tbl_df(gdf) # install.packages("data.table") library(data.table) gdfdt <- data.table(gdf) subset(gdf, lifeExp < 30) gdf[lifeExp < 30,] gdf[gdf$lifeExp < 30,] filter(gtbl, lifeExp < 30) subset(gdfdt, lifeExp < 30) gdfdt[lifeExp < 30,] gdf[gdf$country == "Italy", c("year", "lifeExp")] gtbl %>% filter(country == "Italy") %>% select(year, lifeExp) gdfdt[country == "Italy",cbind(year, lifeExp)] setkey(gdfdt,country) gdfdt["Italy",cbind(year, lifeExp)] rm(list=ls()) loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt" download.file(loc,destfile="temp") df <- read.delim("temp") tail(df,4) str(df) sum(duplicated(df)) df$lifeExp[1:10] df[4:5,] library(dplyr) ldf <- tbl_df(df) glimpse(ldf) select(ldf,lifeExp) select(ldf,lifeExp)$lifeExp[1:10] #library(modify) #s_select(ldf,"lifeExp") ldf[4:5,] slice(ldf,4:5) filter(ldf,lifeExp<30 | gdpPercap<300) rename(ldf,population = pop) library(data.table) dt <- data.table(df) is.data.frame(dt) ?fread tables() object.size(df) object.size(ldf) object.size(dt) df[df$lifeExp<30,] subset(df, lifeExp < 30) filter(ldf, lifeExp < 30) subset(dt, lifeExp < 30) dt[lifeExp<30] setkey(dt, country) dt["Italy"] dt["Italy",mult="first"] dt["Italy",max(lifeExp)] max(df$lifeExp[df$country=="Italy"]) setkey(dt, country, lifeExp) dt["Italy"] dt[.("Italy",77.44)] dt[.("Italy",77)] dt[.("Italy",77),roll="nearest"] df[df$country == "Italy", c("year", "lifeExp")] ldf %>% filter(country == "Italy") %>% select(year, lifeExp) small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")] # install.packages("tidyr") library(tidyr) str(small_df) table_df=spread(small_df,"country","lifeExp") table_df data_df=gather(table_df,"country","lifeExp",2:4) data_df # install.packages("reshape2") library(reshape2) dcast(small_df, formula = country~year, value.var="lifeExp") table_df= dcast(small_df, formula = year~country, value.var="lifeExp") table_df melt(table_df, id.vars=c("year"), value.name = "lifeExp", variable.name="country") aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country)) tapply(small_df$lifeExp,factor(small_df$country),max) small_ldf = filter(ldf,country %in% c("France","Italy","Spain")) group_by(small_ldf,country) %>% summarize(max(lifeExp)) filter(ldf,country %in% c("France","Italy","Spain")) %>% group_by(country) %>% summarize(max(lifeExp)) filter(ldf,country %in% c("France","Italy","Spain")) %>% group_by(country) %>% summarize(max(lifeExp)) plot(df$gdpPercap,df$lifeExp) select(ldf,gdpPercap,lifeExp) %>% plot() setkey(dt, country) dt[c("France","Italy","Spain"),max(lifeExp),by="country"] head(df[with(df,order(-lifeExp)),]) head(dt[order(-lifeExp)]) head(setorder(dt,-lifeExp)) arrange(ldf,-lifeExp) df$gdp=df$pop*df$gdpPercap df = within(df,gdp = pop*gdpPercap) df = within(df,gdp <- pop*gdpPercap) dt[,gdp:=pop*gdpPercap] head(dt) mutate(ldf,gdp=pop*gdpPercap) countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE) countries = countries[-1,] head(countries) df2 = countries[,c("Country","Capital")] names(df2)=c("countries","capital") df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country)) names(df1)=c("countries","lifeExp") dim(df1) dim(df2) tail(merge(df1,df2)) tail(merge(df1,df2,all.x=TRUE)) tail(merge(df1,df2,all.y=TRUE)) ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) names(ldf1)=c("countries","lifeExp") ldf2 <- tbl_df(df2) inner_join(ldf1,ldf2) left_join(ldf1,ldf2) ldf1 %>% inner_join(ldf2) dt1 = dt[,max(lifeExp),by="country"] setnames(dt1,c("countries","lifeExp")) dt2 <- data.table(df2) setkey(dt1,"countries") setkey(dt2,"countries") dt1[dt2,nomatch=0] dt1[dt2] dt2[dt1] # install.packages("stringr") # install.packages("lubridate") # install.packages("plyr") # install.packages("LaF") library(stringr) library(lubridate, verbose=FALSE) library(plyr, verbose=FALSE) library(data.table, verbose=FALSE) library(LaF) loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt" download.file(loc,destfile="temp") ssn <- laf_open_csv("temp",sep="\t", column_types=c("character","integer","numeric","character","numeric","numeric"), column_names=names(df)[1:6]) object.size(ssn) nrow(ssn) # Graphs with R # install.packages("gamair") library(gamair) data(chicago) head(chicago) base=data.frame(death=chicago$death, temp_F=chicago$tmpd, o3=chicago$o3median, date=seq(as.Date("1987-01-01"), as.Date("2000-12-31"),by=1)) base$temp_C <- (base$temp_F-32)/1.8 base$year <- substring(base$date,1,4) date2season <- function(date){ m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m")) d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d")) s <- NA if(m %in% c(1,2) | ((m==12)&(d>=21)) | ((m==3)&(d<21))) s <- "winter" if(m %in% c(4,5) | ((m==3)&(d>=21)) | ((m==6)&(d<21))) s <- "spring" if(m %in% c(7,8) | ((m==6)&(d>=21)) | ((m==9)&(d<21))) s <- "summer" if(m %in% c(10,11) | ((m==9)&(d>=21)) | ((m==12)&(d<21))) s <- "autumn" return(s)} base$season <- sapply(base$date,date2season) head(base) plot(base[,c("date", "temp_C")]) plot(base[,c("date", "temp_C")],main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )"))) plot(base[,c("date", "temp_C")],main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")), ylim=c(-20,60)) plot(base[,c("date", "temp_C")],main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")), col=as.integer(as.factor(base$season)), xlim=as.Date(c("1995-01-01","1999-01-01"))) CL <- function(x) rgb(colorRamp(c("blue", "red"))(x)/255) plot(base[,c("date", "temp_C")],main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")), col=CL(rank(base$temp_C)/(length(base$temp_C)+1))) plot(base[,c("date", "temp_C")],main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")), type="l",col="grey", xlim=as.Date(c("1995-01-01","1999-01-01"))) plot(base[,c("date", "temp_C")],main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")), xlim=as.Date(c("1995-01-01","1999-01-01"))) rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "light yellow") points(base[,c("date", "temp_C")],col=as.integer(as.factor(base$season))) boxplot(base$temp_C~as.factor(base$year),main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )"))) x <- 1:14 text(x, rep(-25,length(x)), x, col="red") bp=boxplot(base$temp_C~as.factor(base$year),main="Temperature in Chicago", xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )"))) points(x,bp$stats[5,],pch=19,col="purple") abline(lm(bp$stats[5,]~x),col="purple",lwd=2) hist(base$temp_C,xlab=expression(paste("Temperature ( ", degree ~ C, " )")),ylab="",probability=TRUE, col="yellow",border="white",main="",ylim=c(0,.0375)) u=seq(-30,40,by=.25) lines(u,dnorm(u,mean(base$temp_C),sd(base$temp_C)),lty=2) lines(density(base$temp_C),col="red",lwd=2) # Graphs with ggplot2 # install.packages("ggplot2") library(ggplot2) g <- ggplot(base, aes(date, temp_C)) + geom_point() g ggsave(file="figure.pdf", scale=3) g <- g+labs(title='Temperature') g <- g+ggtitle('Temperature') g g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red")) g<-g+labs( x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="Temperature in Chicago") g g+ theme( axis.title.x = element_text(color="blue"), axis.title.y = element_text(color="blue") ) g + ylim(c(-20,60)) g <- g + xlim(as.Date(c("1995-01-01","1999-01-01"))) g base2=data.frame( today=base$temp_C[1:5113], tomorrow=base$temp_C[2:5114]) ggplot(base2, aes(today, tomorrow))+ geom_point()+ coord_equal() g <- ggplot(base, aes(date, temp_C, color=factor(season)))+ xlim(as.Date(c("1995-01-01","1999-01-01")))+ geom_point() g+theme(legend.title=element_blank())+ guides(colour = guide_legend(override.aes = list(size=4))) g<-ggplot(base, aes(date, temp_C))+ xlim(as.Date(c("1995-01-01","1999-01-01"))) g+geom_line(color="grey") g+geom_line(aes(color="temperature"))+ scale_colour_manual(name='', values=c('temperature'='grey')) ggplot(base, aes(date, temp_C, color=factor(season)))+geom_point() + scale_color_manual(values=c("dodgerblue4", "darkolivegreen4","darkorchid3", "goldenrod1")) ggplot(base, aes(date, temp_C, color=factor(season)))+ geom_point() + scale_color_brewer(palette="Set1") ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+ scale_color_gradient(low="blue", high="red") ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+ scale_color_gradient2(low="blue", mid="white", high="red") ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+ theme(panel.background = element_rect(fill = 'light yellow')) ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+ facet_wrap(~year, ncol=2, scales="free") # install.packages("ggthemes") library(ggthemes, verbose=FALSE) ggplot(base, aes(date, temp_C, color=factor(season)))+ geom_point()+ labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+ theme_excel()+scale_colour_excel()+ theme(legend.title=element_blank()) # Spatial Models # install.packages("sp") # install.packages("rworldmap") library(sp) library(rworldmap) data(countriesLow) crs.WGS84 <- CRS("+proj=longlat +datum=WGS84") countries.WGS84 <- spTransform(countriesLow, crs.WGS84) plot(countries.WGS84,col="light green") crs.laea <- CRS("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +units=m +no_defs") countries.laea <- spTransform(countriesLow, crs.laea) plot(countries.laea,col="light green") # OpenStreetMap # install.packages("OpenStreetMap") library(OpenStreetMap) map <- openmap(c(lat= 51.516, lon= -.141), c(lat= 51.511, lon= -.133)) map <- openproj(map, projection = "+init=epsg:27700") plot(map) loc="http://freakonometrics.free.fr/cholera.zip" download.file(loc,destfile="cholera.zip") unzip("cholera.zip", exdir="./") library(maptools) deaths<-readShapePoints("Cholera_Deaths") head(deaths@coords) plot(map) points(deaths@coords,col="red", pch=19, cex=.7 ) X <- deaths@coords # install.packages("KernSmooth") library(KernSmooth) kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2]))) # install.packages("grDevices") library(grDevices) clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20) plot(map) image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs) contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE) # Google Map #install.packages("ggmap") library(ggmap) get_london <- get_map(c(-.137,51.513), zoom=17, maptype = "roadmap", source="google") london <- ggmap(get_london) london # install.packages("rgdal") df_deaths <- data.frame(X) # install.packages("sp") # install.packages("rgdal") library(sp) library(rgdal) coordinates(df_deaths)=~coords.x1+coords.x2 proj4string(df_deaths)=CRS("+init=epsg:27700") df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84")) london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red") london + geom_point(aes(x=coords.x1, y=coords.x2), data=data.frame(df_deaths@coords),col="red") + geom_density2d(data = data.frame(df_deaths@coords), aes(x = coords.x1, y=coords.x2), size = 0.3) + stat_density2d(data = data.frame(df_deaths@coords), aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..), size = 0.01, geom = "polygon") + scale_fill_gradient(low = "green", high = "red", guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE) # Google Map & Leaflet require(leaflet) deaths <- readShapePoints("Cholera_Deaths") df_deaths <- data.frame(deaths@coords) coordinates(df_deaths)=~coords.x1+coords.x2 proj4string(df_deaths)=CRS("+init=epsg:27700") df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84")) df=data.frame(df_deaths@coords) lng <- df$coords.x1 lat <- df$coords.x2 m = leaflet()%>% addTiles() %>% fitBounds(-.141, 51.511, -.133, 51.516) m rd=.5 op=.8 clr="blue" m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) m X=cbind(lng,lat) kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2]))) x=kde2d$x1 y=kde2d$x2 z=kde2d$fhat CL=contourLines(x , y , z) m = leaflet() %>% addTiles() %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) m m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>% addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE) m m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>% addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>% addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>% addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red") m # Web Scrapping & Hurricanes # install.packages("XML") library(XML) year <- 2012 loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="") tabs <- readHTMLTable(htmlParse(loc)) tabs[1] storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" ")) storms index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx") nstorms <- storms[!index] nstorms TRACK=NULL for(i in length(nstorms):1){ if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN" if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA" if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN" if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL" if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA" if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN" if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE" if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD" if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH" if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"} for(i in length(nstorms):1){ loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="") track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20)) names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT") track$LAT=as.numeric(as.character(track$LAT)) track$LON=as.numeric(as.character(track$LON)) track$WIND=as.numeric(as.character(track$WIND)) track$PR=as.numeric(as.character(track$PR)) track$year=year track$name=nstorms[i] TRACK=rbind(TRACK,track)} tail(TRACK,10) loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata" download.file(loc,destfile="TRACK-ATLANTIC.Rdata") load("TRACK-ATLANTIC.Rdata") dim(TOTTRACK) # install.packages("grDevices") # install.packages("maps") library(grDevices) library(maps) map("world", col="light yellow", fill=TRUE) for(n in unique(TOTTRACK$name)){ lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))} # Shapefiles loc="http://freakonometrics.free.fr/Italy.zip" download.file(loc,destfile="Italy.zip") unzip("Italy.zip", exdir="./") library(rgdal) ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE) plot(ita1,col="light green") ita1@data[,"NAME_1"][1:6] pos<-which(ita1@data[,"NAME_1"]=="Sicily") Poly_Sicily<-ita1[pos,] plot(ita1,col="light green") plot(Poly_Sicily,col="yellow",add=TRUE) plot(ita1,col="light green") plot(Poly_Sicily,col="yellow",add=TRUE) abline(v=5:20,col="light blue") abline(h=35:50,col="light blue") axis(1) axis(2) pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily")) ita_south <- ita1[pos,] ita_north <- ita1[-pos,] plot(ita1) plot(ita_south,col="red",add=TRUE) plot(ita_north,col="blue",add=TRUE) # install.packages("xlsx") library(xlsx) loc="http://freakonometrics.free.fr/Italy_codes.xlsx" download.file(loc,destfile="Italy_codes.xlsx") data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE) names(data_codes)[1]="NAME_1" ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE) pos<-which(ita2@data[,"GROUP"] == "SOUTH") # install.packages("rgeos") library(rgeos) ita_s<-gUnionCascaded(ita_south) ita_n<-gUnionCascaded(ita_north) plot(ita1) plot(ita_s,col="red",add=TRUE) plot(ita_n,col="blue",add=TRUE) plot(ita1,col="light green") plot(Poly_Sicily,col="yellow",add=TRUE) gCentroid(Poly_Sicily,byid=TRUE) G <- as.data.frame(gCentroid(ita1,byid=TRUE)) plot(ita1,col="light green") text(G$x,G$y,1:20) c1 <- G[c(17,20,6,16,8,12,2),] c2 <- G[c(13,11,1,5,3,4),] L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1"))) L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2"))) L1@proj4string<-ita1@proj4string L2@proj4string<-ita1@proj4string cross_road <-gIntersection(L1,L2) cross_road@coords plot(ita1,col="light green") plot(L1,col="red",lwd=2,add=TRUE) plot(L2,col="blue",lwd=2,add=TRUE) plot(cross_road,pch=19,cex=1.5,add=TRUE) library(maptools) library(rgdal) # install.packages("classInt") library(classInt) #paris <- readShapeSpatial("paris-cartelec.shp") paris<-readOGR(dsn="./",layer="paris-cartelec",verbose=FALSE) proj4string(paris) <- CRS("+init=epsg:2154") paris <- spTransform(paris, CRS("+proj=longlat +ellps=GRS80")) plot(paris) length(paris) poly_paris <- SpatialPolygons2PolySet(paris) sub_poly <- poly_paris[poly_paris$PID==100,] plot(sub_poly[,c("X","Y")]) polygon(sub_poly[,c("X","Y")]) length(paris) poly_paris <- SpatialPolygons2PolySet(paris) sub_poly <- poly_paris[poly_paris$PID==100,] plot(sub_poly[,c("X","Y")]) polygon(sub_poly[,c("X","Y")]) point <- c(2.33,48.859) point.in.polygon(point[1],point[2],sub_poly[,"X"],sub_poly[,"Y"]) point_in_i=function(i,point) point.in.polygon(point[1],point[2],poly_paris[poly_paris$PID==i,"X"],poly_paris[poly_paris$PID==i,"Y"]) where_is_point=function(point) which(Vectorize(function(i) point_in_i(i,point))(1:length(paris))>0) where_is_point(point) # install.packages("RColorBrewer") library(RColorBrewer) plotclr <- brewer.pal(7,"RdYlBu") vizualize_point <- function(point){ wp <- where_is_point(point) colcode <- rep(plotclr[4],length(paris)) colcode[wp] <- plotclr[1] plot(paris,col=colcode,border="grey")} vizualize_point(c(2.4,48.87)) # install.packages("geosphere") library(ggmap,verbose=FALSE) library(geosphere) (ADS <- geocode("4 rue chauveau lagarde paris", source="google")) vizualize_point(ADS)