# Actuary Data Science
# Arthur Charpentier - @freakonometrics

# R & Packages

getOption("defaultPackages")
(.packages(all.available=TRUE))
?rq
require(quantreg)
# install.packages("quantreg")
library(quantreg, verbose=FALSE)
?rq
rq
detach(package:quantreg, unload=TRUE)
?rq

apropos("norm")

# S3 vs S4 Classes

person3 <- function(name,age,weight,height){
  characteristics<-list(name=name,age=age,weight=weight,height=height)
  class(characteristics)<-"person3"
  return(characteristics)}
JohnDoe3 <- person3(name="John",age=28, weight=76, height=182)
JohnDoe3
JohnDoe3$age

BMI3 <- function(object,...) {return(object$weight*1e4/object$height^2)}
BMI3(JohnDoe3)

reg3 <- lm(dist~speed,data=cars)
reg3$coefficients
coef(reg3)
summary(reg3)

setClass("person4", representation(name="character",
age="numeric", weight="numeric", height="numeric"), where = .GlobalEnv)
JohnDoe4 <- new("person4",name="John",age=28, weight=76, height=182)
JohnDoe4
eval(JohnDoe4,.GlobalEnv)
JohnDoe4@age

setGeneric("BMI4",function(object,separator) return(standardGeneric("BMI4")), where = .GlobalEnv)
setMethod("BMI4", "person4",
          function(object){return(object@weight*1e4/object@height^2)}, where = .GlobalEnv)
BMI4(JohnDoe4)

# install.packages("VGAM")
library(VGAM)
reg4 <- vglm(dist~speed,data=cars,family=gaussianff)
reg4@coefficients
coef(reg4)

qnorm(0.25,mean=5,sd=2)
# install.packages("distr")
# install.packages("distrEx")
library(distr, verbose=FALSE)
library(distrEx, verbose=FALSE)
X <- Norm(mean=5,sd=2)
q(X)
q(X)(0.25)
mean(X)
X1 <- Norm(mean=5,sd=2)
X2 <- Norm(mean=2,sd=1)
S <- X1+X2
q(S)(.5)
plot(S)

# R objects : Numbers

set.seed(1)
U <- runif(20)
U[1:4]
options(digits = 3)
U[1:4]
options(digits = 22)
U[1:4]
options(digits = 3)

x <- exp(1)
y <- x
x <- 2
y
class(x)

0/0
1/0
.Machine$double.xmax
2e+307<Inf
2e+308<Inf

(3/10-1/10)
(3/10-1/10)==(7/10-5/10)
(3/10-1/10)-(7/10-5/10)
all.equal((3/10-1/10),(7/10-5/10))
sqrt(2)^2 == 2
.Machine$double.eps
.Machine$double.xmin

x <- rnorm(8)
names(x) <- letters[1:8]
x
x[2:4]
x[c("b","c","d")]
min(x[x>0])
x[-(3:8)]
# x[-3:8]
x[c(TRUE,FALSE)]
x[c(TRUE,NA,FALSE,TRUE)]
x[2]
x[[2]]

for(i in 1:2){
  nom_var <- paste0("x", i)
  assign(nom_var, rpois(5,7))
}
x1
x2

x <- runif(4)
x
x
set.seed(1)
x <- runif(4)
x

# install.packages("pryr")
library(pryr)
x %<a-% runif(1)
x
x
rm("x")
x

(x_num=c(1,6,10))
(x_int=c(1L,6L,10L))
object.size(x_num)
object.size(x_int)

typeof(x_num)
typeof(x_int)
is.integer(x_num)
is.integer(x_int)
str(x_num)
str(x_int)
c(1,c(2,c(3,c(4,5))))

# R objects : Matrices

M<-1:24
dim(M)=c(6,4)
M
str(M)
colnames(M)=letters[1:4]
rownames(M)=LETTERS[10:15]
M
M["K",]
M[c("K","N"),]
M[c(2,5),]
M[c("K",5),]

# On Windows machines

rm(list=ls())
memory.size()
memory.limit()
memory.limit(size=8000)
memory.limit(size=4000)

x3 <- 1:1e3; x4 <- 1:1e4; x5 <- 1:1e5; x6 <- 1:1e6
object.size(x3)
object.size(x4)
object.size(x5)
object.size(x6)

#z1 <- matrix(0,1e+06,3) 
#z2 <- matrix(as.integer(0),1e+06,3) 
#object.size(z1)
#object.size(z2)
#z3 <- matrix(0,1e+07,30) 

mydat_r <- matrix(rnorm(3*5e6,0,2),5e6,3)
colnames(mydat_r) <- c("first","second","third")
mydat_rdf <- as.data.frame(mydat_r)
object.size(mydat_rdf)

# install.packages("biglm")
# install.packages("bigmemory")
library(biglm, verbose=FALSE)
z1 <- matrix(as.integer(5), 300, 400)
object.size(z1)
library(bigmemory, verbose=FALSE)
z2 <- big.matrix(300, 400, type="integer", init=5)
object.size(z2)
rm(list=ls())

# R objects : Factors

x <- c("A", "A", "B", "B", "C")
model.matrix(~0+x)

n <- 10; nn <- 100
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
tapply(x,group,mean)

xg <- split(x, group)
sapply(xg, length)
sapply(xg, mean)

# install.packages("pbapply")
library(pbapply)
n <- 10; nn <- 1e6
group <- factor(round(n * runif(n * nn)))
x <- rnorm(n * nn) + sqrt(as.numeric(group))
xg <- split(x, group)
pbsapply(xg, mean)

x <- factor(c("b","a","b"))
levels(x)
x[3] = "c"
x

factor(x,levels=c("b","a"))
x[1]
x[1,drop=TRUE]

U <- runif(20)
cut(U,breaks=2)
cut(U,breaks=2,labels=c("small","large"))

cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large"))
table(cut(U,breaks=c(0,.3,.8,1),labels=c("small","medium","large")))

# R objects : Characters

"Be carefull of 'quotes'"
'Be carefull of "quotes"'

cities <- c("New York, NY", "Los Angeles, CA", "Boston, MA")
substr(cities, nchar(cities)-1, nchar(cities))
unlist(strsplit(cities, ", "))[seq(2,6,by=2)]

# install.packages("stringr")
library(stringr)
tweet = "Emerging #climate change, environment and sustainability concerns for #actuaries Apr 9 #Toronto. Register TODAY http://bit.ly/CIAClimateForum"
hash="#[a-zA-Z]{1,}"
str_extract(tweet,hash)
str_extract_all(tweet,hash)

str_locate_all(tweet,hash)
urls="http://([\\da-z\\.-]+)\\.([a-z\\.]{2,6})/[a-zA-Z0-9]{1,}"
str_extract_all(tweet,urls)

email="^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$"
grep(pattern = email, x = "charpentier.arthur@uqam.ca")
grepl(pattern = email, x = "charpentier.arthur@uqam.ca")
str_detect(pattern = email, string=c("charpentier.arthur@uqam.ca","@freakonometrics"))

ex_sentence = "This is 1 simple sentence, just to play with, then we'll play with 4, and that will be more difficult"
# install.packages("tm")
library(tm)
ex_corpus <- Corpus(VectorSource(ex_sentence))
ex_corpus
inspect(ex_corpus)

grep("hard", ex_sentence)
grep("difficult", ex_sentence)

# install.packages("stringr")
library(stringr)
word(ex_sentence,4)
word(ex_sentence,1:20)
ex_words <- strsplit(ex_sentence, split=" ")[[1]]
ex_words

grep(pattern="w",ex_words,value=TRUE)
str_count(ex_words,"w")
grep(pattern="[ai]",ex_words,value=TRUE)
grep(pattern="[[:punct:]]",ex_words,value=TRUE)

require(wordcloud)
wordcloud(ex_corpus)

cols<-colorRampPalette(c("lightgrey","blue"))(40)
wordcloud(words = ex_corpus, max.words = 40, random.order=FALSE, scale = c(5, 0.5), colors=cols)

tdm <- TermDocumentMatrix(ex_corpus)
tdm
inspect(tdm)
inspect(ex_corpus)

fix_contractions <- function(doc) {
  doc <- gsub("won't", "will not", doc)
  doc <- gsub("n't", " not", doc)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'s", "", doc) 
  return(doc)
}
ex_corpus <- tm_map(ex_corpus, fix_contractions)
inspect(ex_corpus)

ex_corpus <- tm_map(ex_corpus, tolower)
inspect(ex_corpus)

ex_corpus <- tm_map(ex_corpus, removeNumbers)
inspect(ex_corpus)

ex_corpus <- tm_map(ex_corpus,gsub, pattern= "[[:punct:]]", replacement = " ")
inspect(ex_corpus)

stopwords("en")[sample(1:length(stopwords("en")),size=10)]
inspect(ex_corpus)

# install.packages("SnowballC")
library(SnowballC)
ex_corpus <- tm_map(ex_corpus, stemDocument)
inspect(ex_corpus)

wordcloud(ex_corpus[[1]])
wordcloud(words = ex_corpus[[1]], max.words = 40, random.order=FALSE, scale = c(5, 0.5),colors=cols)

# R objects : Dates

some.dates <- as.Date(c("16/10/12","19/11/12"),format="%d/%m/%y")
diff(some.dates)
sequence.date <- seq(from=some.dates[1],to=some.dates[2],by=7)
format(sequence.date,"%b")
months(sequence.date)
weekdays(some.dates)

substr(as.POSIXct(some.dates), 1, 4)
strftime(some.dates,"%Y")

Sys.setlocale("LC_TIME", "fr_FR")
months(sequence.date)

# R objects : Lists

x <- list(1:5,c(1,2,3,4,5),a="test",
b=c(TRUE,FALSE),rpois(5,8))
x
str(x)
x <- list(
  list(1:5,c(1,2,3,4,5)),a="test",
  b=c(TRUE,FALSE),rpois(5,8))
x
names(x)
str(x)
is.list(x)
is.recursive(x)
x[[3]]
x[["b"]]
x[[1]]
x[[1]][[1]]

f <- function(x) { return(x*(1-x)) }
optimize(f, interval=c(0, 1), maximum=TRUE)

list(abc=1)$a
list(abc=1,a=2)$a
list(bac=1)$a
list(abc=1,b=2)$a

# R objects : Functions

factorial
gamma
?gamma

# If/Else Condition

set.seed(1)
u <- runif(1)
if(u>.5) {("greater than 50%")} else {("smaller than 50%")}
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
u

u <- runif(3)
if(u>.5) {print("greater than 50%")} else {("smaller than 50%")}
ifelse(u>.5,("greater than 50%"),("smaller than 50%"))
u                

# For Loops

v_x <- runif(1e5)
sqrt_x <- NULL
for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x))

sqrt_x <- NULL
system.time(for(x in v_x) sqrt_x <- c(sqrt_x,sqrt(x)))

sqrt_x <- numeric(length(v_x))
for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i])

sqrt_x <- numeric(length(v_x))
system.time(for(i in seq_along(v_x)) sqrt_x[i] <- sqrt(v_x[i]))

# Apply

system.time(Vectorize(sqrt)(v_x))
sqrt_x <- sapply(v_x,sqrt)
sqrt_x <- unlist(lapply(v_x,sqrt))
system.time(unlist(lapply(v_x,sqrt)))

# Parallel Apply

# install.packages("parallel")
library(parallel)
(allcores <- detectCores(all.tests=TRUE))
sqrt_x <- unlist(mclapply(v_x,sqrt,mc.cores=4))
system.time(unlist(mclapply(v_x,sqrt,mc.cores=4)))

# Functions

f<-function(x) dlnorm(x)
integrate(f,0,Inf)
integrate(f,0,1e5)
integrate(f,0,1e3)$value+integrate(f,1e3,1e5)$value

# Compound Poisson

rN.Poisson <- function(n) rpois(n,5)
rX.Exponential <- function(n) rexp(n,2)

rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for(i in 1:n){
    N <- rN(1)
    if(N>0){V[i] <- sum(rX(N))}
  }
  return(V)}

rcpd1 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  V <- rep(0,n)
  for (i in 1:n) V[i] <- sum(rX(rN(1)))
  return(V)}

rcpd2 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  return(as.numeric(xtabs(X ~ I)))}

rcpd3 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  N <- rN(n)
  X <- rX(sum(N))
  I <- factor(rep(1:n,N),levels=1:n)
  V <- tapply(X,I,sum)
  V[is.na(V)] <- 0
  return(as.numeric(V))}

rcpd4 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(rN(n), function(x) sum(rX(x))))}

rcpd5 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(sapply(Vectorize(rX)(rN(n)),sum))}

rcpd6 <- function(n,rN=rN.Poisson,rX=rX.Exponential){
  return(unlist(lapply(lapply(t(rN(n)),rX),sum)))}

n <- 100
# install.packages("microbenchmark")
library(microbenchmark)
options(digits=1)
microbenchmark(rcpd1(n),rcpd2(n),rcpd3(n),rcpd4(n),rcpd5(n),rcpd6(n))

# Vectorization

f <- function(x,m=0,s=1){
  H<-function(t) 1-pnorm(t,m,s)
  integral<-integrate(H,lower=x,upper=Inf)$value
  res<-H(x)/integral
  return(res)
}

f(0:1) 

y <- rep(NA,2)
x <- 0:1
for(i in 1:2) y[i] <- f(x[i])
y
sapply(x,"f")
Vectorize(f)(x)

binorm <- function(x1,x2,r=0){
  exp(-(x1^2+x2^2-2*r*x1*x2)/(2*(1-r^2)))/(2*pi*sqrt(1-r^2))
}
u <- seq(-2,2)
binorm(u,u)
outer(u,u,binorm)
(uv<-expand.grid(u,u))
matrix(binorm(uv$Var1,uv$Var2),5,5)

u <- seq(-2,2,length=31)
persp(u,u,outer(u,u,binorm))

# Computational Efficiency

library(microbenchmark)
n <- 1000
A<-matrix(seq(1,n^2),n,n)
B<-matrix(seq(1,n^2),n,n)
C<-1:n
microbenchmark((A%*%B)%*%C,A%*%(B%*%C),replications=1)

m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
group <- rep(c("A","B"), each = n / 2)

system.time(for(i in 1:m) t.test(X[i, ] ~ group)$stat)
system.time(for(i in 1:m) t.test(X[i, group == "A"], X[i, group == "B"])$stat)

t.test_1 <- function(x, grp){
  t.test(x[ group == "A"], x[ group == "B"])$stat
}
system.time(t1 <- apply(X, 1, t.test_1, grp = grp))

t.test_2 <- function(x, grp) {
  t_stat <- function(x) {
    m <- mean(x)
    n <- length(x)
    var <- sum((x - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(x[grp == "A"])
  g2 <- t_stat(x[grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time(apply(X, 1, t.test_2, grp = group))

t.test_3 <- function(X, grp){
  t_stat <- function(X) {
    m <- rowMeans(X)
    n <- ncol(X)
    var <- rowSums((X - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }
  g1 <- t_stat(X[, grp == "A"])
  g2 <- t_stat(X[, grp == "B"])
  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
system.time( t.test_3(X, group))

"%pm%" <- function(x,s) x + c(qnorm(.05),qnorm(.95))*s
100 %pm% 10

'second<-' <- function(x, value) {
  x[2] <- value
  return(x)
}
x <- 1:10
second(x) <- 5
x

# Memory Issues

library(pryr)
x <- 1:10
address(x)
second(x) <- 6
address(x)

ls(globalenv())
find("x")
find("pi")
environment(sd)
e <- new.env()
e$d <- 1
e$f <- 1:5
e$g <- 1:5
ls(e)
str(e)
identical(globalenv(),e)
search()

# Memory Issues

f1 <- function(x1){
  f2 <- function(x2){
    f3 <- function(x3){
      x1+x2+x3
    }
    f3(3)
  } 
  f2(2)
}
f1(1)

f <- function(x) log(x)
f("x")
try(f("x"))
inherits(try(f("x")), "try-error")
inherits(try(f("x"), silent=TRUE), "try-error")
a=0
try(a<-f("x"), silent=TRUE)
a
try(a<-f(x), silent=TRUE)
a

x=1:10
g=function(f) f(x)
g(mean)

fibonacci <- function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
}
fibonacci(20)
system.time(fibonacci(30))

# install.packages("memoise")
library(memoise)
fibonacci <- memoise(function(n){
  if(n<2) return(1)
  fibonacci(n-2)+fibonacci(n-1)
})
system.time(fibonacci(30))

f <- function(x) x^2
f
formals(f)
body(f)

set.seed(1)
x<-rexp(6)
sum(x)
.Primitive("sum")(x)

# install.packages("Rcpp")
library(Rcpp)
cppFunction('double sum_C(NumericVector x) {
            int n = x.size();
            double total = 0;
            for(int i = 0; i < n; ++i) {
            total += x[i];
            }
            return total;
            }')
sum_C(x)

# install.packages("rPython")
library(rPython)
sum_P=function(z) python.call("sum",z)
sum_P(x)

library(microbenchmark)
x <- runif(1e6)
microbenchmark(
  sum(x),
  .Primitive("sum")(x),
  sum_C(x))

x<-10
f=function() x<-5
f
f()
x
f=function(){
  x<-5
  return(x)
}
f()
x

f=function(){
  x<<-5
  return(x)
}
f()
x
f=function(){
  z<-5
  return(z)
}
f()
z

f=function(x){
  z<-x+5
  return(z)
}
f(2)
z

x=function(y) y/2
x
x <- 5
x(x)

x=function(y) y/2
y=function(){
  x <- 10
  x(x)
}
y()

names_list <- function(...){
  names(list(...))
}
names_list(a=5,b=7)

power <- function(exponent) {
  function(x) {
    x ^ exponent
  }
}
square <- power(2)
square(4)
cube <- power(3)
cube(4)
cube

total <- 20
pb <- txtProgressBar(min = 0, max = total, style = 3)
for(i in 1:(10*total)){
  Sys.sleep(0.01)
  if(i %%10 == 0) setTxtProgressBar(pb, i%/%10)
}
close(pb)

# R objects : Data Frames

df <- data.frame(x=1:3,y=letters[1:3])
str(df)
typeof(df)
class(df)

cbind(df,z=9:7)
df$z<-5:3
df

df[1]
df[,1,drop=FALSE]
df[,1,drop=TRUE]
df[[1]]
df$x
df[,"x"]
df[["x"]]

set.seed(1)
df[sample(nrow(df)),]
set.seed(1)
df[sample(nrow(df),nrow(df)*2,replace=TRUE),]
subset(df, x>1)

set.seed(123)
df <- data.frame(Y=rnorm(50), X1=as.factor(sample(LETTERS[1:4],size=50,
replace=TRUE)), X2=as.factor(sample(1:3,size=50,replace=TRUE)))
tail(df)

reg <- lm(Y~X1+X2,data=df)
model.matrix(reg)[45:50,]
reg <- lm(Y~(X1=="A")+X2,data=df)
model.matrix(reg)[45:50,]

reg <- lm(Y~X1*X2,data=df)
model.matrix(reg)[45:50,]

reg <- lm(Y~X1+X2%in%X1,data=df)
model.matrix(reg)[45:50,]

download.file("http://freakonometrics.free.fr/superheroes.RData","superheroes.RData")
load("superheroes.RData")
superheroes
publishers

# install.packages("dplyr")
library(dplyr, verbose=FALSE)
inner_join(superheroes, publishers)
inner_join(publishers, superheroes)
merge(superheroes, publishers, all = TRUE)

semi_join(superheroes, publishers)
semi_join(publishers, superheroes)
left_join(superheroes, publishers)
left_join(publishers, superheroes)

anti_join(superheroes, publishers)
anti_join(publishers, superheroes)

download.file("http://freakonometrics.free.fr/gapminderDataFiveYear.txt","gapminderDataFiveYear.txt")
gdf <- read.delim("gapminderDataFiveYear.txt")
head(gdf,4)
str(gdf)

library(dplyr)
gtbl <- tbl_df(gdf)
# install.packages("data.table")
library(data.table)
gdfdt <- data.table(gdf)

subset(gdf, lifeExp < 30)
gdf[lifeExp < 30,]
gdf[gdf$lifeExp < 30,]

filter(gtbl, lifeExp < 30)
subset(gdfdt, lifeExp < 30)
gdfdt[lifeExp < 30,]

gdf[gdf$country == "Italy", c("year", "lifeExp")]
gtbl %>% 
  filter(country == "Italy") %>%
  select(year, lifeExp)
gdfdt[country == "Italy",cbind(year, lifeExp)]
setkey(gdfdt,country)
gdfdt["Italy",cbind(year, lifeExp)]

rm(list=ls())
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
df <- read.delim("temp")
tail(df,4)

str(df)
sum(duplicated(df))
df$lifeExp[1:10]
df[4:5,]

library(dplyr)
ldf <- tbl_df(df)
glimpse(ldf)
select(ldf,lifeExp)
select(ldf,lifeExp)$lifeExp[1:10]
#library(modify)
#s_select(ldf,"lifeExp")
ldf[4:5,]
slice(ldf,4:5)
filter(ldf,lifeExp<30 | gdpPercap<300)
rename(ldf,population = pop)

library(data.table)
dt <- data.table(df)
is.data.frame(dt)
?fread
tables()

object.size(df)
object.size(ldf)
object.size(dt)

df[df$lifeExp<30,]
subset(df, lifeExp < 30)
filter(ldf, lifeExp < 30)

subset(dt, lifeExp < 30)
dt[lifeExp<30]
setkey(dt, country)
dt["Italy"]

dt["Italy",mult="first"]
dt["Italy",max(lifeExp)]
max(df$lifeExp[df$country=="Italy"])

setkey(dt, country, lifeExp) 
dt["Italy"]
dt[.("Italy",77.44)]
dt[.("Italy",77)]
dt[.("Italy",77),roll="nearest"]

df[df$country == "Italy", c("year", "lifeExp")]
ldf %>%
  filter(country == "Italy") %>%
  select(year, lifeExp)

small_df <- df[df$country %in% c("France","Italy","Spain"), c("country","year", "lifeExp")]
# install.packages("tidyr")
library(tidyr)
str(small_df)
table_df=spread(small_df,"country","lifeExp")
table_df
data_df=gather(table_df,"country","lifeExp",2:4)
data_df

# install.packages("reshape2")
library(reshape2)
dcast(small_df, formula = country~year,
      value.var="lifeExp")
table_df= dcast(small_df, formula = year~country,
                value.var="lifeExp")
table_df
melt(table_df, id.vars=c("year"),
     value.name = "lifeExp",
     variable.name="country")

aggregate(small_df$lifeExp,FUN=max,by=list(small_df$country))
tapply(small_df$lifeExp,factor(small_df$country),max)
small_ldf = filter(ldf,country %in% c("France","Italy","Spain"))
group_by(small_ldf,country) %>% summarize(max(lifeExp))
filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp))
filter(ldf,country %in% c("France","Italy","Spain")) %>%
  group_by(country) %>% 
  summarize(max(lifeExp)) 

plot(df$gdpPercap,df$lifeExp)

select(ldf,gdpPercap,lifeExp) %>% plot()

setkey(dt, country)
dt[c("France","Italy","Spain"),max(lifeExp),by="country"]
head(df[with(df,order(-lifeExp)),])
head(dt[order(-lifeExp)])
head(setorder(dt,-lifeExp))
arrange(ldf,-lifeExp)

df$gdp=df$pop*df$gdpPercap
df = within(df,gdp = pop*gdpPercap)
df = within(df,gdp <- pop*gdpPercap)
dt[,gdp:=pop*gdpPercap]
head(dt)
mutate(ldf,gdp=pop*gdpPercap)

countries=read.csv2("http://freakonometrics.free.fr/countries.csv",header=TRUE)
countries = countries[-1,]
head(countries)
df2 = countries[,c("Country","Capital")]
names(df2)=c("countries","capital")

df1 = aggregate(df$lifeExp,FUN=max,by=list(df$country))
names(df1)=c("countries","lifeExp")
dim(df1)
dim(df2)
tail(merge(df1,df2))
tail(merge(df1,df2,all.x=TRUE))
tail(merge(df1,df2,all.y=TRUE))

ldf1 = group_by(ldf, country) %>% summarize(max(lifeExp)) 
names(ldf1)=c("countries","lifeExp")
ldf2 <- tbl_df(df2)
inner_join(ldf1,ldf2)
left_join(ldf1,ldf2)
ldf1 %>% inner_join(ldf2)

dt1 = dt[,max(lifeExp),by="country"]
setnames(dt1,c("countries","lifeExp"))
dt2 <- data.table(df2)
setkey(dt1,"countries")
setkey(dt2,"countries")
dt1[dt2,nomatch=0]
dt1[dt2]
dt2[dt1]

# install.packages("stringr")
# install.packages("lubridate")
# install.packages("plyr")
# install.packages("LaF")
library(stringr)
library(lubridate, verbose=FALSE)
library(plyr, verbose=FALSE)
library(data.table, verbose=FALSE)
library(LaF)
loc="http://freakonometrics.free.fr/gapminderDataFiveYear.txt"
download.file(loc,destfile="temp")
ssn <- laf_open_csv("temp",sep="\t", 
                    column_types=c("character","integer","numeric","character","numeric","numeric"),
                    column_names=names(df)[1:6])
object.size(ssn)
nrow(ssn)

# Graphs with R

# install.packages("gamair")
library(gamair)
data(chicago)
head(chicago)
base=data.frame(death=chicago$death,
                temp_F=chicago$tmpd,
                o3=chicago$o3median,
                date=seq(as.Date("1987-01-01"),
                         as.Date("2000-12-31"),by=1))
base$temp_C <- (base$temp_F-32)/1.8
base$year <- substring(base$date,1,4)

date2season <- function(date){
  m <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%m"))
  d <- as.numeric(format(as.Date(date, format = "%d/%m/%Y"), "%d"))
  s <- NA
  if(m %in% c(1,2) | ((m==12)&(d>=21)) |  ((m==3)&(d<21))) s <- "winter"
  if(m %in% c(4,5) | ((m==3)&(d>=21)) |  ((m==6)&(d<21))) s <- "spring"
  if(m %in% c(7,8) | ((m==6)&(d>=21)) |  ((m==9)&(d<21))) s <- "summer"
  if(m %in% c(10,11) | ((m==9)&(d>=21)) |  ((m==12)&(d<21))) s <- "autumn"
  return(s)}
base$season <- sapply(base$date,date2season)
head(base)

plot(base[,c("date", "temp_C")])

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")))

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     ylim=c(-20,60))

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     col=as.integer(as.factor(base$season)),
     xlim=as.Date(c("1995-01-01","1999-01-01")))

CL <- function(x) rgb(colorRamp(c("blue", "red"))(x)/255)
plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     col=CL(rank(base$temp_C)/(length(base$temp_C)+1)))

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
  xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
  type="l",col="grey",
  xlim=as.Date(c("1995-01-01","1999-01-01")))

plot(base[,c("date", "temp_C")],main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")),
     xlim=as.Date(c("1995-01-01","1999-01-01")))
rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "light yellow")
points(base[,c("date", "temp_C")],col=as.integer(as.factor(base$season)))

boxplot(base$temp_C~as.factor(base$year),main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")))
x <- 1:14
text(x, rep(-25,length(x)), x, col="red")

bp=boxplot(base$temp_C~as.factor(base$year),main="Temperature in Chicago",
     xlab="Date", ylab=expression(paste("Temperature ( ", degree ~ C, " )")))
points(x,bp$stats[5,],pch=19,col="purple")
abline(lm(bp$stats[5,]~x),col="purple",lwd=2)

hist(base$temp_C,xlab=expression(paste("Temperature ( ", degree ~ C, " )")),ylab="",probability=TRUE,
     col="yellow",border="white",main="",ylim=c(0,.0375))
u=seq(-30,40,by=.25)
lines(u,dnorm(u,mean(base$temp_C),sd(base$temp_C)),lty=2)
lines(density(base$temp_C),col="red",lwd=2)

# Graphs with ggplot2

# install.packages("ggplot2")
library(ggplot2)
g <- ggplot(base, aes(date, temp_C)) + geom_point()
g
ggsave(file="figure.pdf", scale=3)

g <- g+labs(title='Temperature')
g <- g+ggtitle('Temperature')
g

g+theme(plot.title = element_text(size=20, face="bold", vjust=2, color="red"))

g<-g+labs(
  x="Date", 
  y=expression(paste("Temperature ( ", degree ~ C, " )")), 
  title="Temperature in Chicago")
g

g+ theme(
  axis.title.x = element_text(color="blue"),
  axis.title.y = element_text(color="blue")   
)

g + ylim(c(-20,60))
g <- g + xlim(as.Date(c("1995-01-01","1999-01-01")))
g

base2=data.frame(
  today=base$temp_C[1:5113],
  tomorrow=base$temp_C[2:5114])
ggplot(base2, aes(today, tomorrow))+     geom_point()+ coord_equal()

g <- ggplot(base, aes(date, temp_C, color=factor(season)))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))+ geom_point()
g+theme(legend.title=element_blank())+
  guides(colour = guide_legend(override.aes = list(size=4)))

g<-ggplot(base, aes(date, temp_C))+
  xlim(as.Date(c("1995-01-01","1999-01-01")))
g+geom_line(color="grey")

g+geom_line(aes(color="temperature"))+
  scale_colour_manual(name='', values=c('temperature'='grey'))

ggplot(base, aes(date, temp_C, color=factor(season)))+geom_point() + 
  scale_color_manual(values=c("dodgerblue4", "darkolivegreen4","darkorchid3", "goldenrod1"))

ggplot(base, aes(date, temp_C, color=factor(season)))+
  geom_point() + scale_color_brewer(palette="Set1")

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient(low="blue", high="red")

ggplot(base, aes(date, temp_C, color=temp_C))+geom_point()+
  scale_color_gradient2(low="blue", mid="white", high="red")

ggplot(base, aes(date, temp_C))+geom_point(color="firebrick")+
  theme(panel.background = element_rect(fill = 'light yellow'))

ggplot(base, aes(date,temp_C))+geom_point(color="firebrick")+
  facet_wrap(~year, ncol=2, scales="free")

# install.packages("ggthemes")
library(ggthemes, verbose=FALSE)
ggplot(base, aes(date, temp_C, color=factor(season)))+  geom_point()+
  labs(x="Date", y=expression(paste("Temperature ( ", degree ~ C, " )")), title="This plot looks like MS Excel")+
  theme_excel()+scale_colour_excel()+ theme(legend.title=element_blank())

# Spatial Models

# install.packages("sp")
# install.packages("rworldmap")
library(sp)
library(rworldmap)
data(countriesLow)
crs.WGS84 <- CRS("+proj=longlat +datum=WGS84")
countries.WGS84 <- spTransform(countriesLow, crs.WGS84)
plot(countries.WGS84,col="light green")

crs.laea <- CRS("+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +units=m +no_defs") 
countries.laea <- spTransform(countriesLow, crs.laea) 
plot(countries.laea,col="light green")

# OpenStreetMap

# install.packages("OpenStreetMap")
library(OpenStreetMap)
map <- openmap(c(lat= 51.516,   lon= -.141),
               c(lat= 51.511,   lon= -.133))
map <- openproj(map, projection = "+init=epsg:27700") 
plot(map)

loc="http://freakonometrics.free.fr/cholera.zip"
download.file(loc,destfile="cholera.zip")
unzip("cholera.zip", exdir="./")
library(maptools)
deaths<-readShapePoints("Cholera_Deaths")
head(deaths@coords)

plot(map)
points(deaths@coords,col="red", pch=19, cex=.7 )

X <- deaths@coords
# install.packages("KernSmooth")
library(KernSmooth)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
# install.packages("grDevices")
library(grDevices)
clrs <- colorRampPalette(c(rgb(0,0,1,0), rgb(0,0,1,1)), alpha = TRUE)(20)

plot(map)
image(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE,col=clrs)
contour(x=kde2d$x1, y=kde2d$x2,z=kde2d$fhat, add=TRUE)

# Google Map

#install.packages("ggmap")
library(ggmap)
get_london <- get_map(c(-.137,51.513), zoom=17, maptype = "roadmap", source="google")
london <- ggmap(get_london)
london

# install.packages("rgdal")
df_deaths <- data.frame(X)
# install.packages("sp")
# install.packages("rgdal")
library(sp)
library(rgdal)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths = spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
london + geom_point(aes(x=coords.x1,y=coords.x2),data=data.frame(df_deaths@coords),col="red")

london + geom_point(aes(x=coords.x1, y=coords.x2), 
data=data.frame(df_deaths@coords),col="red") +
  geom_density2d(data = data.frame(df_deaths@coords), 
aes(x = coords.x1, y=coords.x2), size = 0.3) + 
  stat_density2d(data = data.frame(df_deaths@coords), 
 aes(x = coords.x1, y=coords.x2,fill = ..level.., alpha = ..level..),
 size = 0.01, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
 guide = FALSE) + scale_alpha(range = c(0, 0.3), guide = FALSE)

# Google Map & Leaflet

require(leaflet)
deaths <- readShapePoints("Cholera_Deaths")
df_deaths <- data.frame(deaths@coords)
coordinates(df_deaths)=~coords.x1+coords.x2
proj4string(df_deaths)=CRS("+init=epsg:27700") 
df_deaths=spTransform(df_deaths,CRS("+proj=longlat +datum=WGS84"))
df=data.frame(df_deaths@coords)
lng <- df$coords.x1
lat <- df$coords.x2


m = leaflet()%>% addTiles()  %>% fitBounds(-.141,  51.511, -.133, 51.516)
m

rd=.5
op=.8
clr="blue"
m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr)
m

X=cbind(lng,lat)
kde2d <- bkde2D(X, bandwidth=c(bw.ucv(X[,1]),bw.ucv(X[,2])))
x=kde2d$x1
y=kde2d$x2
z=kde2d$fhat
CL=contourLines(x , y , z)
m = leaflet() %>% addTiles()  %>% addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m

m = leaflet() %>% addTiles()  %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE)
m = leaflet() %>% addTiles()  %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolygons(CL[[1]]$x,CL[[1]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[3]]$x,CL[[3]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[5]]$x,CL[[5]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[7]]$x,CL[[7]]$y,fillColor = "red", stroke = FALSE) %>%
  addPolygons(CL[[9]]$x,CL[[9]]$y,fillColor = "red", stroke = FALSE)
m

m = leaflet() %>% addTiles() %>% addCircles(lng,lat, radius = rd,opacity=op,col=clr) %>%
  addPolylines(CL[[1]]$x,CL[[1]]$y,color = "red") %>%
  addPolylines(CL[[5]]$x,CL[[5]]$y,color = "red") %>%
  addPolylines(CL[[8]]$x,CL[[8]]$y,color = "red")
m

# Web Scrapping & Hurricanes

# install.packages("XML")
library(XML)
year <- 2012
loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/index.php",sep="")
tabs <- readHTMLTable(htmlParse(loc)) 
tabs[1]
storms <- unlist(strsplit(as.character(tabs[[1]]$Name),split=" "))
storms

index <- storms%in%c("Tropical", "Storm", paste("Hurricane-",1:6,sep=""), "Depression", "Subtropical", "Extratropical", "Low", paste("Storm-",1:6,sep=""), "Xxx")
nstorms  <- storms[!index]
nstorms
TRACK=NULL

for(i in length(nstorms):1){
  if((nstorms[i]=="SIXTEE")&(year==2008)) nstorms[i] <- "SIXTEEN"
  if((nstorms[i]=="LAUR")&(year==2008)) nstorms[i] <- "LAURA"
  if((nstorms[i]=="FIFTEE")&(year==2007)) nstorms[i] <- "FIFTEEN"
  if((nstorms[i]=="CHANTA")&(year==2007)) nstorms[i] <- "CHANTAL"
  if((nstorms[i]=="ANDR")&(year==2007)) nstorms[i] <- "ANDREA"
  if((nstorms[i]=="NINETE")&(year==2005)) nstorms[i] <- "NINETEEN"
  if((nstorms[i]=="JOSEPH")&(year==2002)) nstorms[i] <- "JOSEPHINE"
  if((nstorms[i]=="FLOY")&(year==1993)) nstorms[i] <- "FLOYD"
  if((nstorms[i]=="KEIT")&(year==1988)) nstorms[i] <- "KEITH"
  if((nstorms[i]=="CHARLI")&(year==1972)) nstorms[i] <- "CHARLIE"}

for(i in length(nstorms):1){
  loc <- paste("http://weather.unisys.com/hurricane/atlantic/",year,"/",nstorms[i],"/track.dat",sep="")
  track <- read.fwf(loc,skip=3,widths = c(4,6,8,12,4,6,20))
  names(track)=c("ADV","LAT","LON","TIME","WIND","PR","STAT")
  track$LAT=as.numeric(as.character(track$LAT))
  track$LON=as.numeric(as.character(track$LON))
  track$WIND=as.numeric(as.character(track$WIND))
  track$PR=as.numeric(as.character(track$PR))
  track$year=year
  track$name=nstorms[i]
  TRACK=rbind(TRACK,track)}
tail(TRACK,10)

loc="http://freakonometrics.free.fr/TRACK-ATLANTIC.Rdata"
download.file(loc,destfile="TRACK-ATLANTIC.Rdata")
load("TRACK-ATLANTIC.Rdata")
dim(TOTTRACK)

# install.packages("grDevices")
# install.packages("maps")
library(grDevices)
library(maps)
map("world", col="light yellow", fill=TRUE)
for(n in unique(TOTTRACK$name)){
  lines(TOTTRACK$LON[TOTTRACK$name==n],TOTTRACK$LAT[TOTTRACK$name==n],lwd=.5,col=rgb(1, 0, 0, alpha=.5))}

# Shapefiles

loc="http://freakonometrics.free.fr/Italy.zip"
download.file(loc,destfile="Italy.zip")
unzip("Italy.zip", exdir="./")
library(rgdal)
ita1<-readOGR(dsn="./Italy/",layer="ITA_adm1",verbose=FALSE)
plot(ita1,col="light green")

ita1@data[,"NAME_1"][1:6]
pos<-which(ita1@data[,"NAME_1"]=="Sicily")
Poly_Sicily<-ita1[pos,]
plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
abline(v=5:20,col="light blue")
abline(h=35:50,col="light blue")
axis(1)
axis(2)

pos<-which(ita1@data[,"NAME_1"] %in% c("Abruzzo","Apulia","Basilicata","Calabria","Campania","Lazio","Molise","Sardegna","Sicily"))
ita_south <- ita1[pos,]
ita_north <- ita1[-pos,]
plot(ita1)
plot(ita_south,col="red",add=TRUE)
plot(ita_north,col="blue",add=TRUE)

# install.packages("xlsx")
library(xlsx)
loc="http://freakonometrics.free.fr/Italy_codes.xlsx"
download.file(loc,destfile="Italy_codes.xlsx")
data_codes<-read.xlsx2(file = "Italy_codes.xlsx",sheetName="ITALY", startRow=1, header=TRUE)
names(data_codes)[1]="NAME_1"
ita2<-merge(ita1, data_codes, by = "NAME_1", all.x=TRUE)
pos<-which(ita2@data[,"GROUP"] == "SOUTH")

# install.packages("rgeos")
library(rgeos)
ita_s<-gUnionCascaded(ita_south)
ita_n<-gUnionCascaded(ita_north)
plot(ita1)
plot(ita_s,col="red",add=TRUE)
plot(ita_n,col="blue",add=TRUE)

plot(ita1,col="light green")
plot(Poly_Sicily,col="yellow",add=TRUE)
gCentroid(Poly_Sicily,byid=TRUE)

G <- as.data.frame(gCentroid(ita1,byid=TRUE))
plot(ita1,col="light green")
text(G$x,G$y,1:20)

c1 <- G[c(17,20,6,16,8,12,2),]
c2 <- G[c(13,11,1,5,3,4),]
L1<-SpatialLines(list(Lines(list(Line(c1)),"Line1")))
L2<-SpatialLines(list(Lines(list(Line(c2)),"Line2")))
L1@proj4string<-ita1@proj4string
L2@proj4string<-ita1@proj4string
cross_road <-gIntersection(L1,L2)
cross_road@coords

plot(ita1,col="light green")
plot(L1,col="red",lwd=2,add=TRUE)
plot(L2,col="blue",lwd=2,add=TRUE)
plot(cross_road,pch=19,cex=1.5,add=TRUE)

library(maptools)
library(rgdal)
# install.packages("classInt")
library(classInt)
#paris <- readShapeSpatial("paris-cartelec.shp")

paris<-readOGR(dsn="./",layer="paris-cartelec",verbose=FALSE)
proj4string(paris) <- CRS("+init=epsg:2154")
paris <- spTransform(paris, CRS("+proj=longlat +ellps=GRS80"))
plot(paris)

length(paris)
poly_paris <- SpatialPolygons2PolySet(paris)
sub_poly <- poly_paris[poly_paris$PID==100,]
plot(sub_poly[,c("X","Y")])
polygon(sub_poly[,c("X","Y")])

length(paris)
poly_paris <- SpatialPolygons2PolySet(paris)
sub_poly <- poly_paris[poly_paris$PID==100,]
plot(sub_poly[,c("X","Y")])
polygon(sub_poly[,c("X","Y")])

point <- c(2.33,48.859)
point.in.polygon(point[1],point[2],sub_poly[,"X"],sub_poly[,"Y"])
point_in_i=function(i,point) point.in.polygon(point[1],point[2],poly_paris[poly_paris$PID==i,"X"],poly_paris[poly_paris$PID==i,"Y"])
where_is_point=function(point) which(Vectorize(function(i) point_in_i(i,point))(1:length(paris))>0)
where_is_point(point)

# install.packages("RColorBrewer")
library(RColorBrewer)
plotclr <- brewer.pal(7,"RdYlBu")
vizualize_point <- function(point){
 wp <- where_is_point(point)
 colcode <- rep(plotclr[4],length(paris))
 colcode[wp] <- plotclr[1]
 plot(paris,col=colcode,border="grey")}
 vizualize_point(c(2.4,48.87))

# install.packages("geosphere")
library(ggmap,verbose=FALSE)
library(geosphere)
(ADS <- geocode("4 rue chauveau lagarde paris", source="google"))
vizualize_point(ADS)
