#### 2016-09-26 ####
### some general information:
# software for statistical computing and graphics
# R is based on S
# GPL (general public license) -> free software
# first version R-1.0.0 released in 2000
# current version R-3.3.1
# available for several OS (including 64bit)
# functionality can be extended by packages
# packages are bundles of functions, data, and documentation
# Comprehensive R archive network (CRAN) is the primary source for packages
# April 2013: 4437, October 2014: 5966, September 2015: 7084, September 2016: 9237 packages on CRAN
# wide dissemination in academia and companies
# advantages: extensibility (by packages), flexibility (object-oriented language), transparency (source code publicly available)
# web page: https://www.r-project.org
5 + 5 # summation
5 * 5 # multiplication
5^0.5 # square root
5^.5 # same
# note the difference:
5^(.5/2)
5^.5/2
### functions:
sqrt(5) # square root
help(sqrt) # help for sqrt-function
sqrt(x = 5) # naming of arguments
?sqrt # help again
??regression # search within the installed packages for the word "regression"
sqrt # have a look at a function's code, not much to see here
lm
### objects
test <- 100 # assignment
102 -> test2 # not recommended
test3 = 104 # also not recommended
# note the difference:
mean(x = 5)
x # does not exist
mean(x <- 5)
x
# the 'right' way:
dat <- 5 # data are objects
dat
res <- sqrt(x = dat) # results are objects
res
function2 <- sqrt # functions are objects too
function2(x = 5)
### workspace
# the workspace holds all objects
ls() # list all objects in workspace
rm("dat") # remove object with name "dat"
### save script
### save workspace
#### 2016-09-27 ####
### package handling
# R commander
install.packages("Rcmdr") # install package "Rcmdr"
library(Rcmdr) # load package
### R Studio
# https://www.rstudio.com/products/rstudio/download/
### data types
mode() # query the data type of an object, argument: an object
x <- 5
mode(x)
# important data types:
# 1. NULL = empty set
# 2. numeric = numerical values
typeof(x) # gives either integer or double (for real real numbers)
# 3. character = character strings
x <- "5"
mode(x)
mean(x) # does not work
# 4. logical = logical values
# TRUE or FALSE
mode(TRUE)
x <- TRUE
mode(x)
# short notation: T or F
# logical values are the result of logical evaluations, e.g.
# == equality
5 == 3
# != inequality
# >, >= greater, greater or equal
# <, <= smaller, smaller or equal
# ! negation (not)
!FALSE
# numerical interpretation: TRUE = 1 and FALSE = 0
# attention:
-5<-3 # does not work
-5<(-3)
-5 < -3
is.numeric(x) # query for a specific data type
is.character(x)
is.logical(x)
as.numeric(x) # convert to a specific data type
as.character(x)
as.logical(x)
x <- "5"
mean(x) # does not work
mean(as.numeric(x))
# data types can be converted using the as.*-functions
# there are are restrictions on what can be converted into one another
# the data type of an object influences what functions can be applied to an object
### data structures
# vectors
# matrices
# arrays
# lists
# data frames
## vectors
x <- c(5, 6, 7) # generate a vector via c() "concatenate"
# all elements in a vector must be of the same data type
# indexing is done via []
x[2] # second element of x
x[c(2, 3, 1)] # second, third, and first element of x
x[c(F, T, F)] # indexing using logical values
y <- x[c(F, T, F, T)]
y
# NA = missing values (not available)
mean(y) # does not work
mean(y, na.rm=TRUE) # deal with NAs within a function
mean(na.omit(y)) # deal with NAs outside of an function (the more general solution)
## special vectors
# sequences
1:50 # sequence from 1 to 50 by 1
-20:100 # from -20 to 100 by 1
seq(from=3, to=12.5, by=.25) # specify the width between two neighbouring elements
seq(from=3, to=12.5, length.out=10) # specify the length of the resulting vector
# repetitions
rep(c(5,4), times=6) # repeat vector c(5,4) six times
#### 2016-09-28 ####
# script at http://www.domenicovistocco.it
# Teaching --> Statistics for Economics and Business
## vector calculations
# functions are applied to each element of a vector
x <- 5:10
sqrt(x)
x - 20
x <- 2:4
x
x * c(1,2,3) # 2*1, 3*2, 4*3
x %*% c(1,2,3) # dot product: 2*1 + 3*2 + 4*3
x
x - c(1, 2) # 2-1, 3-2, 4-1
# shorter vectors are "recycled"
# your task: generate a vector of 100 observations from a normal distribution with mu = 180 and sd = 10 (that may e.g. be the height of a person). Next, count how many of these 100 observations are in the interval 180 +/- 10.
height <- rnorm(100, mean = 180, sd = 10)
# a first solution
res1 <- height < 190
res2 <- height > 170
# note that:
TRUE & FALSE # FALSE
FALSE & TRUE # FALSE
FALSE & FALSE # FALSE
TRUE & TRUE # TRUE
res1 & res2
sum(res1 & res2)
# a second solution
height2 <- height[height < 190]
sum(height2 > 170)
## matrices
x <- matrix(1:6, ncol = 2) # matrix with two columns
matrix(1:6, ncol = 2, byrow = TRUE)
matrix(1:6, nrow = 2)
# indexing of matrices is done via [row number(s), column number(s)]
x[2, 2] # second row, second column
x[1, ] # first row
x[c(1, 3), ] # first and third row
# functions often required when dealing with matrices:
t(x) # transpose of x
diag(x) # diagonal of x
x <- matrix(1:4, ncol = 2)
solve(x) # matrix inverse
x %*% solve(x) # matrix multiplication; here: identity matrix
ncol(x) # number of columns
nrow(x) # number of rows
dim(x) # number of rows and columns
x <- 1:4
y <- 5:8
cbind(x, y) # column-wise bind vectors/matrices into a new matrix
rbind(x, y) # bind row-wise
z <- c("a", "b", "c", "d")
cbind(x, y, z) # all elements in a matrix are of the same data type
# your task: generate a matrix. Use the height of the persons generated previously and add the "weight" (i.e. random numbers from a normal distribution with mu = 75 and sd = 5). Next, sort the whole matrix by the weight of the persons in descending order.
persons <- cbind(height, rnorm(100, mean = 75, sd = 5))
x <- c(2.3, 9.7, 0.6)
sort(x)
order(x)
x[order(x)]
persons[order(persons[,2], decreasing = TRUE),]
persons[order(persons[,2]),][100:1,]
#### 2016-09-29 ####
## arrays
# similar to matrices but with dimension > 2
x <- array(1:12, dim=c(2, 3, 2))
# indexing is identical to matrices
x[2,2,1]
# all the elements must be of the same data type
## lists
# every element in a list can have any data structure (and can therefore be of any data type)
x <- 1:4
y <- c("a", "b", "c")
z <- list(x, y)
# indexing in lists works with listname[[number of element]]
z[[2]]
# lists are the standard data structure for output of functions in R
## data frames
# special case of a list
# elements (columns) are of same length
# elements (columns) can have different data type
# data frames are the standard data structure for data sets in R
y <- c("a", "b", "c", "d")
z <- data.frame(x,y)
cbind(x,y) # for comparison: a matrix
# indexing is the same as with matrices
z[,2] # second column
## data import
# 1. type data in to console (not recommended)
# 2. use data already available in R
# 3. read/import data into R from files
# option 2:
data(package=.packages(all.available=TRUE)) # list of all data sets in R (within installed packages)
library(MASS) # load package
data(Animals) # load data set into workspace
Animals # access the data set
# option 3:
# data as a file, e.g. txt, csv, xls, dta, etc
# for xls, dta additional packages may be required
# recommendation: prefer simple file formats, e.g. txt, csv
# recommendation: prepare data before importing into R (coding of NAs, deletion of unnecessary variables, sorting, etc)
read.table() # read data from file and create a data frame
# may also be an option: using R Commander
### graphics
persons <- cbind(rnorm(100, 180, 10), rnorm(100, 75, 5))
plot(persons) # standard plot function; here: scatterplot
plot(persons[,2]) # index plot
abline(h = 75) # add a horizontal line at 75
dice <- sample(1:6, 1000, replace=TRUE) # random sample
barplot(dice) # not useful
barplot(table(dice)) # barplot
pie(table(dice)) # pie plot
height <- rnorm(100, 180, 10)
hist(height, freq=FALSE) # histogram
curve(dnorm(x, mean=180, sd=10), from=150, to=210, n=1000, add=TRUE) # add curve of the density of the normal distribution
# more graphics functions in:
# package "lattice"
# package "scatterplot3d"
# package "rgl" (e.g. function plot3d)
## save graphics
# raster graphics: jpg, bmp
# vector graphics: ps, pdf
#### 2016-09-30 ####
## your task: recreate the graphic in graphic1.pdf.
# proceed in the following way:
# 1. find and import the data
# 2. plot the data (start simple and improve)
# 3. estimate a simple linear regression model (one for the full data) and one where the three outliers have been removed. Add the corresponding regression lines to the plot
# these functions may be helpful: par, plot, axis, points, abline, text, legend
library(MASS) # load package
data(Animals) # load data set
plot(log(Animals), main="brain and body weights for 28 species\n (comparison of simple linear regression with and without outliers)", xlab="x: body weight (logarithmized)", ylab="y: brain weight (logarithmized)", pch=16, xaxt="n", yaxt="n", ylim=c(-2,10), bty="n") # plot the data set
axis(1, seq(from=-2, to=10, by=2)) # add x-axis to the plot
axis(2, seq(from=-2, to=10, by=2)) # add y-axis to the plot
points(log(Animals[c(6, 16, 26), ]), pch=16, col="red") # color three outliers in red
res.all <- lm(log(Animals[,2]) ~ log(Animals[,1])) # estimate a simple linear regression model (using all the data)
names(res.all) # show names of elements in output of the lm() function
abline(res.all$coefficients[1], res.all$coefficients[2], lwd=2) # add regression line to the plot
Animals2 <- Animals[-c(6, 16, 26), ] # take subsample of the data (without the outliers)
res.red <- lm(log(Animals2[,2]) ~ log(Animals2[,1])) # estimate a simple linear regression model (using all the data minus the outliers)
abline(res.red$coefficients[1], res.red$coefficients[2], lwd=2, col="red") # add regression line to the plot
## you may try to add the text and the legend yourself. For that use: text() and legend()
## your 2nd task: recreate the graphic in graphic2.pdf.
# proceed in the following way:
# 1. find and import the data
# 2. install and load the scatterplot3d package
# 3. plot the data (start simple and improve)
# these functions may be helpful: install.packages, library, data, scatterplot3d, plane3d, points3d
library(datasets) # load package
data(VADeaths) # load data set
install.packages("scatterplot3d") # install package "scatterplot3d"
library(scatterplot3d) # load package
# restructure the data:
# x y z
# 50-54 Rural m 11.7
# ...
dat <- cbind(expand.grid(rownames(VADeaths), colnames(VADeaths)), c(VADeaths))
plot.temp <- scatterplot3d(dat, xlab="age group", ylab="population group", zlab="deaths per 1000 population per year", main="Plot of Death Rates in Virginia (1940)\n (using scatterplot3d)", box=FALSE, angle=75, x.ticklabs=c("",rownames(VADeaths),""), y.ticklabs=c("",colnames(VADeaths),""), type="h", ylim=c(0,5), xlim=c(0,6), y.margin.add=1)
plot.temp$plane3d(0, 13.5, 0) # add plane to the plot
## you may try yourself: fit a regression model in order to get the "real numbers" for the plane
## bonus: interactive 3D plot
install.packages("rgl") # download and install the package
library(rgl) # load the package
plot3d(dat) # plot the data
#### 2016-10-03 ####
# create an own function: functionname <- function(arguments){programcode}
test <- function(x){
tmp.mean <- mean(x)
tmp.median <- median(x)
return(list(mean=tmp.mean, median=tmp.median))
}
test(x=1:20)
fix(test) # editing a function's code
## loops
# three different variants of loops
# variant 1: repeat{programcode}
i <- 0
repeat{
i <- i + 1
print(i)
if(i==100){break}
}
# program code is repeated indefinitely
# "break" can be used to break out of the loop
# variant 2: while(condition){programcode}
# program code is carried out as long as the condition is met
i <- 1
while(i<=100){
print(i)
i <- i + 1
}
# variant 3: for(i in M){programcode}
for(x in 1:100){
print(x)
}
for(x in seq(from=2, to=10, by=2)){
print(x)
}
for(x in c("b", "a", "c")){
print(x)
}
# advantage: M can be an arbitrary sequence/vector
## vectorization: apply a function to all columns or rows of a matrix
x <- matrix(1:10, ncol=2)
apply(x, 1, sum)
## running time comparison:
test <- matrix(1:10^6, ncol=2)
row.sum <- numeric(500000)
system.time(
for(i in 1:500000){
row.sum[i] <- sum(test[i,])
}
) # using a loop
system.time(
row.sum <- apply(test, 1, sum)
) # using apply
system.time(
row.sum <- rowSums(test)
) # using a tailor-made function
## programming exercise: three persons are successively flipping a coin. If the coin shows head, the game ends and the corresponding player wins the game. Otherwise (if it shows tail), it is the next player's turn. What is the probability that the second player wins?
# Implement the problem in R (simulation). Proceed the following way:
# 1. simulate a single coin flip (using the "sample" function)
# 2. repeat that coin flip multiple times (break if it shows a head), using some loop
# 3. repeat the overall game multiple times (say 1000 games), using some loop (this part might be considered a simulation)
res.sim <- list()
for(j in 1:1000){
i <- 0
res.game <- c()
repeat{
flip <- sample(c("H", "T"), 1)
i <- i + 1
res.game[i] <- flip
if(flip=="H"){break}
}
res.sim[[j]] <- res.game
}
lengths <- sapply(res.sim, length) # lengths if the individual games
p2won <- seq(from=2, to=max(lengths), by=3) # game lengths where player 2 can be considered the winner
sum(lengths %in% p2won) / length(lengths) # number of times that player 2 won divided by the total number of games played
2/7 # theoretical probability of player 2 to win the game