The very short introduction to R

Some history

• Developed as a reduced version of S (later S-PLUS, Spotfire)
• Released as open source in 1995
• At that time the only statistical package available for Linux

Installation

• Think of it as SAS versus SAS EG

R as an overgrown calculator

# Plain arithmetic is evaluated and printed straight away
2 + 2
## [1] 4
# Built-in mathematical functions are called by name
exp(-2)
## [1] 0.1353353
# rnorm() returns random numbers, so the values differ on every run
rnorm(15)
##  [1] -1.31610441 -0.68287241 -0.13930987 -1.37804907 -0.14774243
##  [6] -0.86050033 -1.04545518 -0.40769396 -0.34812848  1.57435674
## [11]  1.41489930  0.09422239 -0.52949356  0.96144952 -0.48719970
# What happened?
# Open the help page to find out: write ?rnorm

Assignments

# The arrow operator assigns the value on the right to the name on the left
x <- 2
# The equals sign does the same thing here
x = 2
# Typing the name prints the stored value
x
## [1] 2
x + x
## [1] 4

Calculations on vectors

# Arithmetic works element by element on whole vectors
weight <- c(60, 72, 57, 90, 95, 72)
height <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91)
bmi <- weight / height^2

# What is the mean weight?
sum(weight)
## [1] 446
sum(weight) / length(weight)
## [1] 74.33333
# More decimals than we need - keep one
round(sum(weight) / length(weight), digits = 1)
## [1] 74.3
# What about the standard deviation?
# Start from the deviations of each weight from the mean
xbar <- sum(weight) / length(weight)
weight - xbar
## [1] -14.333333  -2.333333 -17.333333  15.666667  20.666667  -2.333333
# Use up-arrow to get previous command
(weight - xbar)^2
## [1] 205.444444   5.444444 300.444444 245.444444 427.111111   5.444444
sum((weight - xbar)^2)
## [1] 1189.333
# Divide by n - 1 and take the square root
sqrt(sum((weight - xbar)^2) / (length(weight) - 1))
## [1] 15.42293
# But of course there are built-in functions for mean and SD
mean(weight)
## [1] 74.33333
sd(weight)
## [1] 15.42293
var(weight)
## [1] 237.8667

More on vectors

# Vectors can hold text as well as numbers
some_names <- c("Tine", "Thomas", "Grete")

# Same as (single and double quotes are interchangeable)
some_names <- c('Tine', 'Thomas', 'Grete')
is.vector(some_names)
## [1] TRUE
is.character(some_names)
## [1] TRUE
is.character(bmi)
## [1] FALSE
is.numeric(bmi)
## [1] TRUE
# Logical
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned
c(TRUE, TRUE, FALSE, TRUE)
## [1]  TRUE  TRUE FALSE  TRUE
# Now it becomes really strange
# A comparison on a vector gives one TRUE/FALSE per element
bmi > 25
## [1] FALSE FALSE FALSE FALSE  TRUE FALSE
# Create your own vectors with rep() and seq()
seq(4, 9)
## [1] 4 5 6 7 8 9
# The colon operator is a shorthand for the same sequence
4:9
## [1] 4 5 6 7 8 9
# But we could also do
seq(4, 10, by = 2)
## [1]  4  6  8 10
# What happened?
# Write ?seq

# Some more examples - now using rep()
oops <- c(7, 9, 13)
# Repeat the whole vector three times
rep(oops, times = 3)
## [1]  7  9 13  7  9 13  7  9 13
# Repeat each element its own number of times
rep(oops, times = 1:3)
## [1]  7  9  9 13 13 13
rep(1:2, times = c(10, 15))
##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

Data frames - now SAS users know what is going on

# First create two vectors
intake_pre <- c(
  5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770
)
intake_post <- c(
  3910, 4220, 3885, 5160, 5645, 4680, 5265, 5975, 6790, 6900, 7335
)

# Combine them into a data frame; the columns take the vector names
d <- data.frame(intake_pre, intake_post)
names(d)
## [1] "intake_pre"  "intake_post"
dim(d)
## [1] 11  2
# Look at one variable
d$intake_pre
##  [1] 5260 5470 5640 6180 6390 6515 6805 7515 7515 8230 8770
# Note that R is case sensitive
d$Intake_pre
## NULL
# Add a subject ID
# seq_len(nrow(d)) numbers the rows and is also safe for an empty data frame
d$subjid <- seq_len(nrow(d))
names(d)
## [1] "intake_pre"  "intake_post" "subjid"
dim(d)
## [1] 11  3
# View it
#View(d)
# Or simply
d
##    intake_pre intake_post subjid
## 1        5260        3910      1
## 2        5470        4220      2
## 3        5640        3885      3
## 4        6180        5160      4
## 5        6390        5645      5
## 6        6515        4680      6
## 7        6805        5265      7
## 8        7515        5975      8
## 9        7515        6790      9
## 10       8230        6900     10
## 11       8770        7335     11
# This does not work
#d$subjid<-as.character(1:20)

### indexing, subsetting
d$intake_pre[1]
## [1] 5260
# Same as
d[1, 1]
## [1] 5260
# Pick rows by position; the empty slot after the comma keeps all columns
d[c(1, 3, 7), ]
##   intake_pre intake_post subjid
## 1       5260        3910      1
## 3       5640        3885      3
## 7       6805        5265      7
# Negative positions drop those rows instead
d[-c(1, 3, 7), ]
##    intake_pre intake_post subjid
## 2        5470        4220      2
## 4        6180        5160      4
## 5        6390        5645      5
## 6        6515        4680      6
## 8        7515        5975      8
## 9        7515        6790      9
## 10       8230        6900     10
## 11       8770        7335     11
# The row number is kept
# Here is another data frame
d1 <- data.frame(subjid = 1:8, age_group = rep(c("old", "young"), 4))
# Merging
d2 <- merge(d, d1, by = "subjid")
d2
##   subjid intake_pre intake_post age_group
## 1      1       5260        3910       old
## 2      2       5470        4220     young
## 3      3       5640        3885       old
## 4      4       6180        5160     young
## 5      5       6390        5645       old
## 6      6       6515        4680     young
## 7      7       6805        5265       old
## 8      8       7515        5975     young
# Inner join
# (only subjid 1 to 8 occur in both data frames, so eight rows remain)
# Write ?merge
# merge() can only handle two data frames at a time
# Subsetting using the subset() function
young <- subset(d2, age_group == "young") # note ==
low_pre <- subset(d2, intake_pre < 6000)
low_pre_young_1 <- subset(d2, intake_pre < 6000 & age_group == "young")
low_pre_young_2 <- subset(d2, intake_pre < 6000 | age_group == "young")
# Looping and some if-then stuff
# Start with the flag set to 0 everywhere, then set it to 1 row by row
d$low_pre_fl <- 0
for (i in seq_len(nrow(d))) {
  if (d$intake_pre[i] < 6000) {
    d$low_pre_fl[i] <- 1
  }
}
d
##    intake_pre intake_post subjid low_pre_fl
## 1        5260        3910      1          1
## 2        5470        4220      2          1
## 3        5640        3885      3          1
## 4        6180        5160      4          0
## 5        6390        5645      5          0
## 6        6515        4680      6          0
## 7        6805        5265      7          0
## 8        7515        5975      8          0
## 9        7515        6790      9          0
## 10       8230        6900     10          0
## 11       8770        7335     11          0
# Sort (or order) examples
d <- d[order(d$intake_post), ]
# A minus sign sorts that key in descending order
d <- d[order(-d$subjid, d$intake_post), ]
# Stacking two data frames
d_again <- d
d_double <- rbind(d, d_again)
# But the next command R will not accept - goes fine in SAS
# (after the subset() call d_again has fewer columns than d)
d_again <- subset(d_again, select = c("subjid", "intake_pre", "intake_post"))
d_double <- rbind(d, d_again)

Descriptive statistics

summary(d)
##    intake_pre    intake_post       subjid       low_pre_fl
##  Min.   :5260   Min.   :3885   Min.   : 1.0   Min.   :0.0000
##  1st Qu.:5910   1st Qu.:4450   1st Qu.: 3.5   1st Qu.:0.0000
##  Median :6515   Median :5265   Median : 6.0   Median :0.0000
##  Mean   :6754   Mean   :5433   Mean   : 6.0   Mean   :0.2727
##  3rd Qu.:7515   3rd Qu.:6382   3rd Qu.: 8.5   3rd Qu.:0.5000
##  Max.   :8770   Max.   :7335   Max.   :11.0   Max.   :1.0000

Import data and R packages

# Import a .csv file
csv_file <- read.csv("path_to_file/filename.csv")
names(csv_file)
dim(csv_file)
head(csv_file)
tail(csv_file)
# Import a SAS dataset
library(haven)
sas_file <- read_sas("path_to_file/filename.sas7bdat")
sas_file <- as.data.frame(sas_file)
# Import an excel file
library(readxl)
excel_file <- read_xlsx("path_to_file/filename.xlsx")
excel_file <- as.data.frame(excel_file)

Handling dates and a bit of string manipulation

sas_file_mitt <- subset(sas_file, MITTFL == "Y")
summary(sas_file_mitt$TRTSDT)
summary(sas_file_mitt$TRTEDT)
# Subtracting the two columns element by element
# (presumably date columns, giving a duration per row - TODO confirm)
sas_file_mitt$TRTEDT - sas_file_mitt$TRTSDT
summary(excel_file)
# Keep the first nine characters of LBDTC and parse them as day, month name
# and four-digit year (assumes values like 01JAN2020 - TODO confirm)
excel_file$lbdtn <- as.Date(substr(excel_file$LBDTC, 1, 9), "%d%B%Y")
excel_file$lbdtn - as.Date(excel_file$LB_DOB)