Everyday R code (6)

# R makes everyday data-analysis work simple. Writing complicated loops and algorithms is another story. Here is some code that uses R in a simple way.

# RStudio runs on a shared server here, so packages are installed into, and
# loaded from, a shared library directory rather than the default library.
# The path is kept in one place so the install and load calls cannot drift.
pkg_lib <- "/folder1/shared/DS/Rpackages"

install.packages("Rcpp", lib = pkg_lib)

# An installed package still has to be attached with library() before its
# functions can be used; lib.loc points library() at the shared directory.
# NOTE(review): the install above is for Rcpp while the package attached below
# is aod -- confirm which packages this script actually needs.
# install.packages("aod", lib = pkg_lib)
library(aod, lib.loc = pkg_lib)

# install.packages("ggplot2", lib = pkg_lib)
# library(ggplot2, lib.loc = pkg_lib)

# Restore the objects saved in an earlier session. load() recreates them under
# the names they were saved with; the rest of this script expects the file to
# provide an object called P.
load("P.Rdata")

# Quick look at P: column names, number of rows, and number of distinct ID_C
# values.
colnames(P)
nrow(P)
length(unique(P$ID_C))

# Storage mode of every column (e.g. "numeric", "character").
vapply(P, mode, character(1))

# Build the working data frame `data` from P with the SIC1 and SIC2 columns left out.

data<-subset(P,select=-c(SIC1,SIC2))

# Turn SKILL_ID, SOURCE and why into dummy (indicator) columns and append them to the right of the existing columns. The intent is one 0/1 column per value, e.g. per skill: 1 means the row has that skill, 0 means it does not.
# NOTE(review): dummy() is not defined or attached anywhere in the visible script; presumably it comes from the 'dummies' package -- confirm it is loaded before this line runs. The names of the generated columns may depend on how this call is written, so check the colnames() output below before restructuring it.

data<-cbind(data, dummy(data$SKILL_ID),dummy(data$SOURCE),dummy(data$why))

# Print the column names to see what the new dummy columns were called.

colnames(data)

# Drop the original SKILL_ID, SOURCE and why columns now that they have been expanded by the cbind() call above.

data<-subset(data,select=-c(SKILL_ID,SOURCE,why))

# Recode column E in place: "Y" becomes 0 and "N" becomes 1.
# NOTE(review): if E is stored as character, these assignments store the
# strings "0" and "1" rather than numbers -- confirm that is what is wanted.
data$E[data$E == "Y"] <- 0
data$E[data$E == "N"] <- 1

# Count how many rows now hold each value of E.
table(data$E)

# Indicator for rows whose SIC value is the literal text "cant find":
# 1 when it matches, 0 when it does not (NA where SIC itself is NA).
data$SIC_cantfind <- ifelse(data$SIC == "cant find", 1, 0)

# Re-check the storage mode of every column after the changes so far.
vapply(data, mode, character(1))

# The mode check showed that IS (which holds 1 and 0) is stored as character
# rather than numeric, so convert it.
data$IS <- as.numeric(data$IS)

nrow(data)

# Write the storage mode of every column to a CSV file; this is easier to
# review than console output when there are hundreds of columns.
a <- vapply(data, mode, character(1))
write.csv(a, "mode2.csv")

# Save the prepared data frame to data.Rdata so it can be load()-ed later.
save(data, file = "data.Rdata")

Leave a Comment