# R makes everyday data-analysis work simple; writing complicated loops and algorithms is another story. The code below uses R in that simple way.
# RStudio runs on the server, so packages are installed into, and attached
# from, a shared library directory instead of the default user library.
lib_dir <- "/folder1/shared/DS/Rpackages"

# Installing is only needed when a package is new; the commented-out lines
# are kept as templates for the other packages used with this script.
install.packages("Rcpp", lib = lib_dir)
# install.packages("aod", lib = lib_dir)
# install.packages("ggplot2", lib = lib_dir)

# After installing, a package must still be attached before it can be used.
# lib.loc has to be the same directory that was passed to install.packages().
library(aod, lib.loc = lib_dir)
# library(ggplot2, lib.loc = lib_dir)
# Restore the objects saved in the previous session. The rest of the script
# expects P.Rdata to contain a data frame named P.
load("P.Rdata")

# Quick overview of P: column names, number of rows, and number of distinct
# ID_C values.
colnames(P)
nrow(P)
length(unique(P$ID_C))

# Report the storage mode (numeric, character, ...) of every column.
# vapply() guarantees a character vector regardless of how many columns exist.
vapply(P, mode, character(1))
# Drop the columns that are not needed.
data <- subset(P, select = -c(SIC1, SIC2))

# Turn categorical columns into dummy (indicator) columns: one 0/1 column per
# category, where 1 means the row has that value and 0 means it does not.
# The new columns are appended to the right of the data.
# NOTE(review): dummy() is neither defined nor attached anywhere in this
# script; presumably it comes from the 'dummies' package -- confirm and
# attach that package before running this step.
data <- cbind(
  data,
  dummy(data$SKILL_ID),
  dummy(data$SOURCE),
  dummy(data$why)
)
colnames(data)

# The original categorical columns are redundant once they have been dummied.
data <- subset(data, select = -c(SKILL_ID, SOURCE, why))

# Recode E: "Y" becomes 0 and "N" becomes 1.
# NOTE(review): if E is stored as character, the replacements are coerced to
# the strings "0" and "1"; convert with as.numeric() if numbers are required.
data$E[data$E == "Y"] <- 0
data$E[data$E == "N"] <- 1

# Count how many 0s and how many 1s E now contains.
table(data$E)

# Indicator column for SIC: 1 when SIC is exactly "cant find", 0 otherwise.
data$SIC_cantfind <- ifelse(data$SIC == "cant find", 1, 0)

# Re-check the storage mode of every column after the transformations.
vapply(data, mode, character(1))

# The mode check shows that IS (which holds 0/1) is character rather than
# numeric, so convert it.
data$IS <- as.numeric(data$IS)
# Row count after all transformations.
nrow(data)

# Write the storage mode of every column to a CSV file; this is easier to
# review than console output when there are hundreds of columns.
a <- vapply(data, mode, character(1))
write.csv(a, "mode2.csv")

# Save the prepared data frame to data.Rdata so it can be restored with load().
save(data, file = "data.Rdata")