# The data set is all numeric (some columns are 0/1 indicators) and the target
# is binary, so we fit a logistic regression. Step 1: remove highly correlated
# predictors (from each correlated pair, drop one and keep the other).
# NOTE(review): the original computed cor() on `Answered_data2` but then
# subset `data`; assuming both refer to the same working data frame — confirm
# against the upstream data-loading code.
COR <- cor(data)
# Drop one variable from each highly correlated pair found in COR.
data <- subset(data, select = -c(A, C, D, AA, CC))
# Repeat cor() + pruning until every remaining pairwise correlation is below
# the chosen threshold (e.g. |r| < 0.7 is acceptable here).
# Split the data into a training set (75%) and a testing set (25%).
## 75% of the sample size; floor() returns the largest integer not greater
## than its argument.
smp_size <- floor(0.75 * nrow(data))
## Fix the seed so the partition is reproducible.
set.seed(110)
# seq_len(nrow(data)) is a safe 1:nrow(data); sample() draws that many row
# indices without replacement for the training set.
# Fixed: original mixed `Data` and `data` — one consistent name is used.
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
# Training data: the sampled 75% of rows.
train1_1 <- data[train_ind, ]
# Class balance of the response in the training set.
# Fixed: original tabulated `train1`, which is only defined two lines later.
table(train1_1$RESPONSE)
# Exclude the two columns that are not used for modeling.
train1 <- subset(train1_1, select = -c(ID_C, IS_LIKE))
# Test data: the remaining 25% of rows.
test1_1 <- data[-train_ind, ]
# Class balance of the response in the test set.
# Fixed: original tabulated `test1` before it was defined.
table(test1_1$RESPONSE)
# Exclude the same two non-modeling columns.
test1 <- subset(test1_1, select = -c(ID_C, IS_LIKE))
###############################################################################
# Fit the logistic regression with glm(); "." on the right-hand side means
# every column of train1 except the response variable RESPONSE.
logit_fit <- glm(RESPONSE ~ ., data = train1, family = binomial())
# Show the fitted coefficients, standard errors, and significance tests.
summary(logit_fit)
library(car)
## Check VIF for multicollinearity. Rule of thumb used here: VIF < 3 is fine
## (some textbooks accept anything below 10).
vif_values <- vif(logit_fit)
vif_values
# Save the computed VIF values.
# Fixed: original wrote `vif` (the function object itself) to CSV, and used
# curly quotes that are a syntax error in R.
write.csv(vif_values, "vif4.csv")
# Re-fit and re-check iteratively: if any variable's VIF is >= 3, drop it,
# refit the model, and recompute VIF until all remaining variables pass.
# Score the training data with the fitted model. This is for learning purposes
# only: it lets you compare the observed response against predicted values on
# the same data the model was trained on.
# type = "response" returns predicted probabilities rather than log-odds.
# Fixed: original used curly quotes around 'response' (a syntax error).
P_fitpreds_TRAIN <- predict(logit_fit, train1, type = "response")
# Append the predicted probability as a new column to the right of the
# original (pre-subset) training rows.
P_HMD_TRAIN_PRED <- cbind(train1_1, P_fitpreds_TRAIN)
# Sanity-check the augmented data.
summary(P_HMD_TRAIN_PRED)
# Inspect the data interactively.
View(P_HMD_TRAIN_PRED)
# Compare observed vs predicted: the mean predicted probability should be very
# close to the observed event rate, count(1) / (count(1) + count(0)), because
# the same data was used to train the model.
table(P_HMD_TRAIN_PRED$RESPONSE)
mean(P_HMD_TRAIN_PRED$P_fitpreds_TRAIN)
############ Score the held-out testing data with the trained model.
# type = "response" gives the predicted probabilities.
# Fixed: original used curly quotes around 'response' (a syntax error).
P_fitpreds_TEST <- predict(logit_fit, test1, type = "response")
# Append predicted probabilities to the original (pre-subset) test rows.
P_HMD_TEST_PRED <- cbind(test1_1, P_fitpreds_TEST)
View(P_HMD_TEST_PRED)
nrow(P_HMD_TEST_PRED)
colnames(P_HMD_TEST_PRED)
# Compare the observed class counts against the mean predicted probability.
table(P_HMD_TEST_PRED$RESPONSE)
mean(P_HMD_TEST_PRED$P_fitpreds_TEST)
######################## Metrics to evaluate this model.
## Analysis of deviance table (sequential chi-square test as each variable is
## added). Fixed: original used curly quotes around "Chisq" (syntax error).
anova(logit_fit, test = "Chisq")
## Wald chi-square statistic for each variable in the model (car::Anova).
Anova(logit_fit, test.statistic = "Wald")
# Install once, then just load; commented out (matching the style of the
# ResourceSelection install below) so the script does not reinstall every run.
# install.packages("MKmisc", lib = "/folder/Rpackages")
library(MKmisc, lib.loc = "/folder/Rpackages")
### Hosmer-Lemeshow C statistic and H statistic (goodness of fit).
HLgof.test(fit = fitted(logit_fit), obs = train1$RESPONSE)
# install.packages("ResourceSelection", lib = "/folder/Rpackages")
library(ResourceSelection, lib.loc = "/folder/Rpackages")
# Get AUC (the C statistic) from the ROC curve.
library(pROC)
# Fixed: curly quotes (syntax error); c() around a single string was redundant.
predpr <- predict(logit_fit, type = "response")
# Fixed: the formula needs a `data` argument so RESPONSE can be found; the
# model was fit on train1, so fitted probabilities align with train1 rows.
roccurve <- roc(RESPONSE ~ predpr, data = train1)
roccurve
# The full data set is huge and roc() takes very long, so use only the first
# 120000 records to get an approximate C value.
# Fixed: original indexed an undefined object `p`; the predicted probabilities
# live in `predpr`. Also use a fresh name instead of clobbering train1_1.
predpr_sub <- predpr[1:120000]
train1_sub <- train1[1:120000, ]
roc(RESPONSE ~ predpr_sub, data = train1_sub)
## Confidence intervals for the coefficients using profiled log-likelihood.
confint(logit_fit)
## CIs using standard errors (Wald intervals).
# Fixed: original referenced `mylogit`, which is never defined in this script;
# the fitted model object is `logit_fit`.
confint.default(logit_fit)
## Odds ratios and their 95% confidence intervals.
exp(cbind(OR = coef(logit_fit), confint(logit_fit)))
# install.packages("caret", lib = "/folder/Rpackages")
library(caret, lib.loc = "/folder/Rpackages")
# Variable importance — very useful; similar in spirit to the importance
# measure from a random forest.
varImp(logit_fit)
## Odds ratios only, saved to file.
odds_ratio <- exp(coef(logit_fit))
write.csv(odds_ratio, "odds_ratio.csv")
summary(logit_fit)
# (Reader comment, translated from Chinese; fixed: it was a bare line with no
# "#", which is a syntax error in R): "Very helpful — thanks to the author!"