Everyday R code (8) logistic regression

# We have a data set in which every variable is numeric (some columns contain only 0 or 1), and we want to fit a logistic regression model. First we check the correlations and, for each pair of highly correlated variables, remove one and keep the other.

COR <- cor(data)

data <- subset(data, select = -c(A, C, D, AA, CC))

# We may need to recompute the correlations several times until every pairwise correlation is acceptable. For example, with a threshold of 0.7, any absolute correlation below 0.7 is fine for this case; a sketch that automates this follows.
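# Rather than rerunning cor() and dropping columns by hand, caret's findCorrelation() can automate the pruning. A minimal sketch, assuming the response column is named RESPONSE (as later in this post) and all predictors are numeric:

library(caret)

preds <- subset(data, select = -c(RESPONSE))   # predictors only, so the response is never dropped

COR <- cor(preds)

drop_idx <- findCorrelation(COR, cutoff = 0.7)   # columns whose removal brings all pairwise |r| below 0.7

if (length(drop_idx) > 0) preds <- preds[, -drop_idx]

COR2 <- cor(preds)

max(abs(COR2[upper.tri(COR2)]))   # sanity check: should now be below 0.7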

# Next we need a training data set and a testing data set.

## 75% of the sample size

smp_size <- floor(0.75 * nrow(data))   # floor() returns the largest integer not greater than the given number

## set the seed to make your partition reproducible

set.seed(110)

train_ind <- sample(seq_len(nrow(data)), size = smp_size)   ## seq_len(nrow(data)) is the same as 1:nrow(data), but safe when a data frame has zero rows

# Training data with 75% of the rows

train1_1 <- data[train_ind, ]

table(train1_1$RESPONSE)

# Exclude two ID/flag columns that are not used for modeling

train1 <- subset(train1_1, select = -c(ID_C, IS_LIKE))

# Test data with the remaining 25% of the rows

test1_1 <- data[-train_ind, ]

table(test1_1$RESPONSE)

# Exclude the same two columns here as well

test1 <- subset(test1_1, select = -c(ID_C, IS_LIKE))
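# One caveat on the split above: if RESPONSE is imbalanced, a plain random sample can give training and testing sets with noticeably different event rates. caret's createDataPartition() does a stratified split instead; a sketch, converting RESPONSE to a factor so stratification is by class:

library(caret)

set.seed(110)

train_ind_strat <- createDataPartition(factor(data$RESPONSE), p = 0.75, list = FALSE)

train_strat <- data[train_ind_strat, ]

test_strat <- data[-train_ind_strat, ]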

###############################################################################

# Modeling with glm(); the '.' in the formula stands for all variables in the data except RESPONSE, which is the response (y) variable

logit_fit <- glm(RESPONSE ~ ., data = train1, family = binomial())

#show the model parameters

summary(logit_fit)

library(car)

## Check VIF for multicollinearity. One rule of thumb I picked up from a colleague is that a VIF below 3 means no multicollinearity issue; some textbooks accept VIF < 10.

vif_values <- vif(logit_fit)

vif_values

write.csv(vif_values, "vif4.csv")   # store the result first; vif itself is a function, not the values

# We may have to refit the model and recheck VIF several times until every variable's VIF is below 3 (if a variable's VIF exceeds 3, drop it, refit, and check again); a sketch of that loop follows.
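# A minimal sketch of that pruning loop, assuming all predictors are numeric (with factor predictors, car::vif() returns GVIFs instead of a plain vector) and that at least two predictors always remain:

vars <- setdiff(colnames(train1), "RESPONSE")

repeat {
  fit <- glm(reformulate(vars, response = "RESPONSE"), data = train1, family = binomial())
  v <- vif(fit)
  if (max(v) < 3) break                        # 3 is the stricter threshold used here
  vars <- setdiff(vars, names(which.max(v)))   # drop the single worst offender and refit
}

logit_fit <- fit   # final model with every VIF below 3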

# Then the model is applied back to the training data so you can compare the actual responses with the predicted values (this is for learning purposes only).

# Predict using the trained model logit_fit

P_fitpreds_TRAIN <- predict(logit_fit, train1, type = "response")

# Append the predicted values as a new column on the right of the original data set

P_HMD_TRAIN_PRED <- cbind(train1_1, P_fitpreds_TRAIN)

# Summary statistics of the combined data frame (note: this summarizes the data, not the model)

summary(P_HMD_TRAIN_PRED)

#view the data

View(P_HMD_TRAIN_PRED)

# Compare the actual responses with the predictions; they line up closely here because the model was trained on this same data

table(P_HMD_TRAIN_PRED$RESPONSE)

mean(P_HMD_TRAIN_PRED$P_fitpreds_TRAIN)   # mean predicted probability on the training data
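# For that comparison, the observed event rate is count of 1 / (count of 1 + count of 0), which is simply the mean of the 0/1 response. For a logistic model with an intercept, maximum likelihood forces the mean fitted probability on the training data to match it exactly:

mean(P_HMD_TRAIN_PRED$RESPONSE)   # observed proportion of 1s in the training data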

############ Now we use the testing data set to evaluate the model

# type = "response" gives predicted probabilities

P_fitpreds_TEST <- predict(logit_fit, test1, type = "response")

P_HMD_TEST_PRED <- cbind(test1_1, P_fitpreds_TEST)

View(P_HMD_TEST_PRED)

nrow(P_HMD_TEST_PRED)

colnames(P_HMD_TEST_PRED)

# Compare the observed event rate with the mean predicted probability on the test data

table(P_HMD_TEST_PRED$RESPONSE)

mean(P_HMD_TEST_PRED$P_fitpreds_TEST)
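# Comparing the two means only checks calibration in aggregate. A confusion matrix at a chosen probability cutoff shows classification performance; this sketch uses 0.5, an arbitrary choice (with imbalanced data a lower cutoff is often better):

pred_class <- ifelse(P_HMD_TEST_PRED$P_fitpreds_TEST > 0.5, 1, 0)

table(actual = P_HMD_TEST_PRED$RESPONSE, predicted = pred_class)

mean(pred_class == P_HMD_TEST_PRED$RESPONSE)   # overall accuracy at this cutoff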

######################## Metrics to evaluate the model

## Analysis of deviance table: sequential chi-square tests as each term is added

anova(logit_fit, test = "Chisq")

## car::Anova() gives a Wald chi-square statistic for each variable in the model

Anova(logit_fit, test.statistic = "Wald")

install.packages("MKmisc", lib = "/folder/Rpackages")

library(MKmisc, lib.loc = "/folder/Rpackages")

### Hosmer-Lemeshow C statistic, Hosmer-Lemeshow H statistic

HLgof.test(fit = fitted(logit_fit), obs = train1$RESPONSE)

#install.packages("ResourceSelection", lib = "/folder/Rpackages")

library(ResourceSelection, lib.loc = "/folder/Rpackages")
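# ResourceSelection provides hoslem.test(), an alternative Hosmer-Lemeshow implementation; a minimal call on the training fit (g is the number of groups, 10 by default):

hoslem.test(train1$RESPONSE, fitted(logit_fit), g = 10)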

 

# Get the AUC (the C statistic) from the ROC curve

library(pROC)

predpr <- predict(logit_fit, type = "response")

roccurve <- roc(train1$RESPONSE, predpr)   # response first, then the predicted probabilities

roccurve
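# To draw the curve and pull out the AUC on its own:

plot(roccurve)   # ROC curve; the diagonal is the no-skill reference

auc(roccurve)    # area under the curve, i.e. the C statistic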

# The data set is huge, so computing the AUC on everything takes a very long time. I used the first 120,000 records instead, just to get a sense of the C value.

predpr_1 <- predpr[1:120000]

train1_sub <- train1[1:120000, ]   # use a new name so train1_1 is not overwritten

roc(train1_sub$RESPONSE, predpr_1)

## CIs using profiled log-likelihood

confint(logit_fit)

## CIs using standard errors

confint.default(logit_fit)

## odds ratios and 95% CI

exp(cbind(OR = coef(logit_fit), confint(logit_fit)))

#install.packages("caret", lib = "/folder/Rpackages")

library(caret, lib.loc = "/folder/Rpackages")

# Variable importance; this is very useful, similar in spirit to the importance scores from a random forest (see the sorted listing below)

varImp(logit_fit)
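# varImp() on a glm returns a data frame with an Overall column (the absolute value of each coefficient's z statistic), so it can be sorted to rank the variables:

vi <- varImp(logit_fit)

vi[order(-vi$Overall), , drop = FALSE]   # most important variables first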

## odds ratios only

odds_ratio <- exp(coef(logit_fit))

write.csv(odds_ratio, "odds_ratio.csv")

summary(logit_fit)
