Everyday R code (14) sentiment analysis

###########sentiment analysis################

##use RTextTools package ## this one works well#####

# You need 3 CSV files (positive.csv, negative.csv and test.csv), all sharing the following format

#positive

comment flag
like it 0
good job 0
great! 0

#negative

comment flag
Disappointed that there are … 1
You make a you 1
Pretty difficult 1

########################################

# Load the labelled comments. Column 1 holds the comment text and column 2 the
# class flag (0 = positive, 1 = negative in the sample data shown above).
pos_tweets <- read.csv("positive.csv", header = TRUE, stringsAsFactors = FALSE)

neg_tweets <- read.csv("negative.csv", header = TRUE, stringsAsFactors = FALSE)

test_tweets <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)

# Row counts of each input, printed as a sanity check.
nrow(pos_tweets)

nrow(neg_tweets)

nrow(test_tweets)

# Stack positive, negative and test rows, in that order, into one data frame.
# The train/test split further down is done by row index, so order matters.
tweets <- rbind(pos_tweets, neg_tweets, test_tweets)

nrow(tweets)

# build dtm

# Install RTextTools into the private library only when it is not there yet,
# instead of reinstalling it on every run of the script.
if (!requireNamespace("RTextTools", lib.loc = "/local/XXXX/Rpackages",
                      quietly = TRUE)) {
  install.packages("RTextTools", lib = "/local/XXXX/Rpackages")
}

library(RTextTools, lib.loc = "/local/XXXX/Rpackages")

# Document-term matrix over the text column of all rows (train + test).
# NOTE: the name `matrix` shadows base::matrix; it is kept because the
# prediction step later passes it as `originalMatrix = matrix`.
matrix <- create_matrix(
  tweets[, 1],
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE,
  stemWords = FALSE
)

# build the data to specify response variable, training set, testing set.
# as.numeric(as.factor(.)) recodes the flags as integers starting at 1.
# Rows 1-827 are used for training and rows 828-897 for testing; these indices
# are hard-coded and must match the row counts of the input files.
container <- create_container(
  matrix,
  as.numeric(as.factor(tweets[, 2])),
  trainSize = 1:827,
  testSize = 828:897,
  virgin = FALSE
)

# train models with several learning algorithms
models <- train_models(
  container,
  algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE")
)

# use the trained models to classify the test rows
results <- classify_models(container, models)

# Install maxent into the private library only when it is missing.
if (!requireNamespace("maxent", lib.loc = "/local/XXXX/Rpackages",
                      quietly = TRUE)) {
  install.packages("maxent", lib = "/local/XXXX/Rpackages")
}

# Attach the text-mining packages from the private library.
library(NLP, lib.loc = "/local/XXXX/Rpackages")

library(tm, lib.loc = "/local/XXXX/Rpackages")

library(maxent, lib.loc = "/local/XXXX/Rpackages")

# Re-run the classification now that the packages above are attached.
# NOTE(review): presumably the earlier classify_models() call failed without
# maxent loaded -- confirm, and drop one of the two calls if it is redundant.
results <- classify_models(container, models)

# accuracy table

# True labels of the test rows (828-897). The recoding is done on the full
# label column first and subset afterwards, so the codes match the ones given
# to create_container() even if the test rows happen to contain one class only.
actual_labels <- as.numeric(as.factor(tweets[, 2]))[828:897]

# Predicted-label columns of `results`, one per trained algorithm.
label_columns <- c(
  "FORESTS_LABEL",
  "MAXENTROPY_LABEL",
  "TREE_LABEL",
  "BAGGING_LABEL",
  "SVM_LABEL"
)

# Confusion table (actual vs. predicted) per algorithm. Explicit print() is
# required because values are not auto-printed inside a for loop.
for (label_column in label_columns) {
  cat(label_column, "\n")
  print(table(actual = actual_labels, predicted = results[, label_column]))
}

# recall accuracy

for (label_column in label_columns) {
  cat(label_column, "\n")
  print(recall_accuracy(actual_labels, results[, label_column]))
}

# model summary

# Combine the container and the classification results into an analytics object.
analytics <- create_analytics(container, results)

summary(analytics)

# First rows of the per-document summary slot.
head(slot(analytics, "document_summary"))

# Ensemble summary slot.
slot(analytics, "ensemble_summary")

# cross validation

# Number of folds passed to cross_validate().
N <- 4

# Fix the RNG seed once, before the four runs, so they are reproducible.
set.seed(2014)

cross_validate(container, N, "MAXENT")

cross_validate(container, N, "TREE")

cross_validate(container, N, "SVM")

cross_validate(container, N, "RF")

# write.csv(analytics@document_summary, "C:\\XXXX\\results_trained.csv")

# new data for prediction

predictionData <- read.csv("forPrediction.csv", header = TRUE,
                           stringsAsFactors = FALSE)

nrow(predictionData)

# create a prediction document term matrix

# Same private library as the rest of the script (path kept consistent).
library(RTextTools, lib.loc = "/local/XXXX/Rpackages")

# A matrix built WITHOUT originalMatrix would be overwritten below and never
# used, so that call is omitted; the prediction matrix is built against the
# training matrix via originalMatrix instead.
#
# create_matrix() errors when originalMatrix is supplied. Workaround from
# http://stackoverflow.com/questions/32513513/rtexttools-create-matrix-got-an-error
# trace(..., edit = TRUE) opens the function source in an editor; manually
# change "Acronym" to "acronym" on row 42, then save. This step is interactive.
trace("create_matrix", edit = TRUE)

# Selects the text by column name, so forPrediction.csv must contain a
# "comment" column.
predMatrix <- create_matrix(
  predictionData["comment"],
  originalMatrix = matrix,
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE,
  stemWords = FALSE
)

# create the corresponding container

predSize <- nrow(predictionData)

# The new data has no labels, so dummy zeros are supplied. seq_len() keeps the
# index vector empty when there are no rows (1:0 would yield c(1, 0)).
predictionContainer <- create_container(
  predMatrix,
  labels = rep(0, predSize),
  testSize = seq_len(predSize),
  virgin = FALSE
)

# predict

# Classify the new rows with the random-forest model only.
results <- classify_model(predictionContainer, models$RF)

nrow(results)

write.csv(results, "trial.csv") # write the predicted values

# NOTE(review): models[[1]] is the first trained model (MAXENT by the order
# given to train_models), and `[, 1:predSize]` selects COLUMNS using the row
# count -- confirm this call is intended; its result is only printed.
predict(models[[1]], predictionContainer@training_matrix[, 1:predSize],
        decision.values = TRUE, probability = TRUE)

#######################################

 

 

 

 

 

 

 

Leave a Comment