########### sentiment analysis ################
## Uses the RTextTools package; this approach works well.
# You need 3 CSV files (positive.csv, negative.csv and test.csv, as read
# below), each with the following format:
# positive
#   comment | flag |
#   like it | 0 |
#   good job | 0 |
#   great! | 0 |
# negative
#   comment | flag |
#   Disappointed that there are … | 1 |
#   You make a you | 1 |
#   Pretty difficult | 1 |
########################################
# Read the labelled positive/negative comments and the held-out test comments.
# rbind() below requires all three files to have the same column names.
# (String literals use plain ASCII quotes; typographic quotes do not parse.)
pos_tweets <- read.csv("positive.csv", header = TRUE, stringsAsFactors = FALSE)
neg_tweets <- read.csv("negative.csv", header = TRUE, stringsAsFactors = FALSE)
test_tweets <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)
# Print the row counts as a quick sanity check of each input.
nrow(pos_tweets)
nrow(neg_tweets)
nrow(test_tweets)
# Stack in the order positive, negative, test; the hard-coded row ranges used
# further down (1:827 for training, 828:897 for testing) index into this
# combined data frame.
tweets <- rbind(pos_tweets, neg_tweets, test_tweets)
nrow(tweets)
# build dtm
# Install RTextTools into the private library only when it is not already
# there, instead of re-installing it on every run of the script.
if (!requireNamespace("RTextTools", lib.loc = "/local/XXXX/Rpackages", quietly = TRUE)) {
  install.packages("RTextTools", lib = "/local/XXXX/Rpackages")
}
library(RTextTools, lib.loc = "/local/XXXX/Rpackages")
# Document-term matrix built from the first column (the comment text).
# NOTE(review): the name `matrix` shadows base::matrix; it is kept because the
# prediction section passes it as `originalMatrix`.
matrix <- create_matrix(tweets[, 1], language = "english", removeStopwords = FALSE,
                        removeNumbers = TRUE, stemWords = FALSE)
# build the data to specify response variable, training set, testing set.
# The row ranges are hard-coded; fail early with a clear message if the
# combined data is shorter than they assume.
# NOTE(review): presumably rows 1:827 are the positive + negative files and
# 828:897 the test file -- confirm against the nrow() output above.
stopifnot(nrow(tweets) >= 897)
container <- create_container(matrix, as.numeric(as.factor(tweets[, 2])),
                              trainSize = 1:827, testSize = 828:897, virgin = FALSE)
# Make sure the supporting packages are installed and attached BEFORE training
# and classifying. The original script attached them only after a first
# classify_models() call and then repeated that call.
# NOTE(review): presumably the first call failed until maxent was attached --
# confirm; attaching the packages up front makes a single call sufficient.
if (!requireNamespace("maxent", lib.loc = "/local/XXXX/Rpackages", quietly = TRUE)) {
  install.packages("maxent", lib = "/local/XXXX/Rpackages")
}
library(NLP, lib.loc = "/local/XXXX/Rpackages")
library(tm, lib.loc = "/local/XXXX/Rpackages")
library(maxent, lib.loc = "/local/XXXX/Rpackages")
# train models with several learning algorithms
models <- train_models(container, algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE"))
# use the trained models to classify the test rows held in the container
results <- classify_models(container, models)
# accuracy table
# Rows of `tweets` that were used as the test set when the container was built.
test_rows <- 828:897
# Encode the labels over the FULL column and then subset. Calling as.factor()
# on the test subset alone derives the levels from that subset, so the integer
# codes could differ from the ones used for training if a class is missing
# from the test rows.
actual_labels <- as.numeric(as.factor(tweets[, 2]))[test_rows]
# Confusion tables: actual label (rows) vs. predicted label (columns).
table(actual = actual_labels, predicted = results[, "FORESTS_LABEL"])
table(actual = actual_labels, predicted = results[, "MAXENTROPY_LABEL"])
table(actual = actual_labels, predicted = results[, "TREE_LABEL"])
table(actual = actual_labels, predicted = results[, "BAGGING_LABEL"])
table(actual = actual_labels, predicted = results[, "SVM_LABEL"])
#recall accuracy
recall_accuracy(actual_labels, results[, "FORESTS_LABEL"])
recall_accuracy(actual_labels, results[, "MAXENTROPY_LABEL"])
recall_accuracy(actual_labels, results[, "TREE_LABEL"])
recall_accuracy(actual_labels, results[, "BAGGING_LABEL"])
recall_accuracy(actual_labels, results[, "SVM_LABEL"])
# Summarise the classification results: overall summary, the first rows of the
# per-document summary, and the ensemble summary.
analytics <- create_analytics(container, results)
summary(analytics)
head(slot(analytics, "document_summary"))
slot(analytics, "ensemble_summary")
# cross validation
# N is the number of folds; the seed is set once, before the first call, so
# the order of the calls below matters for reproducibility.
N <- 4
set.seed(2014)
cross_validate(container, N, "MAXENT")
cross_validate(container, N, "TREE")
cross_validate(container, N, "SVM")
cross_validate(container, N, "RF")
#write.csv(analytics@document_summary, "C:\\XXXX\\results_trained.csv")
# new data for prediction
predictionData <- read.csv("forPrediction.csv", header = TRUE, stringsAsFactors = FALSE)
nrow(predictionData)
# create a prediction document term matrix
# NOTE(review): this library path differs from the "/local/XXXX/Rpackages"
# path used earlier in the script -- confirm which one is intended.
library(RTextTools, lib.loc = "/local/shared_wwcp/chefang/Rpackages")
# The prediction matrix must be built against the training matrix
# (originalMatrix = matrix) so that both share the same term columns. An
# earlier call without originalMatrix was immediately overwritten and has been
# dropped as dead code.
# create_matrix() fails with originalMatrix as shipped. trace(..., edit = TRUE)
# opens the source in an editor: manually change "Acronym" to "acronym" on
# row 42 and save to make it work. This step needs an interactive session.
## http://stackoverflow.com/questions/32513513/rtexttools-create-matrix-got-an-error
trace("create_matrix", edit = TRUE)
predMatrix <- create_matrix(predictionData["comment"], originalMatrix = matrix,
                            language = "english", removeStopwords = FALSE,
                            removeNumbers = TRUE, stemWords = FALSE)
# create the corresponding container
predSize <- nrow(predictionData)
# Dummy labels (all 0) are supplied because the new data is unlabelled.
# seq_len() is used instead of 1:predSize so an empty input does not yield the
# bogus index c(1, 0).
# NOTE(review): virgin = FALSE is kept from the original; for unlabelled data
# virgin = TRUE may be what is wanted -- confirm.
predictionContainer <- create_container(predMatrix, labels = rep(0, predSize),
                                        testSize = seq_len(predSize), virgin = FALSE)
# predict with the random-forest model
results <- classify_model(predictionContainer, models$RF)
nrow(results)
write.csv(results, "trial.csv") ## write predicted values; this finally works
# NOTE(review): this call looks experimental and probably wrong: models[[1]] is
# the first trained model (MAXENT, per the algorithm order used in training),
# the container was created without a trainSize, and the matrix COLUMNS are
# indexed by the number of documents. Kept as in the original; confirm intent
# or remove.
predict(models[[1]], predictionContainer@training_matrix[, 1:predSize],
        decision.values = TRUE, probability = TRUE)
#######################################