# word cloud is easy
# The cleanDescription() helper below merges variants of a word into one
# canonical form (e.g. "games" and "gaming" both become "game") so they are
# counted together. Add further substitutions to the function as needed.
require(tm)
#' Normalise free-text survey responses for word-frequency counting.
#'
#' Lower-cases the text, strips non-word symbols, folds accented characters
#' to ASCII, and merges near-synonyms ("games"/"gaming" -> "game", etc.) so
#' variants are counted as one term.
#'
#' @param description Character vector of raw responses.
#' @param additional.stopwords Optional character vector of extra words to
#'   blank out after normalisation (default NULL = none).
#' @return Character vector the same length as `description`.
cleanDescription <- function(description, additional.stopwords = NULL) {
  description <- tolower(description)
  # Fold accented characters to plain ASCII *before* stripping non-word
  # characters, so they are normalised rather than deleted.
  description <- gsub("[\u00e5\u00e2]", "a", description)
  # Collapse every run of non-word characters (punctuation, symbols) to a
  # single space.
  description <- gsub("\\W+", " ", description)
  # \W treats underscore as a word character, so remove it separately.
  description <- gsub("_", " ", description)
  # Merge similar words into the commonly used form for counting purposes.
  # \b boundaries also catch words at the start/end of the string, which
  # space-delimited patterns like " games " would miss.
  description <- gsub("\\b(games|gaming)\\b", "game", description)
  description <- gsub("complicated", "complicate", description)
  description <- gsub("\\b(getting|got|gets)\\b", "get", description)
  # Honour the previously-unused additional.stopwords argument: blank out
  # any caller-supplied words (backward compatible — default NULL is a no-op).
  if (!is.null(additional.stopwords) && length(additional.stopwords) > 0) {
    pattern <- paste0("\\b(", paste(additional.stopwords, collapse = "|"), ")\\b")
    description <- gsub(pattern, " ", description)
  }
  description
}
#########this is the code for word cloud
#read data
# Read the survey data; keep text as character vectors, not factors.
MyData <- read.csv(file = "C:\\folder\\Desktop\\yourdata.csv",
                   stringsAsFactors = FALSE)
# Load the text-cleaning helper defined in cleanDescription.r.
source(file = "C:\\Users\\folder\\Desktop\\cleanDescription.r")
# Only the first column holds the free-text responses.
MyData <- MyData[, 1]
length(MyData)
# Keep an untouched copy of the raw comments before any cleaning.
DATA_.comments <- MyData
MyData <- tolower(MyData)
# Normalise the text (lower-case, strip symbols, merge similar words).
MyData <- cleanDescription(MyData)
# Stop words to exclude from the word-frequency counting.
O.stopwords <- c("aha", "hello")
ShortSAT.stopwords <- c("also")
OctFY15.stopwords <- c("always")
# Build a tm corpus and clean it up for counting. Each transformation
# below runs exactly once (the original script repeated removeNumbers,
# removePunctuation and the English stop-word removal two or three times
# with no added effect).
MyData <- Corpus(VectorSource(MyData))
MyData <- tm_map(MyData, content_transformer(tolower))
MyData <- tm_map(MyData, removeNumbers)
MyData <- tm_map(MyData, removePunctuation)
# One combined stop-word list: English defaults plus the survey-specific sets.
stop.en <- stopwords("english")
SurveyStopWords <- c(stop.en, O.stopwords, OctFY15.stopwords, ShortSAT.stopwords)
MyData <- tm_map(MyData, removeWords, SurveyStopWords)
# Reduce words to their stems so e.g. "running"/"runs" count as one term.
MyData <- tm_map(MyData, stemDocument)
# Collapse the whitespace left behind by the removals.
MyData <- tm_map(MyData, stripWhitespace)
# Term-document matrix: rows are terms, columns are documents.
DATA_Tdm <- TermDocumentMatrix(MyData)
# How many distinct terms survived the cleaning?
length(dimnames(DATA_Tdm)$Terms)
# Peek at a slice of the vocabulary.
word <- dimnames(DATA_Tdm)$Terms[100:150]
# Terms appearing at least 5 times across the corpus.
wordFreqTable <- findFreqTerms(DATA_Tdm, lowfreq = 5)
## Keyword frequency table, sorted most-frequent first.
termFrequency <- rowSums(as.matrix(DATA_Tdm))
termFrequency <- termFrequency[order(termFrequency, decreasing = TRUE)]
# Print the top 40 terms.
termFrequency[1:40]
library(ggplot2)
# Horizontal bar chart of the 25 most frequent words.
data <- data.frame(name = names(termFrequency)[1:25],
                   freq = termFrequency[1:25])
data <- data[order(data$freq), ]
# Lock the factor levels to the sorted order so ggplot keeps it.
data$name <- factor(data$name, levels = data$name)
ggplot(data = data, aes(x = name, y = freq)) +
  geom_bar(stat = "identity", fill = "red") +
  xlab("Frequent Words") +
  ylab("Frequency") +
  coord_flip()
# Save the full term-frequency table to disk.
write.csv(termFrequency, file = "C:\\Users\\folder\\Desktop\\nameit.csv")
# Word cloud ----------------------------------------------------------------
library(wordcloud)
library(RColorBrewer)
# wordFreqTable <- itemFrequency(verbWordList, type = "absolute")
# Dark2 palette: 8 qualitative colours that stay readable on white.
pal <- brewer.pal(8, "Dark2")
wordcloud(words = names(termFrequency), freq = termFrequency,
          max.words = 120, colors = pal, random.order = FALSE,
          scale = c(3.5, .5))