Everyday R code (9)

# word cloud is easy

# The cleanDescription.r file shown below is required. It collapses similar
# words into one canonical form so they are counted together (for example,
# "games" and "gaming" both become "game"). Add as many such rules as you need.

require(tm)

# Normalise free-text descriptions before word-frequency counting.
#
# Args:
#   description: character vector of raw text.
#   additional.stopwords: accepted for backward compatibility; it is not
#     used by this function.
#
# Returns:
#   A character vector of the same length as `description`, lower-cased,
#   with runs of non-word characters and underscores replaced by a single
#   space and a few word variants mapped to one canonical form.
cleanDescription <- function(description, additional.stopwords = NULL) {
  # Lower-case first so every later pattern only has to match one case.
  description <- tolower(description)

  # Fold accented letters to ASCII *before* stripping non-word characters;
  # otherwise, in a locale where they do not count as letters, they would
  # already have been turned into spaces. Unicode escapes keep this file
  # independent of its own encoding.
  description <- gsub("[\u00e5\u00e2]", "a", description)

  # Replace each run of non-word characters with a single space.
  description <- gsub("\\W+", " ", description)

  # "_" counts as a word character for \W, so it needs its own pass.
  description <- gsub("_", " ", description, fixed = TRUE)

  # Map word variants to one form for counting. Word boundaries (\b) make
  # the rules fire at the start/end of a string too, and stop them from
  # rewriting the inside of longer words (e.g. "gothic", "gotten").
  description <- gsub("\\b(games|gaming)\\b", "game", description)
  description <- gsub("complicated", "complicate", description)
  description <- gsub("\\b(getting|got|gets)\\b", "get", description)

  description
}

#########this is the code for word cloud

# ---- Read the data ----
# Load the CSV; strings are kept as character so the text can be cleaned.
MyData <- read.csv(
  file = "C:\\folder\\Desktop\\yourdata.csv",
  stringsAsFactors = FALSE
)

# Load cleanDescription() from its own file.
source(file = "C:\\Users\\folder\\Desktop\\cleanDescription.r")

# Keep only the first column (assumed to hold the free-text comments --
# adjust the index if your file is laid out differently).
MyData <- MyData[[1]]
length(MyData)

# Copy of the column as read, taken before any cleaning.
DATA_.comments <- MyData

MyData <- tolower(MyData)

# Apply the text normalisation defined in cleanDescription.r.
MyData <- cleanDescription(MyData)

# ---- Stop words and corpus normalisation ----
# Extra stop words to exclude from the word-frequency counts, on top of
# tm's built-in English list.
O.stopwords <- c("aha", "hello")
ShortSAT.stopwords <- c("also")
OctFY15.stopwords <- c("always")

stop.en <- stopwords("english")
SurveyStopWords <- c(
  stop.en, O.stopwords, OctFY15.stopwords, ShortSAT.stopwords
)

# Build the corpus and normalise it. Each transformation is applied once:
# repeating removeWords / removeNumbers / removePunctuation on text that has
# already been through them changes nothing, and SurveyStopWords already
# contains the English list.
MyData <- Corpus(VectorSource(MyData))
MyData <- tm_map(MyData, stripWhitespace)
MyData <- tm_map(MyData, content_transformer(tolower))
MyData <- tm_map(MyData, removeNumbers)
MyData <- tm_map(MyData, removePunctuation)
MyData <- tm_map(MyData, removeWords, SurveyStopWords)

# Stem after stop-word removal so the stop words are matched in their
# unstemmed form. NOTE(review): stemDocument presumably needs the SnowballC
# package to be installed -- confirm.
MyData <- tm_map(MyData, stemDocument)

# Removing words leaves gaps; collapse them again.
MyData <- tm_map(MyData, stripWhitespace)

# ---- Term-document matrix and term frequencies ----
DATA_Tdm <- TermDocumentMatrix(MyData)

# Number of distinct terms.
length(dimnames(DATA_Tdm)$Terms)

# Terms 100 to 150, a sample for eyeballing. Positions that do not exist
# are skipped instead of producing NA entries when there are fewer terms.
word <- dimnames(DATA_Tdm)$Terms
word <- word[seq_along(word) %in% 100:150]

# Terms that occur at least 5 times.
wordFreqTable <- findFreqTerms(DATA_Tdm, lowfreq = 5)

## Keyword frequency table
# Total count of each term across all documents, most frequent first.
# as.matrix() builds a dense terms-by-documents matrix, which can be large
# for big corpora.
termFrequency <- rowSums(as.matrix(DATA_Tdm))
termFrequency <- termFrequency[order(termFrequency, decreasing = TRUE)]

# Top 40 terms; head() avoids NA padding when there are fewer than 40.
head(termFrequency, 40)

# ---- Bar chart of the most frequent terms, and CSV export ----
library(ggplot2)

# Up to 25 most frequent terms; head() avoids NA rows when fewer exist.
data <- data.frame(
  name = names(head(termFrequency, 25)),
  freq = head(termFrequency, 25),
  stringsAsFactors = FALSE
)

# Sort ascending and fix the factor levels in that order so that, after
# coord_flip(), the most frequent term is drawn at the top.
data <- data[order(data$freq), ]
data$name <- factor(data$name, levels = data$name)

ggplot(data = data, aes(x = name, y = freq)) +
  geom_bar(stat = "identity", fill = "red") +
  xlab("Frequent Words") +
  ylab("Frequency") +
  coord_flip()

# Write the full frequency table to a file.
write.csv(termFrequency, file = "C:\\Users\\folder\\Desktop\\nameit.csv")

# ---- Word cloud ----
# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(wordcloud)
library(RColorBrewer)

# wordFreqTable <- itemFrequency(verbWordList, type = "absolute")

# Eight-colour "Dark2" palette from RColorBrewer.
pal <- brewer.pal(8, "Dark2")

# Draw at most 120 words; random.order = FALSE plots them in decreasing
# frequency.
wordcloud(
  words = names(termFrequency),
  freq = termFrequency,
  max.words = 120,
  colors = pal,
  random.order = FALSE,
  scale = c(3.5, 0.5)
)

Leave a Comment