In order to test the package I retrieved the titles of the XKCD web comics included in my RXKCD package and produced a word cloud based on the titles' word frequencies calculated using the powerful tm package for text mining (I know, it is like killing a fly with a bazooka!).
# Word cloud of XKCD comic titles --------------------------------------------
#
# Reads the data file shipped in the "xkcd" directory of the RXKCD package,
# computes word frequencies over column 3 with tm, and writes the resulting
# word cloud to "wordcloud.png" in the current working directory.
library(RXKCD)
library(tm)
library(wordcloud)
library(RColorBrewer)

# Locate the data bundled with RXKCD.
path <- system.file("xkcd", package = "RXKCD")
datafiles <- list.files(path)
# read.csv() takes exactly one path; stop early with a clear message if the
# directory is missing/empty or holds more than one file.
stopifnot(length(datafiles) == 1L)
xkcd.df <- read.csv(file.path(path, datafiles))

# Column 3 is used as the text (the comic titles, per the accompanying
# article -- verify against the RXKCD data layout).
# VectorSource is used instead of DataframeSource because recent tm releases
# only accept data frames with `doc_id` and `text` columns; as.character()
# also covers the case where read.csv() returned a factor.
xkcd.corpus <- Corpus(VectorSource(as.character(xkcd.df[, 3])))

# Lower-case first and drop stop words *before* stripping punctuation:
# the English stop-word list contains contractions such as "don't", which
# would no longer match once the apostrophe has been removed.
xkcd.corpus <- tm_map(xkcd.corpus, content_transformer(tolower))
xkcd.corpus <- tm_map(xkcd.corpus, removeWords, stopwords("english"))
xkcd.corpus <- tm_map(xkcd.corpus, removePunctuation)

# Term frequencies, most frequent first.
tdm <- TermDocumentMatrix(xkcd.corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# Drop the two lightest shades of the 9-colour "BuGn" palette.
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:2)]

png("wordcloud.png", width = 1280, height = 800)
wordcloud(
  d$word, d$freq,
  scale = c(8, .3), min.freq = 2, max.words = 100,
  random.order = TRUE, rot.per = .15, colors = pal,
  vfont = c("sans serif", "plain")
)
dev.off()
As a second example, inspired by this post from the eKonometrics blog, I created a word cloud from the descriptions of the 3177 R packages available at http://cran.r-project.org/web/packages.
# Word cloud of CRAN package descriptions -------------------------------------
#
# Downloads the "available packages by date" table from CRAN, computes word
# frequencies over column 3 with tm, and writes the resulting word cloud to
# "wordcloud_packages.png" in the current working directory.
# library() is used instead of require() so a missing package stops the script
# immediately instead of failing later with an unrelated error.
library(XML)
library(tm)
library(wordcloud)
library(RColorBrewer)

u <- "http://cran.r-project.org/web/packages/available_packages_by_date.html"
# First table found on the page.
t <- readHTMLTable(u)[[1]]

# Column 3 is used as the text (the package descriptions, per the
# accompanying article -- verify against the current page layout).
# VectorSource is used instead of DataframeSource because recent tm releases
# only accept data frames with `doc_id` and `text` columns.
ap.corpus <- Corpus(VectorSource(as.character(t[, 3])))

# Lower-case first and drop stop words *before* stripping punctuation:
# the English stop-word list contains contractions such as "don't", which
# would no longer match once the apostrophe has been removed.
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
ap.corpus <- tm_map(ap.corpus, removeWords, stopwords("english"))
ap.corpus <- tm_map(ap.corpus, removePunctuation)

# NOTE: the corpus is deliberately NOT re-wrapped with
# Corpus(VectorSource(ap.corpus)); doing so coerces the document objects to
# strings and adds their metadata fields to the term matrix as spurious words.
ap.tdm <- TermDocumentMatrix(ap.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m), decreasing = TRUE)
ap.d <- data.frame(word = names(ap.v), freq = ap.v)
# Distribution of term frequencies (useful for choosing min.freq below).
table(ap.d$freq)

pal2 <- brewer.pal(8, "Dark2")
png("wordcloud_packages.png", width = 1280, height = 800)
wordcloud(
  ap.d$word, ap.d$freq,
  scale = c(8, .2), min.freq = 3, max.words = Inf,
  random.order = FALSE, rot.per = .15, colors = pal2
)
dev.off()
As a third example, thanks to Jim's comment, I take advantage of Duncan Temple Lang's RNYTimes package to access user-generated content on the NY Times and produce a word cloud of today's comments on articles.
Caveat: in order to use the RNYTimes package you need an API key from The New York Times, which you can get by registering with The New York Times Developer Network (free of charge) from here.
# Word cloud of today's New York Times reader comments ------------------------
#
# Fetches community content through the RNYTimes package, extracts the text of
# the returned HTML body, computes word frequencies with tm, and writes the
# resulting word cloud to "wordcloud_NewYorkTimes_Community.png" in the
# current working directory. Requires a New York Times API key.
library(XML)
library(tm)
library(wordcloud)
library(RColorBrewer)

# Install RNYTimes from Omegahat only when it is not already available.
# The package name must be passed as a string: a bare `packageName` refers to
# the utils::packageName() function, not to a package.
if (!requireNamespace("RNYTimes", quietly = TRUE)) {
  install.packages(
    "RNYTimes",
    repos = "http://www.omegahat.org/R",
    type = "source"
  )
}
library(RNYTimes)

my.key <- "your API key here"

# Request string of the form "by-date/YYYY-MM-DD" for the current date.
what <- paste("by-date", format(Sys.time(), "%Y-%m-%d"), sep = "/")
# Alternative request:
# what="recent"
recent.news <- community(what = what, key = my.key)

# Parse the response as HTML; the empty error handler discards parser
# error reports so that malformed markup does not abort the script.
pagetree <- htmlTreeParse(
  recent.news,
  error = function(...) {},
  useInternalNodes = TRUE
)
x <- xpathSApply(pagetree, "//*/body", xmlValue)

# do some clean up with regular expressions
x <- unlist(strsplit(x, "\n"))                        # one entry per line
x <- gsub("\t", "", x)                                # drop tabs
x <- sub("^[[:space:]]*(.*?)[[:space:]]*$", "\\1", x, perl = TRUE)  # trim
x <- x[!(x %in% c("", "|"))]                          # drop empty/separator

# VectorSource is used instead of DataframeSource because recent tm releases
# only accept data frames with `doc_id` and `text` columns.
ap.corpus <- Corpus(VectorSource(as.character(x)))

# Lower-case first and drop stop words *before* stripping punctuation:
# the English stop-word list contains contractions such as "don't", which
# would no longer match once the apostrophe has been removed.
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
ap.corpus <- tm_map(ap.corpus, removeWords, stopwords("english"))
ap.corpus <- tm_map(ap.corpus, removePunctuation)

# Term frequencies, most frequent first.
ap.tdm <- TermDocumentMatrix(ap.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m), decreasing = TRUE)
ap.d <- data.frame(word = names(ap.v), freq = ap.v)
# Distribution of term frequencies (useful for choosing min.freq below).
table(ap.d$freq)

pal2 <- brewer.pal(8, "Dark2")
png("wordcloud_NewYorkTimes_Community.png", width = 1280, height = 800)
wordcloud(
  ap.d$word, ap.d$freq,
  scale = c(8, .2), min.freq = 2, max.words = Inf,
  random.order = FALSE, rot.per = .15, colors = pal2
)
dev.off()