OK, so I have now done two iterations on a better way to visualize term frequencies using R, ggplot2 and plyr. The first was OK but ugly; the second was better but still ugly.
How to read it:
- Frequency is segmented into 20% quantiles
- The frequency is on the y axis
- Word size is proportional to frequency
- Words with similar frequency are in approximately alphabetical order from left to right.
- Color is still random (this could be better)
This one is now good enough that I will start using it in my own presentations and announce my retirement from the prestigious and highly-paid world of word-cloud improvement!
Here’s the code.
# Word-frequency plot: each word is drawn at its written frequency (y axis),
# sized by that frequency, and faceted into 20% frequency quantiles.
library(languageR)
library(plyr) # provides ddply(), used below
library(ggplot2)

# Get English word frequency data and keep a random sample of 200 rows.
data(english)
df <- english[, c("Word", "WrittenFrequency")]
df <- df[sample.int(NROW(df), 200), ]
df <- unique(df)
df$freq <- df$WrittenFrequency / sum(df$WrittenFrequency)

# qtiles: breaks for the 20% quantile facets.
# twotiles: finer breaks that bin words of similar frequency
# (a probability step of 10 / NROW(df), i.e. roughly 10 words per bin).
qtiles <- quantile(df$freq, seq(0, 1, 0.2))
twotiles <- quantile(df$WrittenFrequency, seq(0, 1, 10 / NROW(df)))

# Lookup table: break value and its percentage label ("20%" -> 20).
qdf <- data.frame(
  cut = qtiles,
  quantile = as.numeric(sub("%", "", names(qtiles), fixed = TRUE))
)

# include.lowest = TRUE keeps the minimum-frequency word in the first bin;
# without it cut() returns NA for that word and it is dropped further down.
df$qtilerange <- cut(
  df$freq,
  breaks = qtiles, labels = FALSE, include.lowest = TRUE
)
# unique() guards against cut() failing on duplicated breaks when many
# sampled words share the same frequency.
df$twotiles <- as.factor(cut(
  df$WrittenFrequency,
  breaks = unique(twotiles), labels = FALSE, include.lowest = TRUE
))

# Label each word with the upper bound of its quantile range (20, 40, ...).
df$quantile <- qdf[df$qtilerange + 1, "quantile"]
df$quantilecut <- qdf[df$qtilerange, "cut"]

# Sort by quantile, then order the factor levels by a descending row index
# so the highest quantile becomes the first level (the top facet).
df <- df[order(df$quantile), ]
df$quantile <- as.factor(df$quantile)
df$quantile <- reorder(df$quantile, rev(seq_len(NROW(df))))

# Colour is random: one of four colour classes per word.
df$WordColor <- factor(sample.int(4, NROW(df), replace = TRUE))
df <- df[!is.na(df$quantile), ]

# y-axis ticks: the 20th and 80th percentile of frequency within each
# quantile facet, plus the overall maximum.
ticks <- ddply(
  df, "quantile", summarize,
  ticks = stats::quantile(WrittenFrequency, c(0.2, 0.8))
)$ticks
ticks <- unique(round(c(max(df$WrittenFrequency), ticks), 2))

# Within each bin of similar-frequency words, lay the words out
# alphabetically from left to right. The whole row is reordered (not just
# Word) so every word keeps its own frequency, colour and quantile.
df <- ddply(df, "twotiles", function(bin) {
  bin <- bin[order(bin$Word), ]
  freq <- bin$WrittenFrequency
  data.frame(
    Word = bin$Word,
    WordColor = bin$WordColor,
    WrittenFrequency = freq,
    quantile = bin$quantile,
    x = seq(
      -min(freq) / mean(freq),
      max(freq) / mean(freq),
      length.out = length(freq)
    )
  )
})

# y axis: written frequency; x axis: layout position only (its text and
# title are hidden); facet strips show the quantile.
p <- ggplot(df, aes(x = x, y = WrittenFrequency))
p <- p + geom_text(
  aes(label = Word, size = WrittenFrequency, color = WordColor),
  family = "Courier", fontface = "bold"
)
p <- p + theme(
  axis.text.x = element_blank(),
  axis.title.x = element_blank(),
  panel.grid.major = element_blank()
)
p <- p + scale_y_continuous(breaks = ticks)
p <- p + facet_grid(
  quantile ~ .,
  scales = "free_y", space = "free", labeller = label_both
)
p +
  theme(
    strip.text.y = element_text(angle = 0, size = 15, hjust = 0.5, vjust = 0.5),
    axis.text.y = element_text(angle = 0, size = 15, hjust = 0.5, vjust = 0.5),
    axis.title.y = element_blank(),
    legend.text = element_blank(),
    legend.position = "none"
  ) +
  ggtitle("Word Frequency")