Ok so I have now done two iterations on a better way to visualize term frequencies using R, ggplot2 and plyr. The first was ok but ugly, the second was better but still ugly.
How to read it:
- Frequency is segmented into 20% quantiles (quintiles), and each quantile is drawn as its own panel
- The frequency is on the y axis
- Word size is proportional to frequency
- Words with similar frequency are in approximately alphabetical order from left to right.
- Color is still random (this could be better)
This one is now good enough that I will start using it in my own presentations and announce my retirement from the prestigious and highly-paid world of word-cloud improvement!
Here’s the code.
library(languageR)
library(plyr)
# Build the plotting data: a random sample of words from languageR's
# `english` data set, binned by written frequency.
data(english)
df <- english[, c("Word", "WrittenFrequency")]
# Draw 200 rows (no seed is set, so every run picks different words) and
# drop any rows that came out identical.
df <- df[sample.int(NROW(df), 200), ]
df <- unique(df)
# Each word's share of the total sampled frequency.
df$freq <- df$WrittenFrequency / sum(df$WrittenFrequency)

# Break points of the 20% bands.
qtiles <- quantile(df$freq, seq(0, 1, .2))
# Break points of the finer bands of about ten words each. The probabilities
# are built with length.out so that they always end at exactly 1; stepping by
# 10 / NROW(df) stops short of 1 as soon as unique() has removed a row, which
# left the most frequent words outside every band. unique() guards cut()
# against tied break values.
n_bands <- max(1L, round(NROW(df) / 10))
twotiles <- unique(
  quantile(df$WrittenFrequency, seq(0, 1, length.out = n_bands + 1))
)

# Lookup table: each break value and the percentage it stands for, parsed
# from the "0%", "20%", ... names that quantile() attaches.
qdf <- data.frame(
  cut = qtiles,
  quantile = as.numeric(sub("%", "", names(qtiles), fixed = TRUE))
)

# include.lowest = TRUE keeps the minimum inside the first band. Without it
# cut() returns NA for a value equal to the lowest break, and the least
# frequent word was silently removed by the NA filter further down.
df$qtilerange <- cut(
  df$freq, breaks = qtiles, labels = FALSE, include.lowest = TRUE
)
df$twotiles <- as.factor(cut(
  df$WrittenFrequency, breaks = twotiles, labels = FALSE,
  include.lowest = TRUE
))

# Label every word with the upper percentage of its band and remember the
# band's lower break value.
df$quantile <- qdf[(df$qtilerange + 1), "quantile"]
df$quantilecut <- qdf[df$qtilerange, "cut"]
df <- df[order(df$quantile), ]
df$quantile <- as.factor(df$quantile)
# Rows are in ascending band order here, so scoring them with a descending
# index reverses the factor levels (highest band first).
df$quantile <- reorder(df$quantile, rev(seq_len(NROW(df))))
# Random colour group (1-4) per word.
df$WordColor <- factor(sample.int(4, NROW(df), replace = TRUE))
# Safety net: discard anything that still has no band.
df <- df[!is.na(df$quantile), ]

# y-axis ticks: the 20th and 80th percentile of WrittenFrequency within each
# band, plus the overall maximum. Round before de-duplicating so that two
# values that round to the same number do not yield a repeated tick.
ticks <- ddply(
  df, c("quantile"), summarize,
  ticks = quantile(WrittenFrequency, c(.2, .8))
)$ticks
ticks <- unique(round(c(max(df$WrittenFrequency), ticks), 2))
# Lay out the words of each fine frequency band from left to right in
# alphabetical order. Whole rows are reordered by Word so that every label
# keeps its own frequency, colour and quantile; sorting only the Word column
# (as summarize(Word = sort(Word), ...) does) pairs each word with another
# word's values. The result holds twotiles plus the five columns built below.
df <- ddply(df, c("twotiles"), function(band) {
  band <- band[order(band$Word), ]
  freq <- band$WrittenFrequency
  centre <- mean(freq)
  data.frame(
    Word = band$Word,
    WordColor = band$WordColor,
    WrittenFrequency = freq,
    quantile = band$quantile,
    # Evenly spaced horizontal positions, one per word in the band.
    x = seq(-min(freq) / centre, max(freq) / centre, length.out = length(freq))
  )
})
library(ggplot2)
# Draw every word as a text label: y is its WrittenFrequency, x is the
# position computed per band, label size follows WrittenFrequency and colour
# follows the random WordColor group. One panel per quantile, stacked
# vertically, each with its own y range.
#
# Uses theme()/element_*() and ggtitle(): opts(), theme_blank() and
# theme_text() have been removed from ggplot2. geom_text() takes `fontface`,
# not `face`.
p <- ggplot(df, aes(x = x, y = WrittenFrequency))
p <- p + geom_text(
  aes(label = Word, size = WrittenFrequency, color = WordColor),
  family = "Courier", fontface = "bold"
)
# The x positions carry no meaning of their own, so hide that axis.
p <- p + theme(
  axis.text.x = element_blank(),
  axis.title.x = element_blank(),
  panel.grid.major = element_blank()
)
p <- p + scale_y_continuous(breaks = ticks)
p <- p + facet_grid(
  quantile ~ ., scales = "free_y", space = "free", labeller = label_both
)
p +
  ggtitle("Word Frequency") +
  theme(
    strip.text.y = element_text(angle = 0, size = 15, hjust = 0.5, vjust = 0.5),
    axis.text.y = element_text(angle = 0, size = 15, hjust = 0.5, vjust = 0.5),
    axis.title.y = element_blank(),
    legend.text = element_blank(),
    legend.position = "none"
  )





