Data Visualization
Data Visualization
>gg + coord_flip()
# Use the mpg dataset for this one. It is available in the ggplot2 package
>plot(g)
#se sets the confidence interval. By default, it is TRUE (means 95% CI)
# This shows a simple chart of highway mileage (hwy) against the engine displacement (displ) for the
whole dataset
Facet Wrap
## The facet_wrap() is used to break down a large plot into multiple small plots for individual
categories.
## By default, all the plots share the same scale in both X and Y axis. We can set them free by
setting scales='free' but this way it could be harder to compare between groups.
From Fig-1, most 2 seater cars have higher engine displacement while the
minivan and compact vehicles are on the lower side. This is evident from where
the points are placed along the X-axis.
From Fig-2, the highway mileage drops across all segments as the engine
displacement increases. This drop seems more pronounced in compact and
subcompact vehicles.
Facet Grid
>plot(g1)
Facet Grid
>plot(g2)
Layout both these charts in the sample panel
>install.packages(“gridExtra”)
>library(gridExtra)
# Correlation matrix
>mtcars
>data(mtcars)
>corr = round(cor(mtcars), 1)
# Plot
ggcorrplot(corr, hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method="circle",
colors = c("tomato2", "white", "springgreen3"),
title="Correlogram of mtcars",
ggtheme=theme_bw)
Diverging bars
library(ggplot2)
theme_set(theme_bw())
# Data Prep
data("mtcars") # load data
mtcars$`car name` <- rownames(mtcars) # create new column for car names
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg))/sd(mtcars$mpg), 2) # compute normalized mpg
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above") # above / below avg flag
mtcars <- mtcars[order(mtcars$mpg_z), ] # sort
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`) # convert to factor to retain sorted order in plot.
# Diverging Barcharts
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) +
geom_bar(stat='identity', aes(fill=mpg_type), width=.5) +
scale_fill_manual(name="Mileage",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
labs(subtitle="Normalised mileage from 'mtcars'",
title= "Diverging Bars") +
coord_flip()
Diverging Lollipop Chart
library(ggplot2)
theme_set(theme_bw())
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) +
geom_point(stat='identity', fill="black", size=6) +
geom_segment(aes(y = 0,
x = `car name`,
yend = mpg_z,
xend = `car name`),
color = "black") +
geom_text(color="white", size=2) +
labs(title="Diverging Lollipop Chart",
subtitle="Normalized mileage from 'mtcars': Lollipop") +
ylim(-2.5, 2.5) +
coord_flip()
Area Chart
Area charts are typically used to visualize how a particular metric (such as % returns from a stock) performed compared to a certain baseline.
Other types of %returns or %change data are also commonly used. The geom_area() implements this.
library(ggplot2)
install.packages("quantmod")
library(quantmod)
data("economics", package = "ggplot2")
# Compute % Returns
economics$returns_perc <- c(0, diff(economics$psavert)/economics$psavert[-length(economics$psavert)])
# Create break points and labels for axis ticks
brks <- economics$date[seq(1, length(economics$date), 12)]
lbls <- lubridate::year(economics$date[seq(1, length(economics$date), 12)])
# Plot
ggplot(economics[1:100, ], aes(date, returns_perc)) +
geom_area() +
scale_x_date(breaks=brks, labels=lbls) +
theme(axis.text.x = element_text(angle=90)) +
labs(title="Area Chart",
subtitle = "Perc Returns for Personal Savings",
y="% Returns for Personal savings",
caption="Source: economics")
Density plot
library(ggplot2)
theme_set(theme_classic())
# Plot
g <- ggplot(mpg, aes(cty))
g + geom_density(aes(fill=factor(cyl)), alpha=0.8) +
labs(title="Density plot",
subtitle="City Mileage Grouped by Number of cylinders",
caption="Source: mpg",
x="City Mileage",
fill="# Cylinders")
Population Pyramid
Population pyramids offer a unique way of visualizing how much population or what percentage of population fall
under a certain category. The below pyramid is an excellent example of how many users are retained at each
stage of an email marketing campaign funnel.
library(ggplot2)
install.packages("ggthemes")
library(ggthemes)
options(scipen = 999) # turns of scientific notations like 1e+40
# Read data
email_campaign_funnel <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")
# X Axis Breaks and Labels
brks <- seq(-15000000, 15000000, 5000000)
lbls = paste0(as.character(c(seq(15, 0, -5), seq(5, 15, 5))), "m")
# Plot
ggplot(email_campaign_funnel, aes(x = Stage, y = Users, fill = Gender)) + # Fill column
geom_bar(stat = "identity", width = .6) + # draw the bars
scale_y_continuous(breaks = brks, # Breaks
labels = lbls) + # Labels
coord_flip() + # Flip axes
labs(title="Email Campaign Funnel") +
theme_tufte() + # Tufte theme from ggfortify
theme(plot.title = element_text(hjust = .5),
axis.ticks = element_blank()) + # Centre plot title
scale_fill_brewer(palette = "Dark2") # Color palette
Calendar Heatmap
When we want to see the variation, especially the highs and lows, of a metric like stock price, on an actual
calendar itself, the calendar heat map is a great tool. It emphasizes the variation visually over time rather than
the actual value itself.
This can be implemented using the geom_tile. But getting it in the right format has more to do with the data
preparation rather than the plotting itself.
library(ggplot2)
library(plyr)
library(scales)
library(zoo)
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv")
df$date <- as.Date(df$date) # format date
df <- df[df$year >= 2012, ] # filter reqd years
# Create Month Week
df$yearmonth <- as.yearmon(df$date)
df$yearmonthf <- factor(df$yearmonth)
df <- ddply(df,.(yearmonthf), transform, monthweek=1+week-min(week)) # compute week number of month
df <- df[, c("year", "yearmonthf", "monthf", "week", "monthweek", "weekdayf", "VIX.Close")]
head(df)
#> year yearmonthf monthf week monthweek weekdayf VIX.Close
#> 1 2012 Jan 2012 Jan 1 1 Tue 22.97
#> 2 2012 Jan 2012 Jan 1 1 Wed 22.22
#> 3 2012 Jan 2012 Jan 1 1 Thu 21.48
#> 4 2012 Jan 2012 Jan 1 1 Fri 20.63
#> 5 2012 Jan 2012 Jan 2 2 Mon 21.07
#> 6 2012 Jan 2012 Jan 2 2 Tue 20.69
# Plot
ggplot(df, aes(monthweek, weekdayf, fill = VIX.Close)) +
geom_tile(colour = "white") +
facet_grid(year~monthf) +
scale_fill_gradient(low="red", high="green") +
labs(x="Week of Month",
y="",
title = "Time-Series Calendar Heatmap",
subtitle="Yahoo Closing Price",
fill="Close")
Seasonal Plot
If we are working with a time series object of class ts or xts, we can view the seasonal fluctuations through a
seasonal plot drawn using forecast::ggseasonplot. Below is an example using the native AirPassengers and
nottem time series.
We can see the traffic increase in air passengers over the years along with the repetitive seasonal patterns in
traffic. Whereas Nottingham does not show an increase in overal temperatures over the years, but they definitely
follow a seasonal pattern.
library(ggplot2)
install.packages("forecast")
library(forecast)
theme_set(theme_classic())
# Subset data
nottem_small <- window(nottem, start=c(1920, 1), end=c(1925, 12)) # subset a smaller timewindow
# Plot1
ggseasonplot(AirPassengers) + labs(title="Seasonal plot: International Airline Passengers")
# Plot1
ggseasonplot(nottem_small) + labs(title="Seasonal plot: Air temperatures at Nottingham Castle")