R Visualization ADA
R Visualization ADA
Visualizing Data
s.patra@iimkashipur.ac.in
# Sample data: the integers 1 through 10 and their natural logarithms.
x <- seq_len(10)
y <- log(x)

# Default base-graphics scatter plot of y against x.
plot(x, y)
2.0
1.5
y
1.0
0.5
0.0
2 4 6 8 10
# Display a chart of the available point shapes together with their numeric
# codes (the values accepted by `pch` / `shape`), using the ggpubr helper.
ggpubr::show_point_shapes()
0 1 2 3 4 5
6 7 8 9 10 11
12 13 14 15 16 17
18 19 20 21 22 23
24 25
2.0
1.5
y
1.0
0.5
0.0
2 4 6 8 10
# One text label per point, drawn beside the log curve.
labelset <- c("one", "two", "three", "four", "five",
              "six", "seven", "eight", "nine", "ten")

# Log curve: points overlaid on a dotted line (type = "o", lty = 3).
# `pch` and `col` are recycled, so points alternate between the two
# shapes/colours.
plot(x, y,
     pch = c(0, 18), cex = 1.5, col = c("red", "blue"),
     type = "o", lty = 3, lwd = 2,
     main = "Graph of y = log(x) vs Graph of y = x-1", col.main = "purple",
     xlab = "X Values", ylab = "Y Values")

# Labels are shifted one unit to the right of each point.
text(x + 1, y, labelset, col = "red")

# Second curve, y = x - 1, added to the existing plot.
lines(x, x - 1, col = "green", lty = 4, lwd = 2)

# Legend line types/widths must match the curves drawn above
# (log curve: lty = 3; x - 1: lty = 4; both lwd = 2).
legend("bottomright", inset = 0.05, c("Log", "minus 1"),
       lty = c(3, 4), lwd = 2, col = c("red", "green"))

# Horizontal reference lines.
# NOTE(review): the y-limits are taken from `y` (0 to about 2.3), so lines at
# 4 and 6 fall outside the plotting region -- confirm the intended heights or
# pass an explicit `ylim` to plot().
abline(h = c(4, 6), col = "orange", lty = 2)
ten
nine
eight
2.0
seven
six
five
1.5
four
Y Values
three
1.0
two
0.5
Log(x)
x−1
0.0
one
2 4 6 8 10
X Values
dev.off(): closes the specified graphics device (by default, the current device).
200
MntSweetProducts
100
200
MntSweetProducts
100
200
100
MntSweetProducts
0
0e+00 2e+05 4e+05 6e+05
Master PhD
200
100
0
0e+00 2e+05 4e+05 6e+05 0e+00 2e+05 4e+05 6e+05
Income
400
300
MntSweetProducts
200
100
400
300
200
100
MntSweetProducts
400
300
200
100
200
150
MntSweetProducts
100
50
200
150
MntSweetProducts
100
50
200
factor(Teenhome)
MntSweetProducts
0
1
2
size
100 2
8 10 12
log(Income)
# Four equivalent ways of declaring the same scatter plot. Aesthetic mappings
# may be given globally in ggplot(), locally in the geom, or split between
# the two. The point size is a fixed value, not a data mapping, so it is
# passed to geom_point() outside aes(); putting `size = 2` inside aes() would
# map a constant and add a spurious "size" legend.

## Specification 1: every mapping declared globally
ggplot(data = df, aes(x = Income,
                      y = MntSweetProducts,
                      col = factor(Teenhome))) +
  geom_point(size = 2)

## Specification 2: x/y global, colour mapped in the layer
ggplot(data = df, aes(x = Income, y = MntSweetProducts)) +
  geom_point(aes(col = factor(Teenhome)), size = 2)

## Specification 3: only x global, remaining mappings in the layer
ggplot(data = df, aes(x = Income)) +
  geom_point(aes(y = MntSweetProducts, col = factor(Teenhome)), size = 2)

## Specification 4: every mapping declared in the layer
ggplot(data = df) +
  geom_point(aes(x = Income,
                 y = MntSweetProducts,
                 col = factor(Teenhome)),
             size = 2)
200
150
MntSweetProducts
factor(Teenhome)
0
100
1
2
50
200
150
MntSweetProducts
factor(Teenhome)
0
100
1
2
50
200
150
MntSweetProducts
colour
100
red
50
200
150
MntSweetProducts
colour
100 lm
loess
50
# Bar chart of Income by Education. With stat = "identity" the y values are
# used directly as bar heights instead of row counts.
ggplot(data = df, mapping = aes(x = Education, y = Income)) +
  geom_bar(stat = "identity")
6e+07
4e+07
Income
2e+07
0e+00
# Histogram of Income for customers earning below 20000, in bins 2000 wide.
ggplot(
  data = filter(df, Income < 20000),
  mapping = aes(x = Income)
) +
  geom_histogram(binwidth = 2000, fill = "red", color = "blue", alpha = 0.9)
30
20
count
10
# Kernel density estimate of Income for customers earning below 20000.
ggplot(
  data = filter(df, Income < 20000),
  mapping = aes(x = Income)
) +
  geom_density(fill = "green", color = "#e9ecef", alpha = 0.8)
1.0e−04
7.5e−05
density
5.0e−05
2.5e−05
0.0e+00
# Box plot of Income per Education level (Income below 20000 only), with the
# individual observations overlaid as small jittered points.
ggplot(
  data = filter(df, Income < 20000),
  mapping = aes(x = Education, y = Income)
) +
  geom_boxplot() +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9)
20000
15000
Income
10000
5000
# Empirical cumulative distribution of Income (below 20000), one curve per
# number of teenagers at home.
ggplot(
  data = filter(df, Income < 20000),
  mapping = aes(x = Income, colour = factor(Teenhome))
) +
  stat_ecdf()
1.00
0.75
factor(Teenhome)
0
ecdf
0.50
1
2
0.25
0.00
9
No_Customer
# Box plot of Income per Education level (Income below 20000 only) with
# jittered observations, split into a grid of panels:
# rows = Teenhome, columns = Kidhome.
ggplot(
  data = filter(df, Income < 20000),
  mapping = aes(x = Education, y = Income)
) +
  geom_boxplot() +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  facet_grid(factor(Teenhome) ~ factor(Kidhome))
0 1 2
20000
15000
0
10000
5000
20000
15000
Income
1
10000
5000
20000
15000
2
10000
5000
2n Cycle Basic Graduation Master PhD 2n Cycle Basic Graduation Master PhD 2n Cycle Basic Graduation Master PhD
Education
# Stacked bar chart: Income by Education, bars filled by number of teenagers
# at home (stacking is the default position for bars).
# The stat name is written in its documented lowercase form, "identity".
df %>%
  ggplot(aes(x = Education, y = Income, fill = factor(Teenhome))) +
  geom_bar(stat = "identity")
6e+07
4e+07
factor(Teenhome)
Income
0
1
2
2e+07
0e+00
# Dodged bar chart: Income by Education, with one bar per Teenhome level
# placed side by side instead of stacked.
# The stat name is written in its documented lowercase form, "identity".
df %>%
  ggplot(aes(x = Education, y = Income, fill = factor(Teenhome))) +
  geom_bar(stat = "identity", position = "dodge")
6e+05
4e+05
factor(Teenhome)
Income
0
1
2
2e+05
0e+00
# Proportional (100%) stacked bar chart: within each Education level the bar
# is rescaled to height 1, showing each Teenhome level's share of Income.
# The stat name is written in its documented lowercase form, "identity".
df %>%
  ggplot(aes(x = Education, y = Income, fill = factor(Teenhome))) +
  geom_bar(stat = "identity", position = "fill")
1.00
0.75
factor(Teenhome)
Income
0
0.50
1
2
0.25
0.00
# Scatter plot of sweet-product spending against income, coloured by the
# number of teenagers at home, with custom x-axis breaks labelled in
# thousands.
# The x limits are set inside scale_x_continuous(): adding xlim() afterwards
# would replace the x scale and silently discard the custom breaks/labels.
ggplot(df, aes(x = Income, y = MntSweetProducts)) +
  geom_point(aes(col = factor(Teenhome)), size = 2) +
  scale_x_continuous(
    breaks = seq(0, 150000, 25000),
    labels = seq(0, 150, 25),
    limits = c(0, 115000)
  ) +
  ylim(c(0, 200))
200
150
MntSweetProducts
factor(Teenhome)
0
100
1
2
50
A continuous scale handles numeric data (where there is a continuous set of
numbers), whereas a discrete scale (scale_x_discrete()) handles categorical data such as factor levels or colour names.
# Scatter plot of sweet-product spending against income with custom axis
# breaks, title, subtitle, axis labels, legend title and caption.
# The x limits are set inside scale_x_continuous(): adding xlim() afterwards
# would replace the x scale and silently discard the custom breaks/labels
# that the "in thousand units" axis title relies on.
ggplot(df, aes(x = Income, y = MntSweetProducts)) +
  geom_point(aes(col = factor(Teenhome)), size = 2) +
  scale_x_continuous(
    breaks = seq(0, 150000, 25000),
    labels = seq(0, 150, 25),
    limits = c(0, 115000)
  ) +
  ylim(c(0, 200)) +
  labs(
    title = "Income vs Amount of Sweet Products Bought",
    subtitle = "Customer dataset",
    y = "Amount of sweet products",
    x = "Income (in thousand units)",
    color = "Teens at home",
    caption = "Customer Purchase Behaviour"
  )
200
150
Amount of sweet products
Teens at home
0
100
1
2
50
Name: R offers 657 built-in colour names. You can list all of them using colors().
rgb(red, green, blue, alpha): The rgb() function builds a colour from given
quantities of red, green and blue. An additional parameter (alpha) sets the
transparency. By default, all parameters range from 0 to 1.
Number: It is also possible to refer to a colour by its number. For instance, if you need
colour number 143, use colors()[143].
Hex code → All colors can be defined by their hex code. A hex code looks like this:
#69b3a2. To find the hex code of your colour, visit this colour picker.
Colour Libraries: RColorBrewer, paletteer, etc.
YlOrRd
YlOrBr
YlGnBu
YlGn
Reds
RdPu
Purples
PuRd
PuBuGn
PuBu
OrRd
Oranges
Greys
Greens
GnBu
BuPu
BuGn
Blues
Set3
Set2
Set1
Pastel2
Pastel1
Paired
Dark2
Accent
Spectral
RdYlGn
RdYlBu
RdGy
RdBu
PuOr
PRGn
PiYG
BrBG
library(RColorBrewer)
# Scatter plot of sweet-product spending against income using the
# ColorBrewer "Set1" palette for the Teenhome colours and the classic theme.
# The x limits are set inside scale_x_continuous(): adding xlim() afterwards
# would replace the x scale and silently discard the custom breaks/labels
# that the "in thousand units" axis title relies on.
ggplot(df, aes(x = Income, y = MntSweetProducts)) +
  geom_point(aes(col = factor(Teenhome)), size = 2) +
  scale_colour_brewer(palette = "Set1") +
  scale_x_continuous(
    breaks = seq(0, 150000, 25000),
    labels = seq(0, 150, 25),
    limits = c(0, 115000)
  ) +
  ylim(c(0, 200)) +
  labs(
    title = "Income vs Amount of Sweet Products Bought",
    subtitle = "Customer dataset",
    y = "Amount of sweet products",
    x = "Income (in thousand units)",
    color = "Teens at home",
    caption = "Customer Purchase Behaviour"
  ) +
  theme_classic()
200
150
Amount of sweet products
Teens at home
0
100
1
2
50
# Scatter plot of sweet-product spending against income with two dotted
# vertical reference lines at incomes of 35000 and 88000.
# The x limits are set inside scale_x_continuous(): adding xlim() afterwards
# would replace the x scale and silently discard the custom breaks/labels
# that the "in thousand units" axis title relies on.
ggplot(df, aes(x = Income, y = MntSweetProducts)) +
  geom_point(aes(col = factor(Teenhome)), size = 2) +
  scale_colour_brewer(palette = "Set1") +
  scale_x_continuous(
    breaks = seq(0, 150000, 25000),
    labels = seq(0, 150, 25),
    limits = c(0, 115000)
  ) +
  ylim(c(0, 200)) +
  # Use geom_hline(yintercept = ...) for horizontal lines instead.
  geom_vline(
    xintercept = c(35000, 88000),
    linetype = "dotted",
    color = "green",
    size = 1.5
  ) +
  labs(
    title = "Income vs Amount of Sweet Products Bought",
    subtitle = "Customer dataset",
    y = "Amount of sweet products",
    x = "Income (in thousand units)",
    color = "Teens at home",
    caption = "Customer Purchase Behaviour"
  ) +
  theme_classic()
200
150
Amount of sweet products
Teens at home
0
100
1
2
50
# Scatter plot of sweet-product spending against income with vertical
# reference lines and a single text annotation.
# Fixes relative to the slide version:
#   * removed the stray ")" after the text layer, which was a syntax error;
#   * the text is added with annotate(), which draws it once; geom_text()
#     with constant x/y/label would redraw it for every row of `df`;
#   * the x limits are set inside scale_x_continuous(), because adding xlim()
#     afterwards would replace the x scale and discard the custom
#     breaks/labels that the "in thousand units" axis title relies on.
ggplot(df, aes(x = Income, y = MntSweetProducts)) +
  geom_point(aes(col = factor(Teenhome)), size = 2) +
  scale_colour_brewer(palette = "Set1") +
  scale_x_continuous(
    breaks = seq(0, 150000, 25000),
    labels = seq(0, 150, 25),
    limits = c(0, 115000)
  ) +
  ylim(c(0, 200)) +
  # Use geom_hline(yintercept = ...) for horizontal lines instead.
  geom_vline(
    xintercept = c(35000, 88000),
    linetype = "dotted",
    color = "green",
    size = 1.5
  ) +
  annotate("text", x = 5000, y = 175, label = "Scatter plot") +
  labs(
    title = "Income vs Amount of Sweet Products Bought",
    subtitle = "Customer dataset",
    y = "Amount of sweet products",
    x = "Income (in thousand units)",
    color = "Teens at home",
    caption = "Customer Purchase Behaviour"
  ) +
  theme_classic()
200
Scatter plot
150
Amount of sweet products
Teens at home
0
100
1
2
50