Machine Learning - Data visualization with R (I)

Machine Learning - Data visualization with R (I)

This article presents different techniques that can be used to communicate data or information by encoding it in graphs.


Histogram

A histogram approximates the probability distribution of a variable by showing how many observations fall within each range (bin) of values.

# Load required packages
# Install a package only when it is not already available, so re-running the
# script does not re-download everything from CRAN each time.
required_packages <- c("ggplot2", "ggExtra", "qcc", "gridExtra", "ggpubr")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}


# A single categorical variable
# geom_bar() counts the rows in each category by default, so neither an
# explicit stat nor a y mapping is needed.
ggplot(iris[iris$Petal.Length > 4, ], aes(x = Species)) +
  geom_bar(color = "white", fill = rgb(0.2, 0.7, 0.1, 0.4))
# A single quantitative variable
# Bin the continuous values into ranges; counting each distinct value
# separately (stat = "count") would not produce a histogram.
ggplot(iris[iris$Petal.Length > 2, ], aes(x = Petal.Length)) +
  geom_histogram(bins = 20, color = "white", fill = rgb(0.2, 0.7, 0.1, 0.4))
# A quantitative variable and a categorical variable.
# Engine displacement in 20 bins, with each bar filled by vehicle class
# using the "Spectral" brewer palette.
g <- ggplot(data = mpg, mapping = aes(displ)) +
  scale_fill_brewer(palette = "Spectral")
g +
  geom_histogram(
    mapping = aes(fill = class),
    bins = 20,
    col = "black",
    size = .1
  ) +
  labs(
    title = "Histogram with Fixed Bins",
    subtitle = "Engine Displacement across Vehicle Classes"
  )
# Two categorical variables.
# Bar chart of row counts per manufacturer, with bars filled by vehicle class.
g <- ggplot(mpg, aes(manufacturer)) + scale_fill_brewer(palette = "Spectral")
g + geom_bar(aes(fill = class), col = "black", width = 0.5) +
  # Tilt the manufacturer names on the x axis.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Histogram on Categorical Variable",
    subtitle = "Manufacturer across Vehicle Classes"
  )
# A quantitative variable with a categorical variable
# One base-graphics histogram of sepal width per species, drawn side by side,
# each overlaid with a kernel density estimate.
species_list <- split(iris, iris$Species)
panel_colours <- c("cyan", "green3", "yellow")
# par() returns the previous settings so the panel layout can be restored.
old_par <- par(mfrow = c(1, length(species_list)))
for (i in seq_along(species_list)) {
  sepal_width <- species_list[[i]]$Sepal.Width
  # probability = TRUE scales the bars to densities, hence the y label.
  hist(sepal_width, probability = TRUE,
       main = "", xlab = "Sepal Width (cm)", ylab = "Density",
       col = panel_colours[i])
  mtext(toupper(names(species_list)[i]),
        side = 3, # 1 is bottom, 2 is left, 3 is top, 4 is right
        line = 0, # do not shift up or down
        font = 2) # 2 is bold
  lines(density(sepal_width), col = "black", lwd = 3)
}
# Restore the layout so later base plots are not squeezed into three columns.
par(old_par)
# A quantitative variable with a categorical variable in a single chart
# Columns are named directly in aes() so they are looked up in `iris`.
# position = "identity" overlays the species instead of stacking them, and
# the bars are translucent so overlapping ones stay visible.
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_histogram(alpha = 0.5, position = "identity", bins = 30)


Scatter Plot With Marginal Histograms

# Two quantitative variables
# Scatter plot of petal length against petal width; ggMarginal() then adds
# a histogram of each variable along the plot margins.
p <- ggplot(data = iris, mapping = aes(x = Petal.Length, y = Petal.Width))
p <- p + geom_point(cex = 2, shape = 1, stroke = 1)
ggExtra::ggMarginal(p, type = "histogram")
# Two quantitative variables and a categorical variable
# Species is mapped to colour by its column name, so it is looked up in the
# plot's own data.
p <- ggplot(iris, aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  labs(colour = "Species") +
  geom_point(cex = 2, shape = 1, stroke = 1) +
  theme(
    legend.key = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank()
  )
ggExtra::ggMarginal(p, type = "histogram")


Pareto Chart

A Pareto chart is a type of chart that contains both bars and a line graph. Bars represent individual values sorted in descending order, whereas the line represents the cumulative total.

# A single categorical variable
# summary() on the Species factor yields the per-level counts to chart.
qcc::pareto.chart(summary(iris$Species))


Box Plot

A box plot is a method for graphically depicting quantitative data through their quartiles. Box plots also have lines extending vertically from the boxes indicating variability outside the upper and lower quartiles. Outliers may be plotted as individual points. 

# Layers shared by both box plots: translucent boxes and no legend.
box_layers <- list(
  geom_boxplot(alpha = 0.3),
  theme(legend.position = "none")
)
# A single quantitative variable
# Constant "" values for x and fill yield a single box for all rows.
ggplot(data = mpg, mapping = aes(x = "", y = hwy, fill = "")) +
  box_layers
# A quantitative variable with a categorical variable
# One box of highway mileage per vehicle class, filled by that class.
ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
  box_layers
# A quantitative variable with two categorical variables

# Create a data frame
# 7 varieties x 40 observations each = 280 rows.
variety <- rep(LETTERS[1:7], each = 40)
# 20 "high" then 20 "low"; data.frame() recycles this 40-element pattern
# across all 280 rows, i.e. once per variety.
treatment <- rep(c("high", "low"), each = 20)
# Row index (an upward trend) plus random integer noise drawn from 1..150.
note <- seq_len(280) + sample(1:150, 280, replace = TRUE)
data <- data.frame(variety, treatment, note)

# Create box plot
# Boxes of `note` per variety, filled by treatment.
treatment_boxes <- ggplot(
  data = data,
  mapping = aes(x = variety, y = note, fill = treatment)
) +
  geom_boxplot()
treatment_boxes
# A quantitative variable with two categorical variables
# One box per treatment
treatment_boxes + facet_wrap(~treatment)
# Or... one box per variety
# scales = "free" lets every facet use its own axis ranges. The argument is
# spelled out in full instead of relying on partial matching of `scale`.
ggplot(data, aes(x = variety, y = note, fill = treatment)) +
  geom_boxplot() +
  facet_wrap(~variety, scales = "free")
# A quantitative variable with a categorical variable
# Box plot with the individual observations jittered on top; written with
# ggplot() layers because qplot() is deprecated.
ggplot(data, aes(x = variety, y = note, fill = variety)) +
  geom_boxplot() +
  geom_jitter()

# A quantitative variable with two categorical variables
ggplot(data, aes(x = variety, y = note, fill = treatment)) +
  geom_boxplot() +
  geom_jitter()
# Shared base for both violin plots: variety (as a factor) on x, note on y.
violin_base <- ggplot(data = data, mapping = aes(factor(variety), note))

# A quantitative variable with a categorical variable
violin_base + geom_violin(mapping = aes(fill = variety))

# A quantitative variable with two categorical variables
violin_base + geom_violin(mapping = aes(fill = treatment))


Scatter Plot With Marginal Box Plots

# Two quantitative variables
# Scatter plot of petal length against petal width; ggMarginal() then adds
# a box plot of each variable along the plot margins.
p <- ggplot(data = iris, mapping = aes(x = Petal.Length, y = Petal.Width))
p <- p + geom_point(cex = 2, shape = 1, stroke = 1)
ggExtra::ggMarginal(p, type = "boxplot")
# Two quantitative variables and a categorical variable
# Species is mapped to colour by its column name, so it is looked up in the
# plot's own data.
p <- ggplot(iris, aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  labs(colour = "Species") +
  geom_point(cex = 2, shape = 1, stroke = 1) +
  theme(
    legend.key = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank()
  )
ggExtra::ggMarginal(p, type = "boxplot")


Density Plot

# Translucent density layer reused by both plots below.
density_layer <- geom_density(alpha = 0.55)
# A quantitative variable
ggplot(data = iris, mapping = aes(Petal.Length, fill = "")) +
  density_layer
# A quantitative variable with a categorical variable
# One density curve per species, outlined and filled by species.
ggplot(
  data = iris,
  mapping = aes(Petal.Length, colour = Species, fill = Species)
) +
  density_layer
# Density of each iris measurement by species, arranged in one grid.
# Columns are named directly in aes() so they are looked up in `iris`.
l1 <- ggplot(iris, aes(Petal.Width, colour = Species, fill = Species)) +
  geom_density(alpha = 0.55) + theme_bw() + xlab("Petal.Width")

l2 <- ggplot(iris, aes(Petal.Length, colour = Species, fill = Species)) +
  geom_density(alpha = 0.55) + theme_bw() + xlab("Petal.Length")

l3 <- ggplot(iris, aes(Sepal.Width, colour = Species, fill = Species)) +
  geom_density(alpha = 0.55) + theme_bw() + xlab("Sepal.Width")

l4 <- ggplot(iris, aes(Sepal.Length, colour = Species, fill = Species)) +
  geom_density(alpha = 0.55) + theme_bw() + xlab("Sepal.Length")

# All four panels share a single legend placed below the grid.
ggarrange(l1, l2, l3, l4, common.legend = TRUE, legend = "bottom")


Scatter Plot With Marginal Density Plots

# Two quantitative variables
# 2 x 2 layout: x density (top-left), empty filler (top-right),
# scatter plot (bottom-left), y density (bottom-right).
# No aesthetic produces a legend here, so no legend placement is needed.
scatterPlot <- ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point()

xdensity <- ggplot(iris, aes(x = Petal.Length, fill = "")) +
  geom_density(alpha = .5) +
  theme(legend.position = "none")

# Flipped so Petal.Width runs vertically, matching the y axis of the scatter
# plot that this panel sits beside.
ydensity <- ggplot(iris, aes(x = Petal.Width, fill = "")) +
  geom_density(alpha = .5) +
  coord_flip() +
  theme(legend.position = "none")

# Empty panel for the top-right corner: every visible theme element is blanked.
blankPlot <- ggplot() + geom_blank(aes(1, 1)) +
  theme(plot.background = element_blank(),
   panel.grid.major = element_blank(),
   panel.grid.minor = element_blank(),
   panel.border = element_blank(),
   panel.background = element_blank(),
   axis.title.x = element_blank(),
   axis.title.y = element_blank(),
   axis.text.x = element_blank(),
   axis.text.y = element_blank(),
   axis.ticks = element_blank()
   )

# The scatter plot gets the large cell; the density panels get the narrow ones.
grid.arrange(xdensity, blankPlot, scatterPlot, ydensity,
        ncol = 2, nrow = 2, widths = c(4, 1.4), heights = c(1.4, 4))
# Two quantitative variables and a categorical variable
# 2 x 2 layout: x density (top-left), empty filler (top-right),
# scatter plot (bottom-left), y density (bottom-right).
# All three plots use rainbow(3), so each species keeps the same colour.
scatterPlot <- ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point() +
  scale_color_manual(values = rainbow(3)) +
  # Anchor the legend at the top-left corner of the plot.
  theme(legend.position = c(0, 1), legend.justification = c(0, 1))

xdensity <- ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = rainbow(3)) +
  theme(legend.position = "none")

# Flipped so Petal.Width runs vertically, matching the y axis of the scatter
# plot that this panel sits beside.
ydensity <- ggplot(iris, aes(x = Petal.Width, fill = Species)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = rainbow(3)) +
  coord_flip() +
  theme(legend.position = "none")

# Empty panel for the top-right corner: every visible theme element is blanked.
blankPlot <- ggplot() + geom_blank(aes(1, 1)) +
  theme(plot.background = element_blank(),
   panel.grid.major = element_blank(),
   panel.grid.minor = element_blank(),
   panel.border = element_blank(),
   panel.background = element_blank(),
   axis.title.x = element_blank(),
   axis.title.y = element_blank(),
   axis.text.x = element_blank(),
   axis.text.y = element_blank(),
   axis.ticks = element_blank()
   )

# The scatter plot gets the large cell; the density panels get the narrow ones.
grid.arrange(xdensity, blankPlot, scatterPlot, ydensity,
        ncol = 2, nrow = 2, widths = c(4, 1.4), heights = c(1.4, 4))

To be continued...

Share your experience and provide feedback!

To view or add a comment, sign in

Insights from the community

Others also viewed

Explore topics