This vignette replicates the figures found in “The q-q boxplot” (citation coming soon).
First load the ‘qqboxplot’ package and the relevant packages from the ‘tidyverse’.
Figure 1 is meant to be a quick comparison of boxplots, q-q plots, and q-q boxplots. It uses a random sample of genes from one autism and one control patient and determines if the log of the gene expression counts can be modeled as lognormal.
#set value for text size so consistent across figures (only applies to figure 1)
eltext <- 12
#q-q boxplot
qqbox <- expression_data %>%
ggplot(aes(specimen, log_count)) + geom_qqboxplot(varwidth = TRUE, notch = TRUE) +
ylab('logged normalized expression') + ggtitle("c) q-q boxplot") +
theme(plot.title=element_text(size=eltext, face="plain", hjust=0.5), axis.title.x = element_text(size=eltext), axis.title.y = element_text(size=eltext),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid.major = element_line(colour = "grey70"),
panel.grid.minor = element_line(colour="grey80"))
# regular boxplot
box <- expression_data %>%
ggplot(aes(specimen, log_count)) + geom_boxplot(varwidth = TRUE, notch = TRUE) +
ylab('logged normalized expression') + ggtitle('a) boxplot') +
theme(plot.title=element_text(size=eltext, face="plain", hjust=0.5), axis.title.x = element_text(size=eltext), axis.title.y = element_text(size=eltext),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid.major = element_line(colour = "grey70"),
panel.grid.minor = element_line(colour="grey80"))
override.shape <- c(16, 17)
#q-q plot
qq <- expression_data %>%
ggplot(aes(sample=log_count)) + geom_qq(aes(color=specimen, shape=specimen)) +
xlab('theoretical normal distribution') +
ylab('logged normalized expression') + ggtitle('b) q-q plot') +
labs(color="specimen") +
guides(color = guide_legend(override.aes = list(shape=override.shape)), shape=FALSE) +
theme(plot.title=element_text(size=eltext, face="plain", hjust=0.5), axis.title.x = element_text(size=eltext), axis.title.y = element_text(size=eltext),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid.major = element_line(colour = "grey70"),
panel.grid.minor = element_line(colour="grey80"),
legend.position = c(0.8, 0.2))
#> Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
#> 3.5.0.
#> ℹ Please use the `legend.position.inside` argument of `theme()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#>
#> combine
# combine the plots
gridExtra::grid.arrange(box, qq, qqbox, ncol=3)
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
Figure 2 in the paper shows boxplots for simulated t-distributions
(and one simulated normal distribution with mean=2). simulated_data
contains two columns, “y” and “group”.
“group” specifies the distribution the data (“y”) comes from.
simulated_data %>%
ggplot(aes(factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")), y=y)) +
geom_boxplot(notch=TRUE, varwidth = TRUE) +
xlab(NULL) +
ylab(NULL) +
theme(axis.text.x = element_text(angle = 23, size = 15), axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 3 is the same data, visualized with a q-q plot compared with the theoretical normal distribution.
override.shape <- c(16, 17, 15, 3, 7)
simulated_data %>% ggplot(aes(sample=y, color=factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")),
shape=factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")))) +
geom_qq() + geom_qq_line() + labs(color="distribution") +
xlab("Normal Distribution") +
ylab("Simulated Datasets") +
guides(color = guide_legend(override.aes = list(shape=override.shape)), shape=FALSE) +
theme(axis.title.x = element_text(size=15), axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 4 is the same data, visualized with q-q boxplots, compared with the theoretical normal distribution. Note in this figure that reference_dist = “norm” is chosen to specify that the normal distribution should be the reference distribution.
simulated_data %>%
ggplot(aes(factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")), y=y)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("reference: normal distribution") +
ylab(NULL) +
guides(color=FALSE) +
theme(axis.text.x = element_text(angle = 23, size = 15), axis.title.y = element_text(size=15),
axis.title.x = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
simulated data was created by running the following code:
Figure 5 shows the same data from Figure 4, but compared against a
simulated normal distribution, with mean=5 and variance=1. Note that the
reference dataset comparison_dataset
is a separate vector
and is not contained in the dataset simulated_data
.
simulated_data %>%
ggplot(aes(factor(group, levels=c("normal, mean=2", "t distribution, df=32", "t distribution, df=16", "t distribution, df=8", "t distribution, df=4")), y=y)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, compdata=comparison_dataset) +
xlab("reference: simulated normal dataset") +
ylab(NULL) +
theme(axis.text.x = element_text(angle = 23, size = 15), axis.title.y = element_text(size=15),
axis.title.x = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
The vector comparison_dataset
was simulated as
follows
Figure 6 uses dataset “indicators” from this package. Male, 2008 serves as a base case. Note that the deviation whiskers for Male 2008 are straight since it is being compared with itself.
comparison_data <- indicators %>% filter(year==2008 & `Series Code`=="SL.TLF.ACTI.1524.MA.NE.ZS")
indicators %>%
#change the labels in series name to shorter titles
mutate(`Series Name`= ifelse(
`Series Name`=="Labor force participation rate for ages 15-24, male (%) (national estimate)",
"Male ages 15-24",
"Female ages 15-24")) %>%
ggplot(aes(as.factor(year), y=indicator))+
geom_qqboxplot(notch=TRUE, varwidth = TRUE, compdata=comparison_data$indicator) +
xlab("Year") +
ylab("Country level labor force\nparticipation rate (%)") +
facet_wrap(~factor(`Series Name`, levels = c("Male ages 15-24", "Female ages 15-24"))) +
theme(strip.text = element_text(size=12), axis.text.x = element_text(size = 15), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=12),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
#> The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
Figure 7 is the same data visualized with a violin plot.
indicators %>%
#change the labels in series name to shorter titles
mutate(`Series Name`= ifelse(
`Series Name`=="Labor force participation rate for ages 15-24, male (%) (national estimate)",
"Male ages 15-24",
"Female ages 15-24")) %>%
ggplot()+
geom_violin(aes(x=factor(year),y=indicator),fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
geom_segment(aes(
x=match(factor(year),levels(factor(year)))-0.1,
xend=match(factor(year),levels(factor(year)))+0.1,
y=indicator,yend=indicator),
col='black'
) +
xlab("Year") +
ylab("Country level labor force\nparticipation rate (%)") +
facet_wrap(~factor(`Series Name`, levels = c("Male ages 15-24", "Female ages 15-24"))) +
theme(strip.text = element_text(size=12), axis.text.x = element_text(size = 15), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=12),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 8 looks at spike counts from the V1 region of the brain under various grating orientations shown to a anesthetized macaque. The question her is if the spike counts can for each orientation can be reasonably modeled by a normal distribution (i.e. is the firing rate high enough to be reasonably approximated by a Gaussian)
spike_data %>% filter(region=="V1") %>%
ggplot(aes(factor(orientation), nspikes)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
Same data, visualized with a bean plot.
spike_data %>% filter(region=="V1") %>%
ggplot()+
geom_violin(aes(x=factor(orientation),y=nspikes),fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
geom_segment(aes(
x=match(factor(orientation),levels(factor(orientation)))-0.1,
xend=match(factor(orientation),levels(factor(orientation)))+0.1,
y=nspikes,yend=nspikes),
col='black'
) +
xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 10 is the same setup but is looking at neurons in V2. Here we see that the firing rate is too low to be reasonably modeled as Gaussian.
spike_data %>% filter(region=="V2") %>%
ggplot(aes(factor(orientation), nspikes)) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
Same data as Figure 10, visualized with a bean plot.
spike_data %>% filter(region=="V2") %>%
ggplot()+
geom_violin(aes(x=factor(orientation),y=nspikes),fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
geom_segment(aes(
x=match(factor(orientation),levels(factor(orientation)))-0.1,
xend=match(factor(orientation),levels(factor(orientation)))+0.1,
y=nspikes,yend=nspikes),
col='black'
) +
xlab("orientation") +
ylab("spike count") +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14), axis.title.x = element_text(size=15),
axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))
Figure 12 look at the firing rate for a population of neurons and asks if the population can be reasonably modeled as lognormal. It appears that, in general, this is not a reasonable distributional assumption.
population_brain_data %>%
ggplot(aes(x=ecephys_structure_acronym, y=log(rate))) +
geom_qqboxplot(notch=TRUE, varwidth = TRUE, reference_dist="norm") +
xlab("Brain regions") +
ylab("Log firing rate") +
facet_wrap(~fr_type) +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14),
axis.title.x = element_text(size=15)
, axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"),
strip.text.x = element_text(size = 14))
#> Warning: The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
#> The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
#> The following aesthetics were dropped during statistical transformation: y.
#> ℹ This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
Same data as Figure 12, visualized with a violin plot.
population_brain_data %>%
ggplot(aes(x=ecephys_structure_acronym, y=log(rate))) +
geom_violin(fill='grey',trim=F, draw_quantiles = c(.25, .5, .75))+
xlab("Brain regions") +
ylab("Log firing rate") +
facet_wrap(~fr_type) +
theme(axis.text.x = element_text(size = 15), axis.text.y = element_text(size=14),
axis.title.x = element_text(size=15)
, axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"),
strip.text.x = element_text(size = 14))
library(scales)
# set grid
h_grid <- seq(from=-2, to=2, length.out = 101)
g_grid <- seq(from=-2, to=2, length.out = 101)
z_grid <- c(-.67448, .67448)
# transformation function
zfun <- function(h, g, z){
ifelse(g==0,
z*exp(h*z^2/2),
exp(h*z^2/2)*((exp(g*z)-1))/g)
}
# expand grid, calculate transformation of IQR. Group the data by g and h values
# and then calculate the IQR for the transformed data and the IQR for the normal
# distribution (the second is the same for all groups, but is calculated here
# to avoid hardcoding). Finally, compute the IQR ratio.
data <- as_tibble(expand.grid(h=h_grid, g=g_grid, z=z_grid)) %>%
mutate(zvals = zfun(h, g, z)) %>% group_by(h, g) %>%
summarize(iqr_mod = max(zvals) - min(zvals), iqr=max(z)-min(z)) %>%
ungroup() %>%
mutate(iqr_ratio = (iqr_mod) / iqr)
#> `summarise()` has grouped output by 'h'. You can override using the `.groups`
#> argument.
data %>%
ggplot(aes(g, h)) + geom_raster(aes(fill=iqr_ratio)) + scale_fill_gradient2(low=muted("blue"), mid="white", high=muted("red"), midpoint=1) +
guides(fill=guide_colorbar(title="iqr ratio")) +
theme(axis.title.x = element_text(size=15), axis.title.y = element_text(size=15),
panel.border = element_blank(), panel.background = element_rect(fill="white"),
panel.grid = element_line(colour = "grey70"))