The following are instructions to download and manipulate the metadata from the "Mesophotic.org" repository in R (https://www.r-project.org/). Electronic notebook by Veronica Radice with contributions from Pim Bongaerts.
For these examples, please install (e.g. using install.packages("dplyr")) and load (as detailed below using require) the following packages for manipulating and viewing the data:
require(dplyr)
require(tidyr)
require(ggplot2)
require(ggstance)
require(ggpubr)
require(tidyverse)
require(cowplot)
You can import the latest database directly from the website into R:
# Download the current publication metadata straight from the website.
# `as.is = TRUE` keeps text columns as character vectors (no factors).
publs <- read.csv("http://www.mesophotic.org/publications.csv",
                  as.is = TRUE)
# Examine publications up until and including the year 2018. Filtering on
# `<= 2018` (rather than dropping only 2019) also excludes any later years
# present in the live database. The explicit `is.na()` guard keeps rows with
# a missing year from being turned into all-NA rows by the logical subscript.
publs <- publs[!is.na(publs$publication_year) &
                 publs$publication_year <= 2018, ]
Show table dimensions and column names:
# Number of rows (publications) and columns (metadata fields) in the table
dim(publs)
# Names of the metadata columns available for querying
names(publs)
Subset the publication list to those that are validated (i.e. metadata validated by two different content editors) or those that are included_in_stats (i.e. as validated but only including publications representing (peer-reviewed) scientific articles presenting original data from mesophotic depths):
# `read.csv()` type-converts columns that hold only true/false values to
# logical, and a logical compared against the string "true" never matches
# (TRUE is coerced to "TRUE"). Normalising each flag to lower-case text
# first makes the filters work whether the column arrives as logical or as
# character. `subset()` drops rows where the condition is NA.
publs_validated <- subset(publs,
                          tolower(as.character(validated)) == "true")
publs_for_stats <- subset(publs,
                          tolower(as.character(included_in_stats)) == "true")
Calculate the fold-increase in publications over the past 10 years (between 2008 and 2018):
# Publications per year among the validated records, plus the running
# (cumulative) total across years.
publs_by_year <- summarise(group_by(publs_validated, publication_year),
                           count = n())
publs_by_year <- mutate(publs_by_year, total_count = cumsum(count))
# Ratio of the cumulative total reached in 2018 to the one reached in 2008.
in_2018 <- publs_by_year$publication_year == 2018
in_2008 <- publs_by_year$publication_year == 2008
fold_increase <- publs_by_year[in_2018, "total_count"] /
  publs_by_year[in_2008, "total_count"]
fold_increase
Where fields have multiple values (through so-called “HABTM” associations), such as research_fields, the values are separated by semicolons, for example: Ecology; Community structure; Disturbances. Therefore, to query any of these fields, we first need to separate the values into individual rows. Then we can plot, for example, the number of publications for each of those fields:
# Split `fields` into new rows (one row per publication/field pair)
publs_fields <- separate_rows(publs_for_stats, "fields", sep = ";\\s+")
# Remove rows with no associated `fields`. The result is assigned back to
# `publs_fields` so that the counts below actually use the filtered rows.
publs_fields <- publs_fields[!(is.na(publs_fields$fields) |
                                 publs_fields$fields == ""), ]
# Group by `fields`, count the number of publications and keep the ten
# highest counts (the weighting column is named explicitly)
top10_fields <- publs_fields %>%
  group_by(fields) %>%
  summarise(count = n()) %>%
  top_n(10, count)
top10_fields$fields <- as.factor(top10_fields$fields)
top10_fields <- top10_fields %>% arrange(desc(count))
top10_fields
# Horizontal bar chart, bars ordered by publication count.
# NOTE(review): `theme_bw()` is added after `theme()`; as a complete theme it
# replaces the `axis.text.x` setting above - confirm whether the rotation is
# actually wanted.
bar_fields <- ggplot(data = top10_fields,
                     aes(x = count, y = reorder(fields, count))) +
  geom_barh(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Publications", y = "Fields") +
  theme_bw()
bar_fields
# Expand `focusgroups` so each publication/focus-group pair has its own row
publs_focusgroups <- separate_rows(publs_for_stats, "focusgroups",
                                   sep = ";\\s+")
# Keep only the rows that carry a non-empty focus group
has_focusgroup <- !is.na(publs_focusgroups$focusgroups) &
  publs_focusgroups$focusgroups != ""
publs_focusgroups <- publs_focusgroups[has_focusgroup, ]
# Total number of publication/focus-group records (rows)
countTotal_focusgroups <- nrow(publs_focusgroups)
# Number of records per focus group
count_focusgroups <- summarise(group_by(publs_focusgroups, focusgroups),
                               count = n())
# Share of each focus group among all records, in percent, rounded to one
# decimal place
percent_focusgroups <- mutate(
  count_focusgroups,
  percent = as.numeric(
    paste0(round(100 * count / countTotal_focusgroups, 1))
  )
)
# Ten largest shares, listed from most to least frequent
top10_focusgroups <- percent_focusgroups %>%
  top_n(10, percent) %>%
  arrange(desc(count)) %>%
  mutate(focusgroups = as.factor(focusgroups))
top10_focusgroups
# Horizontal bars showing the percentage, ordered by the underlying count.
# NOTE(review): `theme_bw()` comes after `theme()` and, being a complete
# theme, replaces the `axis.text.x` setting - confirm the intended look.
bar_focusgroups <- ggplot(
  data = top10_focusgroups,
  aes(x = percent, y = reorder(focusgroups, count))
) +
  geom_barh(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Percent", y = "Focus Groups") +
  theme_bw()
bar_focusgroups
# Expand `platforms` so each publication/platform pair has its own row
publs_platforms <- separate_rows(publs_for_stats, "platforms",
                                 sep = ";\\s+")
# Keep only the rows that carry a non-empty platform
has_platform <- !is.na(publs_platforms$platforms) &
  publs_platforms$platforms != ""
publs_platforms <- publs_platforms[has_platform, ]
# Publications per platform, reduced to the ten highest counts and listed
# from most to least frequent
top10_platforms <- summarise(group_by(publs_platforms, platforms),
                             count = n())
top10_platforms <- top10_platforms %>%
  top_n(10) %>%
  mutate(platforms = as.factor(platforms)) %>%
  arrange(desc(count))
# Horizontal bar chart, bars ordered by publication count.
# NOTE(review): `theme_bw()` comes after `theme()` and, being a complete
# theme, replaces the `axis.text.x` setting - confirm the intended look.
bar_platforms <- ggplot(
  data = top10_platforms,
  aes(x = count, y = reorder(platforms, count))
) +
  geom_barh(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Publications", y = "Platforms") +
  theme_bw()
bar_platforms
# Expand `locations` so each publication/location pair has its own row
publs_locations <- separate_rows(publs_for_stats, "locations",
                                 sep = ";\\s+")
# Keep only the rows that carry a non-empty location
has_location <- !is.na(publs_locations$locations) &
  publs_locations$locations != ""
publs_locations <- publs_locations[has_location, ]
# Publications per location, reduced to the ten highest counts and listed
# from most to least frequent
top10_locations <- summarise(group_by(publs_locations, locations),
                             count = n())
top10_locations <- top10_locations %>%
  top_n(10) %>%
  mutate(locations = as.factor(locations)) %>%
  arrange(desc(count))
# Horizontal bar chart, bars ordered by publication count.
# NOTE(review): `theme_bw()` comes after `theme()` and, being a complete
# theme, replaces the `axis.text.x` setting - confirm the intended look.
bar_locations <- ggplot(
  data = top10_locations,
  aes(x = count, y = reorder(locations, count))
) +
  geom_barh(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Publications", y = "Locations") +
  theme_bw()
bar_locations
# Optional: write the combined figure to a PDF by uncommenting the `pdf()`
# call below together with the matching `dev.off()` at the end. Both lines
# of the wrapped call are commented out; leaving the continuation line
# uncommented would be a syntax error.
#pdf("Fig category bargraphs.pdf", width = 9, height = 6,
#    encoding="MacRoman")
# Arrange the four bar charts in a 2 x 2 grid with panel labels a) to d)
plot <- plot_grid(bar_focusgroups,
                  bar_fields,
                  bar_platforms,
                  bar_locations,
                  labels = c('a)', 'b)', 'c)', 'd)'),
                  hjust = -1,
                  vjust = 1,
                  align = 'vh',
                  scale = c(1, 1, 1, 1),
                  label_size = 12,
                  ncol = 2,
                  label_fontface = "plain")
plot
#dev.off()
Data for the visualizations on the http://mesophotic.org/stats page can be downloaded as CSVs and manipulated too. For example:
# Read the "publications over depth" CSV from the working directory,
# leaving text columns as character vectors.
pubs_over_depth <- read.csv(file = "pubs_over_depth.csv",
                            stringsAsFactors = FALSE)
# Keep only the `Publications` and `Category` columns, in that order.
pubs_over_depth <- data.frame(
  pubs_over_depth$Publications,
  pubs_over_depth$Category
)
Publications by max depth
# Give the two columns their plotting names
names(pubs_over_depth) <- c("Publications", "Depth")
#pdf("Fig Publications over depth.pdf", width = 8, height = 6, encoding="MacRoman")
# Area chart of publications against depth. The x axis is reversed and
# limited to 150-30; the y axis is limited to 0-500.
g <- ggplot(pubs_over_depth, aes(x = Depth, y = Publications)) +
  geom_area(color = "#69b3a2", fill = "#69b3a2") +
  scale_x_reverse(limits = c(150, 30), breaks = seq(150, 30, -8)) +
  scale_y_continuous(limits = c(0, 500), breaks = seq(0, 500, 50)) +
  xlab("Depth (m)") +
  ylab("Publications") +
  theme_classic()
# Display the chart with `rotate()` applied
g + rotate()
#dev.off()
Import data Publications over time
# Read the "publications over time" CSV from the working directory
pubs_over_time <- read.csv(file = "pubs_over_time.csv",
                           stringsAsFactors = FALSE)
names(pubs_over_time) <- c("year", "value")
# Tag every row with the series it belongs to, stored as a factor
pubs_over_time <- pubs_over_time %>%
  mutate(category = "Publication") %>%
  mutate(category = as.factor(category))
pubs_over_time
Import data Locations (by Year) over time
# Read the "locations over time" CSV from the working directory
locations_over_time <- read.csv(file = "locations_over_time.csv",
                                stringsAsFactors = FALSE)
names(locations_over_time) <- c("year", "value")
# Tag every row with the series it belongs to, stored as a factor
locations_over_time <- locations_over_time %>%
  mutate(category = "Study location") %>%
  mutate(category = as.factor(category))
locations_over_time
Import data First Authors (by Year) over time
# Read the "people over time" CSV from the working directory
# (unique first authors)
people_over_time <- read.csv(file = "people_over_time.csv",
                             stringsAsFactors = FALSE)
names(people_over_time) <- c("year", "value")
# Tag every row with the series it belongs to, stored as a factor
people_over_time <- people_over_time %>%
  mutate(category = "First author") %>%
  mutate(category = as.factor(category))
people_over_time
Import data term "mesophotic" (by Year) over time
# Read the cumulative mentions of the term "mesophotic" from the working
# directory
mesophotic_over_time <- read.csv(file = "mesophotic-mentions-cumulative.csv",
                                 stringsAsFactors = FALSE)
# Drop the third column (presumably leaving the year and the cumulative
# count - confirm against the CSV) before renaming
mesophotic_over_time <- mesophotic_over_time[-3]
names(mesophotic_over_time) <- c("year", "value")
# Tag every row with the series it belongs to, stored as a factor
mesophotic_over_time <- mesophotic_over_time %>%
  mutate(category = "Term 'mesophotic'") %>%
  mutate(category = as.factor(category))
mesophotic_over_time
Merge data that look at counts (Pubs, Location, etc) over time (by Year)
# Combine the yearly series into one long table by stacking them as rows
# (`people_over_time` could be added to the list in the same way)
data_over_time <- bind_rows(list(locations_over_time,
                                 pubs_over_time,
                                 mesophotic_over_time))
# Make sure the series label is a factor after stacking
data_over_time[["category"]] <- as.factor(data_over_time[["category"]])
data_over_time
Publications and Locations over time Stacked Area chart
# Designate a specific order for the series (could add "First author")
data_over_time$category <- factor(
  data_over_time$category,
  levels = c("Publication", "Study location", "Term 'mesophotic'")
)
# colorblind friendly palette
# http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
# cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cbPalette <- c("#0072B2", "#56B4E9", "#999999", "#E69F00") # "#E69F00", "#009E73",
# alternate color palettes via color brewer
# scale_fill_brewer(palette="Greens")
#pdf("Fig Series of data over time.pdf", width = 8, height = 5, encoding="MacRoman")
# Area chart of all series over the years 1990-2018, one fill colour per
# series, with the legend anchored in the top-left corner of the panel.
ggplot(data_over_time, aes(x = year, y = value, fill = category)) +
  geom_area(colour = "black", size = .3, alpha = .4) +
  scale_fill_manual(values = cbPalette) +
  scale_x_continuous(name = "Year", breaks = seq(1990, 2018, 4),
                     limits = c(1990, 2018)) +
  ylab("Number") +
  labs(fill = "Series") +
  theme_classic() +
  theme(legend.title = element_text(),
        legend.justification = c(0, 1),
        legend.position = c(0, 1),
        # `margin()` takes the four sides as separate scalars (t, r, b, l);
        # a single length-4 vector would all be passed as `t`.
        legend.box.margin = margin(5, 5, 5, 5))
#dev.off()