This post is complementary material for my talk at rstudio::conf 2022 (which is happening TODAY!!!)!
I’m writing this really quickly, from the hotel room, for anyone interested in seeing the code used to create the plot that is in my talk:
The plot uses data from the GitHub API, collected with the help of the packages `gh` and `tidyverse`. The plot was created using `ggplot2`, and shows the Actions runs per month on GitHub repositories in organizations related to RStudio/Posit.
First, I had to look for the organizations. This was the list of organizations related to RStudio/Posit that I found on GitHub:
There might be more that I don’t know about!
So, here is a vector with the names of the organizations:
# GitHub organizations related to RStudio/Posit whose repositories are analysed.
rstudio_orgs <- c(
  "rstudio", "tidyverse", "r-lib",
  "mlverse", "tidymodels", "quarto-dev"
)
Then, I created an R function that looks for all public repositories in an organization:
# List every public repository of a GitHub organization.
#
# Args:
#   organisation: login of the organization, e.g. "tidyverse".
#
# Returns:
#   A tibble with one row per repository. `id_repo` is the position of the
#   repository in the combined API response; the remaining columns are the
#   flattened fields of the API response, with names cleaned by
#   janitor::clean_names(). An organization without public repositories
#   yields an empty tibble.
find_repos_github_org <- function(organisation) {
  # get info about the org
  gh_org <- gh::gh("GET /orgs/{org}", org = organisation)
  number_of_public_repos <- gh_org$public_repos

  # Nothing to download; pivot_wider() below needs the name/value columns
  # that only exist when at least one repository was fetched.
  if (number_of_public_repos == 0) {
    return(tibble::tibble())
  }

  # The endpoint is requested with 100 repositories per page. seq_len() is
  # used instead of 1:n so a page count of 0 can never become c(1, 0).
  pages_to_iterate <- seq_len(ceiling(number_of_public_repos / 100))

  repos_org_list <- purrr::map(
    pages_to_iterate,
    function(page) {
      gh::gh(
        "GET /orgs/{org}/repos",
        org = organisation,
        type = "public",
        sort = "updated",
        per_page = 100,
        page = page
      )
    }
  )

  repos_org_df <- repos_org_list |>
    # pages -> one flat list of repositories
    purrr::flatten() |>
    # each repository -> named atomic vector (nested fields are flattened)
    purrr::map(unlist, recursive = TRUE) |>
    # long format: id_repo / name / value
    purrr::map_dfr(tibble::enframe, .id = "id_repo") |>
    # one row per repository, one column per field
    tidyr::pivot_wider() |>
    janitor::clean_names()

  # Return visibly; ending on the assignment returned the value invisibly.
  repos_org_df
}
And then I executed that function for all organizations using `purrr`:
# Fetch the repositories of every organization and row-bind the results.
rstudio_repositories <- purrr::map_dfr(rstudio_orgs, find_repos_github_org)
This is what I found (click to unfold the glimpsed result):
Code
# Print a compact overview of the columns (output pasted below as comments).
rstudio_repositories |> dplyr::glimpse()
# Rows: 599
# Columns: 114
# $ id_repo <chr> "1", "2", "3", "4", …
# $ id <chr> "390115983", "168794…
# $ node_id <chr> "MDEwOlJlcG9zaXRvcnk…
# $ name <chr> "py-shiny", "chromot…
# $ full_name <chr> "rstudio/py-shiny", …
# $ private <chr> "FALSE", "FALSE", "F…
# $ owner_login <chr> "rstudio", "rstudio"…
# $ owner_id <chr> "513560", "513560", …
# $ owner_node_id <chr> "MDEyOk9yZ2FuaXphdGl…
# $ owner_avatar_url <chr> "https://avatars.git…
# $ owner_gravatar_id <chr> "", "", "", "", "", …
# $ owner_url <chr> "https://api.github.…
# $ owner_html_url <chr> "https://github.com/…
# $ owner_followers_url <chr> "https://api.github.…
# $ owner_following_url <chr> "https://api.github.…
# $ owner_gists_url <chr> "https://api.github.…
# $ owner_starred_url <chr> "https://api.github.…
# $ owner_subscriptions_url <chr> "https://api.github.…
# $ owner_organizations_url <chr> "https://api.github.…
# $ owner_repos_url <chr> "https://api.github.…
# $ owner_events_url <chr> "https://api.github.…
# $ owner_received_events_url <chr> "https://api.github.…
# $ owner_type <chr> "Organization", "Org…
# $ owner_site_admin <chr> "FALSE", "FALSE", "F…
# $ html_url <chr> "https://github.com/…
# $ description <chr> "Shiny for Python", …
# $ fork <chr> "FALSE", "FALSE", "F…
# $ url <chr> "https://api.github.…
# $ forks_url <chr> "https://api.github.…
# $ keys_url <chr> "https://api.github.…
# $ collaborators_url <chr> "https://api.github.…
# $ teams_url <chr> "https://api.github.…
# $ hooks_url <chr> "https://api.github.…
# $ issue_events_url <chr> "https://api.github.…
# $ events_url <chr> "https://api.github.…
# $ assignees_url <chr> "https://api.github.…
# $ branches_url <chr> "https://api.github.…
# $ tags_url <chr> "https://api.github.…
# $ blobs_url <chr> "https://api.github.…
# $ git_tags_url <chr> "https://api.github.…
# $ git_refs_url <chr> "https://api.github.…
# $ trees_url <chr> "https://api.github.…
# $ statuses_url <chr> "https://api.github.…
# $ languages_url <chr> "https://api.github.…
# $ stargazers_url <chr> "https://api.github.…
# $ contributors_url <chr> "https://api.github.…
# $ subscribers_url <chr> "https://api.github.…
# $ subscription_url <chr> "https://api.github.…
# $ commits_url <chr> "https://api.github.…
# $ git_commits_url <chr> "https://api.github.…
# $ comments_url <chr> "https://api.github.…
# $ issue_comment_url <chr> "https://api.github.…
# $ contents_url <chr> "https://api.github.…
# $ compare_url <chr> "https://api.github.…
# $ merges_url <chr> "https://api.github.…
# $ archive_url <chr> "https://api.github.…
# $ downloads_url <chr> "https://api.github.…
# $ issues_url <chr> "https://api.github.…
# $ pulls_url <chr> "https://api.github.…
# $ milestones_url <chr> "https://api.github.…
# $ notifications_url <chr> "https://api.github.…
# $ labels_url <chr> "https://api.github.…
# $ releases_url <chr> "https://api.github.…
# $ deployments_url <chr> "https://api.github.…
# $ created_at <chr> "2021-07-27T20:19:49…
# $ updated_at <chr> "2022-07-27T23:47:04…
# $ pushed_at <chr> "2022-07-27T20:48:42…
# $ git_url <chr> "git://github.com/rs…
# $ ssh_url <chr> "git@github.com:rstu…
# $ clone_url <chr> "https://github.com/…
# $ svn_url <chr> "https://github.com/…
# $ homepage <chr> "https://shiny.rstud…
# $ size <chr> "7850", "16032", "31…
# $ stargazers_count <chr> "19", "125", "1532",…
# $ watchers_count <chr> "19", "125", "1532",…
# $ language <chr> "JavaScript", "R", "…
# $ has_issues <chr> "TRUE", "TRUE", "TRU…
# $ has_projects <chr> "TRUE", "TRUE", "FAL…
# $ has_downloads <chr> "TRUE", "TRUE", "TRU…
# $ has_wiki <chr> "TRUE", "TRUE", "FAL…
# $ has_pages <chr> "TRUE", "TRUE", "FAL…
# $ forks_count <chr> "0", "18", "335", "1…
# $ archived <chr> "FALSE", "FALSE", "F…
# $ disabled <chr> "FALSE", "FALSE", "F…
# $ open_issues_count <chr> "48", "27", "44", "1…
# $ license_key <chr> "mit", NA, NA, "othe…
# $ license_name <chr> "MIT License", NA, N…
# $ license_spdx_id <chr> "MIT", NA, NA, "NOAS…
# $ license_url <chr> "https://api.github.…
# $ license_node_id <chr> "MDc6TGljZW5zZTEz", …
# $ allow_forking <chr> "TRUE", "TRUE", "TRU…
# $ is_template <chr> "FALSE", "FALSE", "F…
# $ web_commit_signoff_required <chr> "FALSE", "FALSE", "F…
# $ visibility <chr> "public", "public", …
# $ forks <chr> "0", "18", "335", "1…
# $ open_issues <chr> "48", "27", "44", "1…
# $ watchers <chr> "19", "125", "1532",…
# $ default_branch <chr> "main", "main", "mai…
# $ permissions_admin <chr> "FALSE", "FALSE", "F…
# $ permissions_maintain <chr> "FALSE", "FALSE", "F…
# $ permissions_push <chr> "FALSE", "FALSE", "F…
# $ permissions_triage <chr> "FALSE", "FALSE", "F…
# $ permissions_pull <chr> "TRUE", "TRUE", "TRU…
# $ topics1 <chr> NA, NA, "blog-engine…
# $ topics2 <chr> NA, NA, "blogdown", …
# $ topics3 <chr> NA, NA, "hugo", NA, …
# $ topics4 <chr> NA, NA, "r", NA, NA,…
# $ topics5 <chr> NA, NA, "rmarkdown",…
# $ topics6 <chr> NA, NA, "rstudio", N…
# $ topics7 <chr> NA, NA, "website-gen…
# $ topics <chr> NA, NA, NA, NA, NA, …
# $ topics8 <chr> NA, NA, NA, NA, NA, …
# $ topics9 <chr> NA, NA, NA, NA, NA, …
# $ topics10 <chr> NA, NA, NA, NA, NA, …
Then, I filtered the repositories to remove the repositories that are forks, in order to see only the original repos:
# Keep only the repositories that are not forks. `fork` is a character column
# in the glimpsed result; the comparison still works because R coerces FALSE
# to "FALSE" before comparing.
original_rstudio_repos <- dplyr::filter(rstudio_repositories, fork == FALSE)
At this point, I had a tibble with more than 500 repositories. I wanted to get the action runs for every one of these repositories! So I wrote another function to do that and save the result for each repository in a local file.
# Download all GitHub Actions workflow runs of one repository and save them
# to a local .Rds file.
#
# Args:
#   repo_full_name: repository in "owner/name" form, e.g. "rstudio/py-shiny".
#
# Side effects:
#   Creates data/repos/ when missing and writes
#   data/repos/runs_<owner>_<name>_<date>.Rds, a tibble with one row per
#   workflow run and `repo` as its first column. A repository without runs
#   gets a file holding an empty tibble, so every repository still has a file.
get_repo_github_actions_runs <- function(repo_full_name) {
  usethis::ui_info("Starting with {repo_full_name}...")

  # A single-item request is enough to read the total number of runs.
  total_count <- gh::gh(
    "GET /repos/{full_name}/actions/runs",
    full_name = repo_full_name,
    per_page = 1
  )$total_count

  if (total_count == 0) {
    # No runs: skip the paged download. With 1:pages_to_iterate the zero
    # case expanded to c(1, 0) and requested pages 1 and 0.
    runs_df <- tibble::tibble(repo = character())
  } else {
    # Runs are requested 100 per page.
    pages_to_iterate <- seq_len(ceiling(total_count / 100))

    list_all_runs <- purrr::map_dfr(
      pages_to_iterate,
      ~ gh::gh(
        "GET /repos/{full_name}/actions/runs",
        full_name = repo_full_name,
        per_page = 100,
        page = .x
      )
    )

    runs_df <- list_all_runs |>
      # one list element per workflow run
      purrr::pluck("workflow_runs") |>
      # drop empty fields
      purrr::map(purrr::compact) |>
      # drop nested (list) fields so each run becomes a flat record
      purrr::map(purrr::discard, ~ is.list(.x)) |>
      purrr::map_dfr(tibble::as_tibble, .id = "id") |>
      dplyr::mutate(repo = repo_full_name,
                    .before = tidyselect::everything())
  }

  fs::dir_create("data/repos/")
  readr::write_rds(
    runs_df,
    glue::glue(
      "data/repos/runs_{stringr::str_replace(repo_full_name, '/', '_')}_{Sys.Date()}.Rds"
    )
  )
  usethis::ui_done("Done with {repo_full_name}...")
}
Then I used purrr again, to run this function for every repository. This took quite some time to run!
At this point, I had one `.Rds` file per repository, so more than 500 files. Thankfully I know the arts of `purrr` and used that magic to import all the files into a single tibble:
# Print a compact overview of the combined runs (output pasted below).
df_complete_runs |> dplyr::glimpse()
# Rows: 333,594
# Columns: 28
# $ repo <chr> "mlverse/cuda.ml", "mlverse/cuda.m…
# $ id <chr> "1", "2", "3", "4", "5", "6", "7",…
# $ name <chr> "R-CMD-check", "pages build and de…
# $ node_id <chr> "WFR_kwLOFqIlBc5kYBiS", "WFR_kwLOF…
# $ head_branch <chr> "main", "main", "main", "main", "m…
# $ head_sha <chr> "54fc9575e271b6a94c8650bea1887bb7f…
# $ path <chr> ".github/workflows/R-CMD-check.yam…
# $ run_number <int> 221, 13, 220, 12, 219, 11, 218, 21…
# $ event <chr> "push", "dynamic", "push", "dynami…
# $ status <chr> "completed", "completed", "complet…
# $ conclusion <chr> "success", "success", "success", "…
# $ workflow_id <int> 14189930, 17359975, 14189930, 1735…
# $ check_suite_id <dbl> 4901217228, 4901217123, 4886275561…
# $ check_suite_node_id <chr> "CS_kwDOFqIlBc8AAAABJCKjzA", "CS_k…
# $ url <chr> "https://api.github.com/repos/mlve…
# $ html_url <chr> "https://github.com/mlverse/cuda.m…
# $ created_at <chr> "2022-01-11T18:30:04Z", "2022-01-1…
# $ updated_at <chr> "2022-01-11T22:37:18Z", "2022-01-1…
# $ run_attempt <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1…
# $ run_started_at <chr> "2022-01-11T18:30:04Z", "2022-01-1…
# $ jobs_url <chr> "https://api.github.com/repos/mlve…
# $ logs_url <chr> "https://api.github.com/repos/mlve…
# $ check_suite_url <chr> "https://api.github.com/repos/mlve…
# $ artifacts_url <chr> "https://api.github.com/repos/mlve…
# $ cancel_url <chr> "https://api.github.com/repos/mlve…
# $ rerun_url <chr> "https://api.github.com/repos/mlve…
# $ workflow_url <chr> "https://api.github.com/repos/mlve…
# $ previous_attempt_url <chr> NA, NA, NA, NA, NA, NA, "https://a…
Then, I counted action runs per month:
# Number of workflow runs per calendar month: skipped runs and everything
# started on or after 2022-06-01 are left out.
counted_data <- df_complete_runs |>
  dplyr::mutate(start_date = lubridate::as_date(run_started_at)) |>
  dplyr::filter(start_date < "2022-06-01", conclusion != "skipped") |>
  dplyr::mutate(
    run_month = lubridate::floor_date(start_date, unit = "month")
  ) |>
  dplyr::count(run_month)
With this tibble, I was able to use tidyverse and ggplot2 to create the plot!
library(ggplot2)

# Avoid scientific notation when numbers are printed.
options(scipen = 9999)

# First and last month of the series, labelled in thousands of runs ("12k").
max_min <- counted_data |>
  dplyr::filter(run_month %in% range(run_month)) |>
  dplyr::mutate(label = glue::glue("{round(n/1000)}k"))

# Grey line with the monthly number of runs.
base_graph <- ggplot(counted_data) +
  geom_line(
    mapping = aes(x = run_month, y = n),
    size = 1.5,
    color = "#80868b"
  )

# Theme, titles and scales, then the two highlighted end points with their
# labels on top of the line.
final_graph <- base_graph +
  theme_minimal(base_size = 15) +
  theme(
    text = element_text(family = "Montserrat", color = "#80868b"),
    plot.title = element_text(family = "Montserrat", color = "#4c83b6"),
    plot.title.position = "plot",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Actions runs by RStudio/Posit organizations on GitHub",
    caption = "Plot made by @BeaMilz. Data from the GitHub API.",
    x = "",
    y = "Actions runs per month"
  ) +
  scale_x_date(
    limits = c(as.Date("2020-03-01"), as.Date("2022-06-01")),
    date_breaks = "4 month",
    date_labels = "%b/%y"
  ) +
  scale_y_continuous(
    # axis labels in thousands
    labels = function(x) glue::glue("{x/1000}k"),
    limits = c(0, 25000)
  ) +
  geom_point(
    data = max_min,
    mapping = aes(x = run_month, y = n),
    color = "#4c83b6",
    size = 3
  ) +
  ggrepel::geom_text_repel(
    data = max_min,
    mapping = aes(x = run_month, y = n, label = label),
    color = "#4c83b6",
    size = 10,
    nudge_x = 0,
    nudge_y = 4000,
    min.segment.length = 0
  )

# Write the figure to disk (7 x 5 inches at 300 dpi).
ggsave(
  filename = "img/rstudio_ggplot.png",
  plot = final_graph,
  width = 7,
  height = 5,
  dpi = 300
)
🎉 Now we have the plot shown in the talk! 🎉
# Embed the saved figure in the rendered post.
knitr::include_graphics(path = "img/rstudio_ggplot.png")
Thanks!!