# Download the NCAA game-length report and extract its first HTML table.
url <- "https://stats.ncaa.org/reports/game_length?id=21853"
# The request is sent with a browser-style User-Agent header.
response <- httr::GET(url, httr::add_headers("User-Agent" = "Mozilla/5.0"))
# Abort on a 4xx/5xx reply instead of parsing an error page as if it were data.
httr::stop_for_status(response)
gl_response <- httr::content(response, "text")
gl_page <- rvest::read_html(gl_response)
gl_tables <- rvest::html_nodes(gl_page, "table")
# Fail with a readable message if the page contains no table at all.
if (length(gl_tables) == 0) {
  stop("No <table> element found at ", url, call. = FALSE)
}
# Only the first table on the page is used.
gl_data <- rvest::html_table(gl_tables[[1]], fill = TRUE)
# Discard the two leading rows of the scraped table.
gl_data <- utils::tail(gl_data, -2)
# The row that is now first holds the real column headers: use it as the
# column names ...
header_row <- gl_data[1, ]
colnames(gl_data) <- header_row
# ... and then drop it from the data.
gl_data <- utils::tail(gl_data, -1)
# Keep only the full-season columns (positions 2, 3 and 12-15), not the
# home and away breakdowns.
columns_to_keep <- c(2, 3, 12, 13, 14, 15)
gl_format <- gl_data |>
  dplyr::select(dplyr::all_of(columns_to_keep)) |>
  # Drop the row(s) whose Conference is "Totals".
  dplyr::filter(Conference != "Totals") |>
  # Every column except the three text ones is converted to numeric;
  # across() replaces the superseded mutate_at()/vars() pair.
  dplyr::mutate(
    dplyr::across(-c(Institution, Conference, Length), as.numeric)
  ) |>
  # est_plays is the sum of the OffPlays and DefPlays columns.
  dplyr::mutate(est_plays = OffPlays + DefPlays)
# Turn the Length text column (hours:minutes) into avg_duration, expressed in
# minutes, and rename Institution to Team.
gl_final <- gl_format |>
  dplyr::mutate(
    # Parse around the colon rather than at fixed character positions, so an
    # unpadded value such as "3:21" works as well as "03:21". Values that do
    # not match become NA.
    hours = as.numeric(stringr::str_extract(Length, "^\\s*\\d+(?=:)")),
    minutes = as.numeric(stringr::str_extract(Length, "(?<=:)\\d{1,2}")),
    avg_duration = hours * 60 + minutes
  ) |>
  # hours and minutes were only needed to build avg_duration.
  dplyr::select(-hours, -minutes) |>
  dplyr::rename(Team = Institution)
# Load per-team time-of-possession data from top.xlsx and derive avg_min, the
# AvgTOP value expressed in decimal minutes.
poss <- readxl::read_excel("top.xlsx") |>
  # Both Miami rows are removed here: stripping the parenthesised text below
  # would reduce each of them to plain "Miami". They are re-added by hand at
  # the end of the pipeline.
  dplyr::filter(!Team %in% c("Miami (FL) (ACC)", "Miami (OH) (MAC)")) |>
  # Remove every "(...)" group from the team name, then the trailing space
  # that removal leaves behind.
  dplyr::mutate(Team = gsub("\\([^)]*\\)", "", Team)) |>
  dplyr::mutate(Team = stringr::str_trim(Team, side = "right")) |>
  dplyr::mutate(
    # Characters 1-2 are read as minutes and 4-5 as seconds
    # (presumably AvgTOP is "MM:SS" text; verify against the workbook).
    minutes = as.numeric(substr(AvgTOP, 1, 2)),
    seconds = as.numeric(substr(AvgTOP, 4, 5)),
    # Seconds are sixtieths of a minute, not hundredths.
    avg_min = minutes + seconds / 60
  ) |>
  # NOTE(review): these three rows were previously entered as 30.4, 30.2 and
  # 33.4. They are converted here on the assumption that those figures used
  # the old minutes + seconds / 100 notation (i.e. 30:40, 30:20, 33:40).
  # Confirm against the source data.
  dplyr::add_row(Team = "Miami (FL)", avg_min = 30 + 40 / 60) |>
  dplyr::add_row(Team = "Miami (OH)", avg_min = 30 + 20 / 60) |>
  dplyr::add_row(Team = "James Madison", avg_min = 33 + 40 / 60)
# Lookup of team names to replace after the join (name as it appears in the
# joined data = replacement). Teams not listed keep their name.
team_aliases <- c(
  "App State" = "Appalachian State",
  "Army West Point" = "Army",
  "Central Mich." = "Central Michigan",
  "Eastern Mich." = "Eastern Michigan",
  "Fla. Atlantic" = "Florida Atlantic",
  "Ga. Southern" = "Georgia Southern",
  "Middle Tenn." = "Middle Tennessee",
  "NIU" = "Northern Illinois",
  "South Fla." = "South Florida",
  "Southern California" = "USC",
  "Southern Miss." = "Southern Mississippi",
  "ULM" = "Louisiana Monroe",
  "Western Ky." = "Western Kentucky",
  "Western Mich." = "Western Michigan",
  "Miami (FL)" = "Miami FL"
)
# Attach the possession data to the game-length table. The join runs on the
# original Team values, before any renaming, and keeps every gl_final row.
poss_gl <- gl_final |>
  dplyr::left_join(poss, by = "Team") |>
  dplyr::mutate(
    # Use the alias where one exists, otherwise fall back to the current name.
    Team = dplyr::coalesce(unname(team_aliases[Team]), Team),
    # avg_ppm is OffPlays divided by avg_min.
    avg_ppm = OffPlays / avg_min
  ) |>
  # Drop the columns that are not wanted in the result.
  dplyr::select(-Rank, -G.y, -`W-L`, -minutes, -seconds)