Introduction

Sometime during 2021 I heard about the Big Data Bowl and got pretty interested in what what it was all about. They announced the Special Teams topic and when the datasets came out I was excited and wanted to see what I could do, partially because I have an interest in sports analytics, but also because I’m in a Data Analytics program and wanted some practice with datasets that are a little bigger and messier than what we usually see in class.

I thought about some different models that I could maybe build, played around with visualizing kickoffs, which is super fun and can be found here, but the idea of looking at weather trends was the idea that really compelled me to get into the data.

I have some questions for myself, like, “Is that a sports analytics question anyody cares about?” and “Is the answer to anything you could ask about the weather going to be useful?” I had no idea, really, but I did get to the answer of the second question. Below I’ll show you how I did it.

I’ll also go ahead and point out that my submission didn’t make the deadline and I just kept browsing the data to make this. Bad for competition, good for learning and digging into data!

Learn more about the Big Data Bowl and take a look at their data.

I also benefitted GREATLY from these resources, even if some of them didn’t really end up in this data: Weather Data Football Field

# Clear environment
rm(list = ls())
`%notin%` <- Negate(`%in%`)


# Setting the random number generator seed so that our results are reproducible
set.seed(1)

library(dplyr)
library(GGally)
library(data.table)
library(ggplot2)
library(devtools)
library(ggpubr)
library(ggrepel)


games <- read.csv('games.csv', header=TRUE)
glimpse(games)
## Rows: 764
## Columns: 7
## $ gameId          <int> 2018090600, 2018090900, 2018090901, 2018090902, 201...
## $ season          <int> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 201...
## $ week            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...
## $ gameDate        <chr> "09/06/2018", "09/09/2018", "09/09/2018", "09/09/20...
## $ gameTimeEastern <chr> "20:20:00", "13:00:00", "13:00:00", "13:00:00", "13...
## $ homeTeamAbbr    <chr> "PHI", "BAL", "CLE", "IND", "MIA", "NE", "NO", "NYG...
## $ visitorTeamAbbr <chr> "ATL", "BUF", "PIT", "CIN", "TEN", "HOU", "TB", "JA...

Mashing the data together

The first thing that I need to do here is select data from a few different tables, make a vector of stadiums based on whether or not they have a dome, and tie in the weather data.

plays <- fread('plays.csv')

plays <- plays %>%
  mutate(game_play = paste0(gameId, '_', playId)) %>%
  select(game_play, gameId, kickLength, kickReturnYardage, specialTeamsPlayType)

pffscout <- fread('PFFScoutingData.csv')

pffscout <- pffscout %>%
  mutate(game_play = paste0(gameId, '_', playId)) %>%
  select(game_play, hangTime, kickContactType, kickType)

data <- left_join(plays, pffscout, by = 'game_play')

games <- games %>%
  select(gameId, gameDate, gameTimeEastern, homeTeamAbbr)

data <- left_join(data, games, by = 'gameId')

glimpse(data)
## Rows: 19,979
## Columns: 11
## $ game_play            <chr> "2018090600_37", "2018090600_366", "2018090600...
## $ gameId               <int> 2018090600, 2018090600, 2018090600, 2018090600...
## $ kickLength           <int> 66, 56, 21, 64, 65, 49, 26, 72, 52, 68, 58, 57...
## $ kickReturnYardage    <int> NA, 5, NA, 30, NA, NA, NA, NA, NA, 13, NA, 8, ...
## $ specialTeamsPlayType <chr> "Kickoff", "Punt", "Field Goal", "Kickoff", "P...
## $ hangTime             <dbl> 3.85, 4.46, NA, 4.06, 4.35, 4.98, NA, 4.09, NA...
## $ kickContactType      <chr> NA, "CC", NA, NA, "BF", "CC", NA, NA, NA, NA, ...
## $ kickType             <chr> "D", "N", NA, "D", "N", "N", NA, "D", NA, "D",...
## $ gameDate             <chr> "09/06/2018", "09/06/2018", "09/06/2018", "09/...
## $ gameTimeEastern      <chr> "20:20:00", "20:20:00", "20:20:00", "20:20:00"...
## $ homeTeamAbbr         <chr> "PHI", "PHI", "PHI", "PHI", "PHI", "PHI", "PHI...
stadiums_list <- unique(data$homeTeamAbbr)
stadiums_list
##  [1] "PHI" "BAL" "CLE" "IND" "MIA" "NE"  "NO"  "NYG" "ARI" "CAR" "GB"  "DET"
## [13] "OAK" "CIN" "ATL" "BUF" "NYJ" "PIT" "TB"  "TEN" "WAS" "LA"  "SF"  "DEN"
## [25] "JAX" "DAL" "CHI" "HOU" "KC"  "MIN" "SEA" "LAC" "LV"
covereddf <- data.frame(homeTeamAbbr = stadiums_list, covered = c('No Cover', 'No Cover', 'No Cover', 'Cover', 'No Cover', 'No Cover', 'Cover', 'No Cover', 'Cover', 'No Cover', 'No Cover', 'Cover', 'No Cover', 'No Cover', 'Cover', 'No Cover', 'No Cover', 'No Cover', 'No Cover', 'No Cover', 'No Cover', 'Cover', 'No Cover', 'No Cover', 'No Cover', 'Cover', 'No Cover', 'Cover', 'No Cover', 'Cover', 'No Cover', 'Cover', 'Cover'))

data <- left_join(data, covereddf, by = 'homeTeamAbbr')
data$covered <- as.factor(data$covered)
data$kickType <- as.factor(data$kickType)

levels(data$kickType) <- (list('Deep' = 'D', 'Squib' = 'Q', 'Pooch' = 'P', 'Flat' = 'F', 'Free Kick' = 'K', 'Obvious Onside' = 'O', 'Surprise Onside' = 'S', 'Deep Direct OOB' = 'B', 'Normal Punt' = 'N', 'Rugby Style Punt' = 'R', 'Nose Down Punt' = 'A'))

levels(data$kickType)
##  [1] "Deep"             "Squib"            "Pooch"            "Flat"            
##  [5] "Free Kick"        "Obvious Onside"   "Surprise Onside"  "Deep Direct OOB" 
##  [9] "Normal Punt"      "Rugby Style Punt" "Nose Down Punt"
weather <- fread('games_weather.csv')
weather <- weather %>%rename('gameId' = 'game_id')
head(weather)
##        gameId    Source DistanceToStation    TimeMeasure Temperature DewPoint
## 1: 2000090300 Meteostat              3.79 9/3/2000 12:00       80.96    73.04
## 2: 2000090300 Meteostat              3.79 9/3/2000 13:00       80.96    73.04
## 3: 2000090300 Meteostat              3.79 9/3/2000 14:00       82.94    73.04
## 4: 2000090300 Meteostat              3.79 9/3/2000 15:00       80.06    74.12
## 5: 2000090300 Meteostat              3.79 9/3/2000 16:00       75.02    75.02
## 6: 2000090300 Meteostat              3.79 9/3/2000 17:00       75.02    75.02
##    Humidity Precipitation WindSpeed WindDirection Pressure EstimatedCondition
## 1:       77         0.000      3.36           200  29.8902              Clear
## 2:       77         0.000      5.84           300  29.8784              Clear
## 3:       72         0.000      5.84           260  29.8696              Clear
## 4:       82         0.000      3.36           180  29.8578              Clear
## 5:      100         0.728      4.72           280  29.8519         Heavy Rain
## 6:      100         0.161      0.00            NA  29.8430      Moderate Rain
wea_summ <- weather %>% group_by(gameId) %>%
  summarise(uPrecip = mean(Precipitation),
            uPress = mean(Pressure),
            uHumidity = mean(Humidity),
            uTemp = mean(Temperature),
            uDewP = mean(DewPoint))

head(wea_summ)
## # A tibble: 6 x 6
##       gameId uPrecip uPress uHumidity uTemp uDewP
##        <int>   <dbl>  <dbl>     <dbl> <dbl> <dbl>
## 1 2000090300   0.129   29.9      86.4  78.7  74.0
## 2 2000090301   0       29.9      76.5  79.8  71.5
## 3 2000090302   0       29.9      83.6  80.1  74.7
## 4 2000090303  NA       30.0      80    67.6  61.0
## 5 2000090304   0       29.8      59.3  92.7  76.1
## 6 2000090305  NA       29.9      50.7  89.2  68.2
data <- data %>% left_join(wea_summ, by='gameId')

head(data)
##         game_play     gameId kickLength kickReturnYardage specialTeamsPlayType
## 1:  2018090600_37 2018090600         66                NA              Kickoff
## 2: 2018090600_366 2018090600         56                 5                 Punt
## 3: 2018090600_658 2018090600         21                NA           Field Goal
## 4: 2018090600_677 2018090600         64                30              Kickoff
## 5: 2018090600_872 2018090600         65                NA                 Punt
## 6: 2018090600_973 2018090600         49                NA                 Punt
##    hangTime kickContactType    kickType   gameDate gameTimeEastern homeTeamAbbr
## 1:     3.85            <NA>        Deep 09/06/2018        20:20:00          PHI
## 2:     4.46              CC Normal Punt 09/06/2018        20:20:00          PHI
## 3:       NA            <NA>        <NA> 09/06/2018        20:20:00          PHI
## 4:     4.06            <NA>        Deep 09/06/2018        20:20:00          PHI
## 5:     4.35              BF Normal Punt 09/06/2018        20:20:00          PHI
## 6:     4.98              CC Normal Punt 09/06/2018        20:20:00          PHI
##     covered uPrecip   uPress uHumidity  uTemp uDewP
## 1: No Cover      NA 30.12279  72.41667 78.875 69.05
## 2: No Cover      NA 30.12279  72.41667 78.875 69.05
## 3: No Cover      NA 30.12279  72.41667 78.875 69.05
## 4: No Cover      NA 30.12279  72.41667 78.875 69.05
## 5: No Cover      NA 30.12279  72.41667 78.875 69.05
## 6: No Cover      NA 30.12279  72.41667 78.875 69.05
#I collapsed the weather data into averages and then joined it by gameId



kickdat <- data %>%
  filter(specialTeamsPlayType == 'Kickoff')

puntdat <- data %>%
  filter(specialTeamsPlayType == 'Punt')

Now I’ve got the data in place and I want to plot it in a few ways to see where the differences might lie. I want to first just create some boxplot type comparisons (although they’ll often be jitter and violin plots to give me an idea of the underlying distributions). After I have a feel for whether or not there are any interesting comparisons to make, I’ll want to go a little deeper.

p_cov_kick_vio <- ggplot(kickdat, aes(x=as.factor(covered), y=kickLength)) +
  geom_violin() +
  stat_compare_means(method = "t.test")

p_cov_kick_vio

p_cov_kick_vio_facet <- ggplot(kickdat, aes(x=as.factor(covered), y=kickLength)) +
  geom_violin() +
  stat_compare_means(method = "t.test", label.y=100, size=2) +  
  facet_wrap(~kickdat$kickType) +
  ylim(0,120)  +
  theme(strip.background = element_rect(size=.5),
      strip.text.x = element_text(
        size = 6, color = "black", face = "bold"
        )) +
  labs(title='Kick Length by Stadium Type', y='Kick Length',x=NULL)

p_cov_kick_vio_facet

p_cov_kick_box_facet <- ggplot(filter(kickdat, kickType=='Free Kick'), aes(x=as.factor(covered), y=kickLength)) +
  geom_boxplot() +
  stat_compare_means(method = "t.test") #+ facet_wrap(~kickdat$kickType)

p_cov_kick_box_facet

p_cov_kick_jitter <- ggplot(data, aes(x=as.factor(covered), y=kickLength, color = kickType, alpha=0.2)) +
  geom_jitter() +
  stat_compare_means(method = "t.test")

p_cov_kick_jitter

p_cov_punt_vio <- ggplot(puntdat, aes(x=covered, y=kickLength)) +
  geom_violin() +
  stat_compare_means(method = "t.test")

p_cov_punt_vio

p_cov_punt_jitter <- ggplot(data, aes(x=covered, y=kickLength, color = kickType, alpha=0.2)) +
  geom_jitter() +
  stat_compare_means(method = "t.test")

p_cov_punt_jitter

kickcors <- ggpairs(kickdat,
                   columns=c(6, 3, 4),
                   mapping = aes(color = covered),
                   switch = "both",
                   upper = list(continuous = wrap("cor")),
                   lower=list(continuous = wrap("points",size= 2)),
                   diag = list(continuous = wrap("densityDiag",alpha = 0.5))) +
  theme_grey(base_size=8)
kickcors

puntcors <- ggpairs(puntdat,
                   columns=c(6, 3, 4),
                   mapping = aes(color = covered),
                   switch = "both",
                   upper = list(continuous = wrap("cor")),
                   lower=list(continuous = wrap("points",size= 2)),
                   diag = list(continuous = wrap("densityDiag",alpha = 0.5))) +
  theme_grey(base_size=8)
puntcors

kick_scat_type <- ggplot(kickdat, aes(x=kickLength, y=kickReturnYardage, color=kickType, alpha=0.5)) +
  geom_point() +
  stat_cor(aes(color = kickType), label.x = -20, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  guides(alpha=FALSE) +
  theme_minimal() +
  xlim(-20,100) +
  labs(title = "Kickoff Return vs Kick Length by Type") +
  xlab("Kick Length") +
  ylab("Kick Return Yardage")

kick_scat_type

punt_scat_type <- ggplot(puntdat, aes(x=kickLength, y=kickReturnYardage, color=kickType, alpha=0.2)) +
  geom_point() +
  stat_cor(aes(color = kickType), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  guides(alpha=FALSE) +
  labs(title="Punt Return Yardage vs Kick Length", x="Punt Length", y="Return Yardage")

punt_scat_type

Now I want to make some checks on the weather data. I know that there are some statistically significant comparisons to be made with indoor and outdoor stadiums, but I now want to include correlations in the comparisons for humidity, temperature, and pressure.

punt_press_hang <- ggplot(puntdat, aes(x=uPress, y=hangTime, color=covered, alpha=0.2)) +
  geom_point() +
  stat_cor(aes(color = covered), label.x = 27.5, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  ylim(1,7) +
  guides(alpha=FALSE) +
  labs(title = "Punt Hangtime vs Pressure")
  #+
  #facet_wrap( ~ kickType)
  #geom_text(show.legend = FALSE, labels.default())

punt_press_hang

ptry <- ggplot(puntdat, aes(x=uHumidity, y=hangTime, color = covered, alpha=0.1)) +
  geom_point() +
  stat_cor(aes(color = covered), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  ylim(1,7) +
  guides(alpha=FALSE)

ptry

kick_hum <- ggplot(kickdat, aes(x=uHumidity, y=kickLength, color=covered, alpha=0.2)) +
  geom_point()

kick_hum

punt_hum <- ggplot(puntdat, aes(x=uHumidity, y=kickLength, color=covered, alpha=0.2)) +
  geom_point()

punt_hum

Biggest Finds

Here are some plots of the biggest differences that I found after looking through the comparisons. We can see a couple of interesting things from the plots below: there’s a small correlation in punts and kickoffs that shows up more prominently in uncovered stadiums. That’s REALLY neat to see! But it’s also clear that it’s not very important. I wouldn’t make decisions on kicks based on this weak correlation, but it’s really cool to see something that I predicted physically from the football data.

kick_hang <- ggplot(kickdat, aes(x=uTemp, y=hangTime, color=covered, alpha=0.2)) +
  geom_point() +
  stat_cor(aes(color = covered), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  ylim(1,7) +
  guides(alpha=FALSE) +
  labs(title = "Kickoff Hangtime vs Temperature", x="Temperature", y="Hang Time")

kick_hang

kickdat %>%
  group_by(kickType) %>%
  summarize(n=n())
## # A tibble: 8 x 2
##   kickType            n
##   <fct>           <int>
## 1 Deep             6944
## 2 Squib             134
## 3 Pooch             226
## 4 Flat              319
## 5 Free Kick          50
## 6 Obvious Onside    155
## 7 Surprise Onside    14
## 8 Deep Direct OOB     1
kick_hang_facet <- ggplot(kickdat, aes(x=uTemp, y=hangTime, color=covered, alpha=0.2)) +  
  geom_point() +
  stat_cor(aes(color = covered), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  ylim(1,7) +
  guides(alpha=FALSE) +
  labs(title = "Kickoff Hangtime vs Temperature", x="Temperature", y="Hangtime") +
  facet_wrap(~kickdat$kickType)

kick_hang_facet

kick_len_facet <- ggplot(kickdat, aes(x=uTemp, y=kickLength, color=covered, alpha=0.2)) +  
  geom_point() +
  stat_cor(aes(color = covered), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  guides(alpha=FALSE) +
  labs(title = "Kickoff Length vs Temperature", x="Temperature", y="Hangtime") +
  facet_wrap(~kickdat$kickType)

kick_len_facet

punt_temp <- ggplot(puntdat, aes(x=uTemp, y=kickLength, color=covered, alpha=0.2)) +
  geom_point() +
  labs(title="Punt Kick Length vs Temperature", x="Temperature", y="Kick Length")

punt_temp

punt_temp_hang <- ggplot(puntdat, aes(x=uTemp, y=hangTime, color=covered, alpha=0.2)) +
  geom_point() +
  stat_cor(aes(color = covered), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  ylim(1,7) +
  guides(alpha=FALSE) +
  labs(title = "Hangtime vs Temperature", x="Temperature", y="Hangtime") +
  facet_wrap( ~ kickType)
  #geom_text(show.legend = FALSE, labels.default())

punt_temp_hang

punt_temp_len <- ggplot(puntdat, aes(x=uTemp, y=kickLength, color=covered, alpha=0.2)) +
  geom_point() +
  stat_cor(aes(color = covered), label.x = 10, show.legend = FALSE) +
  geom_smooth(method='lm', formula= y~x) +
  guides(alpha=FALSE) +
  labs(title = "Kick Length vs Temperature") +
  facet_wrap( ~ kickType)
  #geom_text(show.legend = FALSE, labels.default())

punt_temp_len