diff --git a/DESCRIPTION b/DESCRIPTION index 730a468..896a318 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,7 @@ Imports: tidyr URL: https://github.com/simonpcouch/anyflights, https://simonpcouch.github.io/anyflights/ BugReports: https://github.com/simonpcouch/anyflights/issues -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Encoding: UTF-8 Suggests: testthat, diff --git a/NEWS.md b/NEWS.md index b9a8546..df251f4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,13 @@ # anyflights (development version) +* `get_flights()` now reinterprets the BTS source's airport-local scheduled + departure wall clocks as proper UTC instants in the `time_hour` column, + using each origin's IANA time zone (looked up from `get_airports()$tzone`). + `get_weather()$time_hour` already publishes UTC observation instants, so + the canonical `(origin, time_hour)` join between flights and weather now + returns correct rows across multiple time zones, where previously the two + columns disagreed for any non-UTC airport (#28, @ismayc). + # anyflights 0.3.5 * Include `tz = "GMT"` argument to `ISOdatetime()` so that weather output isn't diff --git a/R/get_flights.R b/R/get_flights.R index 4045fbb..6babc07 100644 --- a/R/get_flights.R +++ b/R/get_flights.R @@ -34,9 +34,13 @@ #' \code{\link{get_airports}} for additional metadata.} #' \item{\code{air_time}}{Amount of time spent in the air, in minutes} #' \item{\code{distance}}{Distance between airports, in miles} -#' \item{\code{time_hour}}{Scheduled date and hour of the flight as a -#' \code{POSIXct} date. Along with \code{origin}, can be used to join -#' flights data to weather data.} +#' \item{\code{time_hour}}{Scheduled date and hour of the flight as a UTC +#' \code{POSIXct} instant. The BTS source publishes scheduled departures in +#' the airport's local wall clock; \code{get_flights()} interprets those +#' wall clocks using the IANA \code{tzone} for each \code{origin} (from +#' \code{\link{get_airports}}) and stores the resulting UTC instant. Along +#' with \code{origin}, this column can be used to join flights data to +#' weather data.} #' } #' #' @note @@ -122,7 +126,12 @@ get_flights <- function(station, year, month = 1:12, dir = NULL, ...) { station = station) %>% dplyr::bind_rows() %>% dplyr::arrange(year, month, day, dep_time) - + + # convert airport-local wall clocks (stored as naive UTC) to true UTC + # instants so the (origin, time_hour) join with get_weather() works across + # multiple time zones (#28) + flights <- flights_time_hour_to_utc(flights, get_airports()) + # get rid of the "raw" data unlink(x = flight_exdir, recursive = TRUE) diff --git a/R/get_weather.R b/R/get_weather.R index 09161d6..4c1618e 100644 --- a/R/get_weather.R +++ b/R/get_weather.R @@ -10,18 +10,19 @@ #' #' @return A data frame with ~1k-25k rows and 15 variables: #' \describe{ -#' \item{\code{origin}}{Weather station. Named \code{origin} to facilitate +#' \item{\code{origin}}{Weather station. Named \code{origin} to facilitate #' merging with flights data} -#' \item{\code{year, month, day, hour}}{Time of recording, UTC} +#' \item{\code{year, month, day, hour}}{Time of recording in UTC.} #' \item{\code{temp, dewp}}{Temperature and dewpoint in F} #' \item{\code{humid}}{Relative humidity} -#' \item{\code{wind_dir, wind_speed, wind_gust}}{Wind direction (in degrees), +#' \item{\code{wind_dir, wind_speed, wind_gust}}{Wind direction (in degrees), #' speed and gust speed (in mph)} #' \item{\code{precip}}{Precipitation, in inches} #' \item{\code{pressure}}{Sea level pressure in millibars} #' \item{\code{visib}}{Visibility in miles} -#' \item{\code{time_hour}}{Date and hour of the recording as a \code{POSIXct} -#' date, UTC} +#' \item{\code{time_hour}}{Date and hour of the recording as a UTC +#' \code{POSIXct} instant. Along with \code{origin}, can be used to join +#' weather data to flights data.} #' } #' @source ASOS download from Iowa Environmental Mesonet, #' \url{https://mesonet.agron.iastate.edu/request/download.phtml} @@ -66,14 +67,14 @@ get_weather <- function(station, year, month = 1:12, dir = NULL) { dir_is_null <- FALSE } - weather <- purrr::map(station, - get_weather_for_station, + weather <- purrr::map(station, + get_weather_for_station, year = year, - dir = dir, + dir = dir, month_and_day_range = month_and_day_range, month = month) %>% dplyr::bind_rows() - + # save the data if the user supplied a directory if (!dir_is_null) { save(weather, file = paste0(dir, "/weather.rda"), compress = "xz") diff --git a/R/utils.R b/R/utils.R index b4f7eac..b48b667 100644 --- a/R/utils.R +++ b/R/utils.R @@ -198,6 +198,31 @@ download_file_wrapper <- function(url, file_path, quiet = TRUE){ out } +# Convert `time_hour` from the airport-local wall clock that the BTS data +# arrives with (numerically stored as UTC, e.g. 9am-Pacific encoded as +# "2023-01-01 09:00:00 UTC") into the correct UTC instant ("2023-01-01 +# 17:00:00 UTC"). Joining against `get_weather()`, which already publishes +# UTC observation instants, then works correctly across multiple origin time +# zones. Row order in `data` is preserved. +flights_time_hour_to_utc <- function(data, airports_data) { + tz_lookup <- airports_data$tzone[match(data$origin, airports_data$faa)] + unique_tz <- unique(stats::na.omit(tz_lookup)) + + out <- lubridate::with_tz(data$time_hour, "UTC") + for (tz in unique_tz) { + idx <- which(!is.na(tz_lookup) & tz_lookup == tz) + if (length(idx)) { + out[idx] <- lubridate::with_tz( + lubridate::force_tz(data$time_hour[idx], tzone = tz), + "UTC" + ) + } + } + data$time_hour <- out + data +} + + # get_flights utilities -------------------------------------------------- download_month <- function(year, month, dir, flight_exdir, pb, diff_fn) { diff --git a/man/get_flights.Rd b/man/get_flights.Rd index a8651a5..7c8eaae 100644 --- a/man/get_flights.Rd +++ b/man/get_flights.Rd @@ -45,9 +45,13 @@ A data frame with ~1k-500k rows and 19 variables: \code{\link{get_airports}} for additional metadata.} \item{\code{air_time}}{Amount of time spent in the air, in minutes} \item{\code{distance}}{Distance between airports, in miles} -\item{\code{time_hour}}{Scheduled date and hour of the flight as a - \code{POSIXct} date. Along with \code{origin}, can be used to join - flights data to weather data.} +\item{\code{time_hour}}{Scheduled date and hour of the flight as a UTC + \code{POSIXct} instant. The BTS source publishes scheduled departures in + the airport's local wall clock; \code{get_flights()} interprets those + wall clocks using the IANA \code{tzone} for each \code{origin} (from + \code{\link{get_airports}}) and stores the resulting UTC instant. Along + with \code{origin}, this column can be used to join flights data to + weather data.} } } \description{ diff --git a/man/get_weather.Rd b/man/get_weather.Rd index edb2b04..07913f6 100644 --- a/man/get_weather.Rd +++ b/man/get_weather.Rd @@ -27,18 +27,19 @@ to save datasets in. By default, datasets will not be saved to file.} \value{ A data frame with ~1k-25k rows and 15 variables: \describe{ -\item{\code{origin}}{Weather station. Named \code{origin} to facilitate +\item{\code{origin}}{Weather station. Named \code{origin} to facilitate merging with flights data} -\item{\code{year, month, day, hour}}{Time of recording, UTC} +\item{\code{year, month, day, hour}}{Time of recording in UTC.} \item{\code{temp, dewp}}{Temperature and dewpoint in F} \item{\code{humid}}{Relative humidity} -\item{\code{wind_dir, wind_speed, wind_gust}}{Wind direction (in degrees), +\item{\code{wind_dir, wind_speed, wind_gust}}{Wind direction (in degrees), speed and gust speed (in mph)} \item{\code{precip}}{Precipitation, in inches} \item{\code{pressure}}{Sea level pressure in millibars} \item{\code{visib}}{Visibility in miles} -\item{\code{time_hour}}{Date and hour of the recording as a \code{POSIXct} - date, UTC} +\item{\code{time_hour}}{Date and hour of the recording as a UTC + \code{POSIXct} instant. Along with \code{origin}, can be used to join + weather data to flights data.} } } \description{ diff --git a/tests/testthat/test-8-time-hour-tz.R b/tests/testthat/test-8-time-hour-tz.R new file mode 100644 index 0000000..246c4b0 --- /dev/null +++ b/tests/testthat/test-8-time-hour-tz.R @@ -0,0 +1,155 @@ +context("flights_time_hour_to_utc") + +# A small in-memory airports lookup so these tests don't hit the network. +mock_airports <- function() { + dplyr::tibble( + faa = c("PDX", "JFK", "DEN", "HNL", "XYZ"), + tzone = c( + "America/Los_Angeles", + "America/New_York", + "America/Denver", + "Pacific/Honolulu", + NA_character_ + ) + ) +} + +test_that("airport-local wall clocks become correct UTC instants", { + # Simulates raw flights data: airport-local wall clock implicitly tagged UTC. + flights <- dplyr::tibble( + origin = c("PDX", "JFK", "DEN"), + time_hour = lubridate::make_datetime(2023, 1, 1, c(9, 9, 9), 0, 0) + ) + + out <- flights_time_hour_to_utc(flights, mock_airports()) + + expect_setequal(colnames(out), c("origin", "time_hour")) + expect_s3_class(out$time_hour, "POSIXct") + + # Each row's instant equals "9am at airport local tz" expressed as UTC + expected <- c( + PDX = as.numeric(as.POSIXct("2023-01-01 09:00:00", tz = "America/Los_Angeles")), + JFK = as.numeric(as.POSIXct("2023-01-01 09:00:00", tz = "America/New_York")), + DEN = as.numeric(as.POSIXct("2023-01-01 09:00:00", tz = "America/Denver")) + ) + by_origin <- stats::setNames(as.numeric(out$time_hour), out$origin) + expect_equal(by_origin[names(expected)], expected) +}) + +test_that("flights and weather time_hour join across multiple time zones (issue #28)", { + airports <- mock_airports() + origins <- c("PDX", "JFK", "DEN", "HNL") + + # Flights data: airport-local 9am scheduled departure + flights <- dplyr::tibble( + origin = origins, + time_hour = lubridate::make_datetime(2023, 1, 1, 9, 0, 0) + ) %>% + flights_time_hour_to_utc(airports) + + # Weather data: UTC observation at the corresponding instant (as produced by + # get_weather_for_station with `tz = "GMT"`) + utc_hour <- c(PDX = 17L, JFK = 14L, DEN = 16L, HNL = 19L) + weather <- dplyr::tibble( + origin = names(utc_hour), + year = 2023L, month = 1L, day = 1L, + hour = unname(utc_hour), + time_hour = ISOdatetime(2023, 1, 1, unname(utc_hour), 0, 0, tz = "GMT") + ) + + joined <- dplyr::inner_join(flights, weather, by = c("origin", "time_hour")) + expect_equal(nrow(joined), length(origins)) + expect_setequal(joined$origin, origins) +}) + +test_that("origins with NA tzone leave time_hour unchanged", { + raw <- dplyr::tibble( + origin = c("XYZ", "XYZ"), + time_hour = lubridate::make_datetime(2023, 1, 1, c(9, 10), 0, 0) + ) + + out <- flights_time_hour_to_utc(raw, mock_airports()) + expect_equal(as.numeric(out$time_hour), as.numeric(raw$time_hour)) +}) + +test_that("row order is preserved across multi-tz input", { + flights <- dplyr::tibble( + origin = c("JFK", "PDX", "DEN", "JFK", "PDX"), + time_hour = lubridate::make_datetime(2023, 1, 1, c(6, 7, 8, 9, 10), 0, 0) + ) + + out <- flights_time_hour_to_utc(flights, mock_airports()) + expect_equal(out$origin, flights$origin) +}) + +test_that("output column is tagged UTC even for single-tz input", { + flights <- dplyr::tibble( + origin = c("JFK", "JFK", "JFK"), + time_hour = lubridate::make_datetime(2023, 6, 15, c(8, 12, 20), 0, 0) + ) + airports <- dplyr::tibble( + faa = "JFK", + tzone = "America/New_York" + ) + + out <- flights_time_hour_to_utc(flights, airports) + + expect_equal(lubridate::tz(out$time_hour), "UTC") + # 8am EDT == 12:00 UTC on June 15 (DST in effect) + expect_equal( + as.numeric(out$time_hour[1]), + as.numeric(as.POSIXct("2023-06-15 08:00:00", tz = "America/New_York")) + ) +}) + +test_that("DST shift is respected within a single airport (EST vs EDT)", { + # 9am at JFK is UTC-5 in January but UTC-4 in July; if the helper used a + # fixed offset instead of an IANA tz, both rows would land at the same UTC + # hour. + flights <- dplyr::tibble( + origin = c("JFK", "JFK"), + time_hour = c( + lubridate::make_datetime(2023, 1, 15, 9, 0, 0), + lubridate::make_datetime(2023, 7, 15, 9, 0, 0) + ) + ) + airports <- dplyr::tibble(faa = "JFK", tzone = "America/New_York") + + out <- flights_time_hour_to_utc(flights, airports) + + expect_equal(lubridate::hour(out$time_hour), c(14L, 13L)) +}) + +test_that("round-tripping UTC back to each origin's local tz recovers the input wall clock", { + airports <- mock_airports() + raw <- dplyr::tibble( + origin = c("PDX", "JFK", "DEN", "HNL"), + time_hour = lubridate::make_datetime(2023, 6, 15, c(6, 9, 12, 15), 30, 0) + ) + + out <- flights_time_hour_to_utc(raw, airports) + + tz_lookup <- airports$tzone[match(out$origin, airports$faa)] + recovered_hour <- purrr::map2_int(out$time_hour, tz_lookup, function(t, tz) { + as.integer(lubridate::hour(lubridate::with_tz(t, tz))) + }) + recovered_minute <- purrr::map2_int(out$time_hour, tz_lookup, function(t, tz) { + as.integer(lubridate::minute(lubridate::with_tz(t, tz))) + }) + + expect_equal(recovered_hour, c(6L, 9L, 12L, 15L)) + expect_equal(recovered_minute, rep(30L, 4)) +}) + +test_that("empty input returns an empty data frame with a UTC time_hour column", { + empty <- dplyr::tibble( + origin = character(0), + time_hour = as.POSIXct(character(0), tz = "UTC") + ) + + out <- flights_time_hour_to_utc(empty, mock_airports()) + + expect_equal(nrow(out), 0L) + expect_s3_class(out$time_hour, "POSIXct") + expect_equal(lubridate::tz(out$time_hour), "UTC") +})