Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Imports:
tidyr
URL: https://github.com/simonpcouch/anyflights, https://simonpcouch.github.io/anyflights/
BugReports: https://github.com/simonpcouch/anyflights/issues
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Encoding: UTF-8
Suggests:
testthat,
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# anyflights (development version)

* `get_flights()` now reinterprets the BTS source's airport-local scheduled
departure wall clocks as proper UTC instants in the `time_hour` column,
using each origin's IANA time zone (looked up from `get_airports()$tzone`).
`get_weather()$time_hour` already publishes UTC observation instants, so
the canonical `(origin, time_hour)` join between flights and weather now
returns correct rows across multiple time zones, where previously the two
columns disagreed for any non-UTC airport (#28, @ismayc).

# anyflights 0.3.5

* Include `tz = "GMT"` argument to `ISOdatetime()` so that weather output isn't
Expand Down
17 changes: 13 additions & 4 deletions R/get_flights.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,13 @@
#' \code{\link{get_airports}} for additional metadata.}
#' \item{\code{air_time}}{Amount of time spent in the air, in minutes}
#' \item{\code{distance}}{Distance between airports, in miles}
#' \item{\code{time_hour}}{Scheduled date and hour of the flight as a
#' \code{POSIXct} date. Along with \code{origin}, can be used to join
#' flights data to weather data.}
#' \item{\code{time_hour}}{Scheduled date and hour of the flight as a UTC
#' \code{POSIXct} instant. The BTS source publishes scheduled departures in
#' the airport's local wall clock; \code{get_flights()} interprets those
#' wall clocks using the IANA \code{tzone} for each \code{origin} (from
#' \code{\link{get_airports}}) and stores the resulting UTC instant. Along
#' with \code{origin}, this column can be used to join flights data to
#' weather data.}
#' }
#'
#' @note
Expand Down Expand Up @@ -122,7 +126,12 @@ get_flights <- function(station, year, month = 1:12, dir = NULL, ...) {
station = station) %>%
dplyr::bind_rows() %>%
dplyr::arrange(year, month, day, dep_time)


# convert airport-local wall clocks (stored as naive UTC) to true UTC
# instants so the (origin, time_hour) join with get_weather() works across
# multiple time zones (#28)
flights <- flights_time_hour_to_utc(flights, get_airports())

# get rid of the "raw" data
unlink(x = flight_exdir, recursive = TRUE)

Expand Down
19 changes: 10 additions & 9 deletions R/get_weather.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,19 @@
#'
#' @return A data frame with ~1k-25k rows and 15 variables:
#' \describe{
#' \item{\code{origin}}{Weather station. Named \code{origin} to facilitate
#' \item{\code{origin}}{Weather station. Named \code{origin} to facilitate
#' merging with flights data}
#' \item{\code{year, month, day, hour}}{Time of recording, UTC}
#' \item{\code{year, month, day, hour}}{Time of recording in UTC.}
#' \item{\code{temp, dewp}}{Temperature and dewpoint in F}
#' \item{\code{humid}}{Relative humidity}
#' \item{\code{wind_dir, wind_speed, wind_gust}}{Wind direction (in degrees),
#' \item{\code{wind_dir, wind_speed, wind_gust}}{Wind direction (in degrees),
#' speed and gust speed (in mph)}
#' \item{\code{precip}}{Precipitation, in inches}
#' \item{\code{pressure}}{Sea level pressure in millibars}
#' \item{\code{visib}}{Visibility in miles}
#' \item{\code{time_hour}}{Date and hour of the recording as a \code{POSIXct}
#' date, UTC}
#' \item{\code{time_hour}}{Date and hour of the recording as a UTC
#' \code{POSIXct} instant. Along with \code{origin}, can be used to join
#' weather data to flights data.}
#' }
#' @source ASOS download from Iowa Environmental Mesonet,
#' \url{https://mesonet.agron.iastate.edu/request/download.phtml}
Expand Down Expand Up @@ -66,14 +67,14 @@ get_weather <- function(station, year, month = 1:12, dir = NULL) {
dir_is_null <- FALSE
}

weather <- purrr::map(station,
get_weather_for_station,
weather <- purrr::map(station,
get_weather_for_station,
year = year,
dir = dir,
dir = dir,
month_and_day_range = month_and_day_range,
month = month) %>%
dplyr::bind_rows()

# save the data if the user supplied a directory
if (!dir_is_null) {
save(weather, file = paste0(dir, "/weather.rda"), compress = "xz")
Expand Down
25 changes: 25 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,31 @@ download_file_wrapper <- function(url, file_path, quiet = TRUE){
out
}

# Convert `time_hour` from the airport-local wall clock that the BTS data
# arrives with (numerically stored as UTC, e.g. 9am-Pacific encoded as
# "2023-01-01 09:00:00 UTC") into the correct UTC instant ("2023-01-01
# 17:00:00 UTC"). Joining against `get_weather()`, which already publishes
# UTC observation instants, then works correctly across multiple origin time
# zones. Row order in `data` is preserved.
flights_time_hour_to_utc <- function(data, airports_data) {
tz_lookup <- airports_data$tzone[match(data$origin, airports_data$faa)]
unique_tz <- unique(stats::na.omit(tz_lookup))

out <- lubridate::with_tz(data$time_hour, "UTC")
for (tz in unique_tz) {
idx <- which(!is.na(tz_lookup) & tz_lookup == tz)
if (length(idx)) {
out[idx] <- lubridate::with_tz(
lubridate::force_tz(data$time_hour[idx], tzone = tz),
"UTC"
)
}
}
data$time_hour <- out
data
}


# get_flights utilities --------------------------------------------------

download_month <- function(year, month, dir, flight_exdir, pb, diff_fn) {
Expand Down
10 changes: 7 additions & 3 deletions man/get_flights.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions man/get_weather.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

155 changes: 155 additions & 0 deletions tests/testthat/test-8-time-hour-tz.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
context("flights_time_hour_to_utc")

# A small in-memory airports lookup so these tests don't hit the network.
mock_airports <- function() {
dplyr::tibble(
faa = c("PDX", "JFK", "DEN", "HNL", "XYZ"),
tzone = c(
"America/Los_Angeles",
"America/New_York",
"America/Denver",
"Pacific/Honolulu",
NA_character_
)
)
}

test_that("airport-local wall clocks become correct UTC instants", {
# Simulates raw flights data: airport-local wall clock implicitly tagged UTC.
flights <- dplyr::tibble(
origin = c("PDX", "JFK", "DEN"),
time_hour = lubridate::make_datetime(2023, 1, 1, c(9, 9, 9), 0, 0)
)

out <- flights_time_hour_to_utc(flights, mock_airports())

expect_setequal(colnames(out), c("origin", "time_hour"))
expect_s3_class(out$time_hour, "POSIXct")

# Each row's instant equals "9am at airport local tz" expressed as UTC
expected <- c(
PDX = as.numeric(as.POSIXct("2023-01-01 09:00:00", tz = "America/Los_Angeles")),
JFK = as.numeric(as.POSIXct("2023-01-01 09:00:00", tz = "America/New_York")),
DEN = as.numeric(as.POSIXct("2023-01-01 09:00:00", tz = "America/Denver"))
)
by_origin <- stats::setNames(as.numeric(out$time_hour), out$origin)
expect_equal(by_origin[names(expected)], expected)
})

test_that("flights and weather time_hour join across multiple time zones (issue #28)", {
airports <- mock_airports()
origins <- c("PDX", "JFK", "DEN", "HNL")

# Flights data: airport-local 9am scheduled departure
flights <- dplyr::tibble(
origin = origins,
time_hour = lubridate::make_datetime(2023, 1, 1, 9, 0, 0)
) %>%
flights_time_hour_to_utc(airports)

# Weather data: UTC observation at the corresponding instant (as produced by
# get_weather_for_station with `tz = "GMT"`)
utc_hour <- c(PDX = 17L, JFK = 14L, DEN = 16L, HNL = 19L)
weather <- dplyr::tibble(
origin = names(utc_hour),
year = 2023L, month = 1L, day = 1L,
hour = unname(utc_hour),
time_hour = ISOdatetime(2023, 1, 1, unname(utc_hour), 0, 0, tz = "GMT")
)

joined <- dplyr::inner_join(flights, weather, by = c("origin", "time_hour"))
expect_equal(nrow(joined), length(origins))
expect_setequal(joined$origin, origins)
})

test_that("origins with NA tzone leave time_hour unchanged", {
raw <- dplyr::tibble(
origin = c("XYZ", "XYZ"),
time_hour = lubridate::make_datetime(2023, 1, 1, c(9, 10), 0, 0)
)

out <- flights_time_hour_to_utc(raw, mock_airports())
expect_equal(as.numeric(out$time_hour), as.numeric(raw$time_hour))
})

test_that("row order is preserved across multi-tz input", {
flights <- dplyr::tibble(
origin = c("JFK", "PDX", "DEN", "JFK", "PDX"),
time_hour = lubridate::make_datetime(2023, 1, 1, c(6, 7, 8, 9, 10), 0, 0)
)

out <- flights_time_hour_to_utc(flights, mock_airports())
expect_equal(out$origin, flights$origin)
})

test_that("output column is tagged UTC even for single-tz input", {
flights <- dplyr::tibble(
origin = c("JFK", "JFK", "JFK"),
time_hour = lubridate::make_datetime(2023, 6, 15, c(8, 12, 20), 0, 0)
)
airports <- dplyr::tibble(
faa = "JFK",
tzone = "America/New_York"
)

out <- flights_time_hour_to_utc(flights, airports)

expect_equal(lubridate::tz(out$time_hour), "UTC")
# 8am EDT == 12:00 UTC on June 15 (DST in effect)
expect_equal(
as.numeric(out$time_hour[1]),
as.numeric(as.POSIXct("2023-06-15 08:00:00", tz = "America/New_York"))
)
})

test_that("DST shift is respected within a single airport (EST vs EDT)", {
# 9am at JFK is UTC-5 in January but UTC-4 in July; if the helper used a
# fixed offset instead of an IANA tz, both rows would land at the same UTC
# hour.
flights <- dplyr::tibble(
origin = c("JFK", "JFK"),
time_hour = c(
lubridate::make_datetime(2023, 1, 15, 9, 0, 0),
lubridate::make_datetime(2023, 7, 15, 9, 0, 0)
)
)
airports <- dplyr::tibble(faa = "JFK", tzone = "America/New_York")

out <- flights_time_hour_to_utc(flights, airports)

expect_equal(lubridate::hour(out$time_hour), c(14L, 13L))
})

test_that("round-tripping UTC back to each origin's local tz recovers the input wall clock", {
airports <- mock_airports()
raw <- dplyr::tibble(
origin = c("PDX", "JFK", "DEN", "HNL"),
time_hour = lubridate::make_datetime(2023, 6, 15, c(6, 9, 12, 15), 30, 0)
)

out <- flights_time_hour_to_utc(raw, airports)

tz_lookup <- airports$tzone[match(out$origin, airports$faa)]
recovered_hour <- purrr::map2_int(out$time_hour, tz_lookup, function(t, tz) {
as.integer(lubridate::hour(lubridate::with_tz(t, tz)))
})
recovered_minute <- purrr::map2_int(out$time_hour, tz_lookup, function(t, tz) {
as.integer(lubridate::minute(lubridate::with_tz(t, tz)))
})

expect_equal(recovered_hour, c(6L, 9L, 12L, 15L))
expect_equal(recovered_minute, rep(30L, 4))
})

test_that("empty input returns an empty data frame with a UTC time_hour column", {
empty <- dplyr::tibble(
origin = character(0),
time_hour = as.POSIXct(character(0), tz = "UTC")
)

out <- flights_time_hour_to_utc(empty, mock_airports())

expect_equal(nrow(out), 0L)
expect_s3_class(out$time_hour, "POSIXct")
expect_equal(lubridate::tz(out$time_hour), "UTC")
})
Loading