Someone asked how to get data in month-long chunks from an individual dataset with rerddap, a general-purpose R client for ERDDAP servers.
The following are my thoughts on this:
library("rerddap")
library("dplyr")
library("lubridate")
We’ll use the dataset hycom_gom310S, found at http://upwell.pfeg.noaa.gov/erddap/griddap/hycom_gom310S.html
get info on the dataset
# Fetch the dataset's metadata from the ERDDAP server, then print its summary.
res <- info("hycom_gom310S")
res
#> <ERDDAP info> hycom_gom310S
#> Dimensions (range):
#> time: (2009-04-02T00:00:00Z, 2014-08-30T00:00:00Z)
#> latitude: (18.09165, 31.96065)
#> longitude: (-98.0, -76.40002)
#> Variables:
#> emp:
#> Units: kg/m2/s
#> mld:
#> Units: m
#> mlp:
#> Units: m
#> qtot:
#> Units: w/m2
#> ssh:
#> Units: m
#> surface_salinity_trend:
#> Units: PSU/day
#> surface_temperature_trend:
#> Units: degC/day
get start and end times
# Keep only the two global attributes that describe the time coverage and
# extract their values as a plain vector (printed because of the parentheses).
global_attrs <- res$alldata$NC_GLOBAL
coverage_rows <- filter(
  global_attrs,
  attribute_name %in% c("time_coverage_end", "time_coverage_start")
)
(times <- pull(coverage_rows, value))
#> [1] "2014-08-30 UTC" "2009-04-02 UTC"
make month long time periods
# Parse the coverage strings into date-times.
times <- lubridate::ymd_hms(times)
# Take the earliest and latest value explicitly instead of relying on the
# positional order in which the two attributes happened to be returned.
starts <- seq(min(times), max(times), by = "1 month")
# Preview the first few chunk start times; head() does not pad with NA when
# there are fewer than three chunks.
head(starts, 3)
#> [1] "2009-04-02 UTC" "2009-05-02 UTC" "2009-06-02 UTC"
Loop over each month with griddap(). Set read=FALSE to only download the files without reading them into R.
# Download one file per month-long chunk without reading it into R.
#
# Each chunk ends the day before the next chunk starts, so no day is requested
# twice and no day is skipped (assumes the dataset has daily time steps, as the
# sample output suggests). The final chunk is capped at the end of the
# dataset's time coverage so the request never runs past the available data.
ends <- c(starts[-1] - days(1), max(times))
out <- vector("list", length(starts))
for (i in seq_along(starts)) {
  # Format in UTC so the calendar date cannot shift with the local time zone.
  dates <- format(c(starts[i], ends[i]), "%Y-%m-%d", tz = "UTC")
  cat("getting ", dates[1], " to ", dates[2], "\n")
  out[[i]] <- griddap(
    res,
    time = dates,
    latitude = c(20, 21),
    longitude = c(-80, -81),
    read = FALSE
  )
}
inspect
# One element per chunk that the loop requested.
length(out)
read in netcdf files
# Open the netCDF file behind each download result, then check the class of
# every returned handle.
open_chunk <- function(chunk) {
  ncdf4::nc_open(chunk$summary$filename)
}
out2 <- lapply(out, open_chunk)
vapply(out2, class, character(1))
#> [1] "ncdf4" "ncdf4" "ncdf4"
OR - read in within the for loop
# Same request as the download-only loop, but for the first three chunks only
# and with read = TRUE so each result already carries its data.
#
# Each chunk ends the day before the next chunk starts, so the combined data
# contains no duplicated days (assumes daily time steps); the last possible
# chunk is capped at the end of the dataset's time coverage.
n_chunks <- min(3L, length(starts))
ends <- c(starts[-1] - days(1), max(times))
out_dfs <- vector("list", n_chunks)
for (i in seq_len(n_chunks)) {
  # Format in UTC so the calendar date cannot shift with the local time zone.
  dates <- format(c(starts[i], ends[i]), "%Y-%m-%d", tz = "UTC")
  cat("getting ", dates[1], " to ", dates[2], "\n")
  out_dfs[[i]] <- griddap(
    res,
    time = dates,
    latitude = c(20, 21),
    longitude = c(-80, -81),
    read = TRUE
  )
}
With read = TRUE the data is already loaded when the loop finishes, but each request takes longer because the files are read in.
Then bind everything together into one data.frame:
# Stack the per-chunk data frames into a single tibble. as_tibble() replaces
# tbl_df(), which is deprecated in current dplyr releases.
as_tibble(bind_rows(lapply(out_dfs, "[[", "data")))
#> # A tibble: 67,704 × 10
#> time lat lon emp mld mlp qtot
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2009-04-02T00:00:00Z 19.98216 -81.00000 1.001745e-07 12.47464 9.482147 -193.3023
#> 2 2009-04-02T00:00:00Z 19.98216 -80.96002 -9.975844e-07 11.03450 4.705999 -194.2480
#> 3 2009-04-02T00:00:00Z 19.98216 -80.91998 -2.570382e-07 13.87560 9.874534 -188.5700
#> 4 2009-04-02T00:00:00Z 19.98216 -80.88000 -3.189338e-06 12.41764 4.614823 -190.0845
#> 5 2009-04-02T00:00:00Z 19.98216 -80.83997 -4.883811e-06 16.26543 13.045880 -187.4980
#> 6 2009-04-02T00:00:00Z 19.98216 -80.79999 -7.715149e-06 13.75597 4.588535 -190.6221
#> 7 2009-04-02T00:00:00Z 19.98216 -80.76001 -8.380262e-06 16.91521 12.527599 -187.1993
#> 8 2009-04-02T00:00:00Z 19.98216 -80.71997 -1.041871e-05 12.47726 4.570753 -188.3784
#> 9 2009-04-02T00:00:00Z 19.98216 -80.67999 -1.129382e-05 11.95203 5.426512 -186.7068
#> 10 2009-04-02T00:00:00Z 19.98216 -80.64001 -1.149297e-05 16.31684 10.040488 -183.0806
#> # ... with 67,694 more rows, and 3 more variables: ssh <dbl>, surface_salinity_trend <dbl>,
#> # surface_temperature_trend <dbl>