I’ve been working on incorporating new faceting capabilities in rgbif; see, e.g., http://lists.gbif.org/pipermail/api-users/2016-June/000344.html and http://www.gbif.org/developer/occurrence#search
Install the latest version from GitHub
devtools::install_github("ropensci/rgbif")
library("rgbif")
occ_facet
There’s a new function, occ_facet(), that works on the same API route as occ_search() and occ_data(), but it only supports facet parameters and only returns facet data.
Facet by country
occ_facet(facet = "country")
#> $country
#> # A tibble: 10 × 2
#> name count
#> <chr> <int>
#> 1 US 209871927
#> 2 SE 51537511
#> 3 AU 42157681
#> 4 FR 34059936
#> 5 NO 24960097
#> 6 CA 23625498
#> 7 ZA 22449374
#> 8 NL 19061219
#> 9 DE 15180887
#> 10 GB 14919763
Facet by country and basisOfRecord
occ_facet(facet = c("country", "basisOfRecord"))
#> $country
#> # A tibble: 10 × 2
#> name count
#> <chr> <int>
#> 1 US 209871927
#> 2 SE 51537511
#> 3 AU 42157681
#> 4 FR 34059936
#> 5 NO 24960097
#> 6 CA 23625498
#> 7 ZA 22449374
#> 8 NL 19061219
#> 9 DE 15180887
#> 10 GB 14919763
#>
#> $basisOfRecord
#> # A tibble: 9 × 2
#> name count
#> <chr> <int>
#> 1 HUMAN_OBSERVATION 406267079
#> 2 PRESERVED_SPECIMEN 119189526
#> 3 OBSERVATION 46093630
#> 4 UNKNOWN 35796627
#> 5 FOSSIL_SPECIMEN 6080382
#> 6 MACHINE_OBSERVATION 2963529
#> 7 LIVING_SPECIMEN 989833
#> 8 LITERATURE 504040
#> 9 MATERIAL_SAMPLE 225509
Facet by country, basisOfRecord, and hasCoordinate, limiting country to 3 records and basisOfRecord to 6 records
occ_facet(
facet = c("country", "basisOfRecord", "hasCoordinate"),
country.facetLimit = 3,
basisOfRecord.facetLimit = 6
)
#> $country
#> # A tibble: 3 × 2
#> name count
#> <chr> <int>
#> 1 US 209871927
#> 2 SE 51537511
#> 3 AU 42157681
#>
#> $hasCoordinate
#> # A tibble: 2 × 2
#> name count
#> <chr> <int>
#> 1 true 548406002
#> 2 false 69704153
#>
#> $basisOfRecord
#> # A tibble: 6 × 2
#> name count
#> <chr> <int>
#> 1 HUMAN_OBSERVATION 406267079
#> 2 PRESERVED_SPECIMEN 119189526
#> 3 OBSERVATION 46093630
#> 4 UNKNOWN 35796627
#> 5 FOSSIL_SPECIMEN 6080382
#> 6 MACHINE_OBSERVATION 2963529
occ_search
Faceting is also available in occ_search()
Facet by country and basisOfRecord
(x <- occ_search(facet = c("country", "basisOfRecord"), limit = 10))
#> Records found [618110155]
#> Records returned [10]
#> No. unique hierarchies [7]
#> No. media records [1]
#> No. facets [2]
#> Args [limit=10, offset=0, facet=country, facet=basisOfRecord, fields=all]
#> # A tibble: 10 × 81
#> name key decimalLatitude decimalLongitude
#> <chr> <int> <dbl> <dbl>
#> 1 Vulpes vulpes 1253525903 53.78583 -9.23722
#> 2 Lycopodium 1253526238 -21.89889 -41.91194
#> 3 Dioscorea campanulata 1253526327 -21.86694 -41.91333
#> 4 Ericaceae 1253526332 -21.86333 -41.90667
#> 5 Asteraceae 1253526347 -21.86333 -41.90667
#> 6 Solenopsis 1253919861 -25.83243 30.00409
#> 7 Anoplolepis steingroeveri 1253919863 -28.97908 26.08894
#> 8 Anoplolepis steingroeveri 1253919874 -28.97908 26.08894
#> 9 Solenopsis 1253919878 -25.83243 30.00409
#> 10 Solenopsis 1253919894 -25.83243 30.00409
#> # ... with 77 more variables: issues <chr>, datasetKey <chr>,
#> # publishingOrgKey <chr>, publishingCountry <chr>, protocol <chr>,
#> # lastCrawled <chr>, lastParsed <chr>, crawlId <int>, extensions <chr>,
#> # basisOfRecord <chr>, taxonKey <int>, kingdomKey <int>,
#> # phylumKey <int>, classKey <int>, orderKey <int>, familyKey <int>,
#> # genusKey <int>, speciesKey <int>, scientificName <chr>, kingdom <chr>,
#> # phylum <chr>, order <chr>, family <chr>, genus <chr>, species <chr>,
#> # genericName <chr>, specificEpithet <chr>, taxonRank <chr>,
#> # coordinateUncertaintyInMeters <dbl>, continent <chr>, year <int>,
#> # month <int>, day <int>, eventDate <chr>, modified <chr>,
#> # lastInterpreted <chr>, license <chr>, identifiers <chr>, facts <chr>,
#> # relations <chr>, geodeticDatum <chr>, class <chr>, countryCode <chr>,
#> # country <chr>, identifier <chr>, catalogNumber <chr>,
#> # vernacularName <chr>, institutionCode <chr>, dynamicProperties <chr>,
#> # georeferenceVerificationStatus <chr>, locality <chr>, datasetID <chr>,
#> # gbifID <chr>, taxonomicStatus <chr>, collectionCode <chr>, type <chr>,
#> # stateProvince <chr>, rightsHolder <chr>, recordNumber <chr>,
#> # county <chr>, language <chr>, occurrenceID <chr>, recordedBy <chr>,
#> # otherCatalogNumbers <chr>, occurrenceRemarks <chr>,
#> # dateIdentified <chr>, identifiedBy <chr>, elevation <dbl>,
#> # habitat <chr>, nomenclaturalCode <chr>, verbatimEventDate <chr>,
#> # georeferenceRemarks <chr>, fieldNumber <chr>, preparations <chr>,
#> # ownerInstitutionCode <chr>, samplingProtocol <chr>,
#> # higherClassification <chr>
Get occurrence data
x$data
#> # A tibble: 10 × 81
#> name key decimalLatitude decimalLongitude
#> <chr> <int> <dbl> <dbl>
#> 1 Vulpes vulpes 1253525903 53.78583 -9.23722
#> 2 Lycopodium 1253526238 -21.89889 -41.91194
#> 3 Dioscorea campanulata 1253526327 -21.86694 -41.91333
#> 4 Ericaceae 1253526332 -21.86333 -41.90667
#> 5 Asteraceae 1253526347 -21.86333 -41.90667
#> 6 Solenopsis 1253919861 -25.83243 30.00409
#> 7 Anoplolepis steingroeveri 1253919863 -28.97908 26.08894
#> 8 Anoplolepis steingroeveri 1253919874 -28.97908 26.08894
#> 9 Solenopsis 1253919878 -25.83243 30.00409
#> 10 Solenopsis 1253919894 -25.83243 30.00409
#> # ... with 77 more variables: issues <chr>, datasetKey <chr>,
#> # publishingOrgKey <chr>, publishingCountry <chr>, protocol <chr>,
#> # lastCrawled <chr>, lastParsed <chr>, crawlId <int>, extensions <chr>,
#> # basisOfRecord <chr>, taxonKey <int>, kingdomKey <int>,
#> # phylumKey <int>, classKey <int>, orderKey <int>, familyKey <int>,
#> # genusKey <int>, speciesKey <int>, scientificName <chr>, kingdom <chr>,
#> # phylum <chr>, order <chr>, family <chr>, genus <chr>, species <chr>,
#> # genericName <chr>, specificEpithet <chr>, taxonRank <chr>,
#> # coordinateUncertaintyInMeters <dbl>, continent <chr>, year <int>,
#> # month <int>, day <int>, eventDate <chr>, modified <chr>,
#> # lastInterpreted <chr>, license <chr>, identifiers <chr>, facts <chr>,
#> # relations <chr>, geodeticDatum <chr>, class <chr>, countryCode <chr>,
#> # country <chr>, identifier <chr>, catalogNumber <chr>,
#> # vernacularName <chr>, institutionCode <chr>, dynamicProperties <chr>,
#> # georeferenceVerificationStatus <chr>, locality <chr>, datasetID <chr>,
#> # gbifID <chr>, taxonomicStatus <chr>, collectionCode <chr>, type <chr>,
#> # stateProvince <chr>, rightsHolder <chr>, recordNumber <chr>,
#> # county <chr>, language <chr>, occurrenceID <chr>, recordedBy <chr>,
#> # otherCatalogNumbers <chr>, occurrenceRemarks <chr>,
#> # dateIdentified <chr>, identifiedBy <chr>, elevation <dbl>,
#> # habitat <chr>, nomenclaturalCode <chr>, verbatimEventDate <chr>,
#> # georeferenceRemarks <chr>, fieldNumber <chr>, preparations <chr>,
#> # ownerInstitutionCode <chr>, samplingProtocol <chr>,
#> # higherClassification <chr>
Get facet data
x$facets
#> $country
#> # A tibble: 10 × 2
#> name count
#> <chr> <int>
#> 1 US 209871927
#> 2 SE 51537511
#> 3 AU 42157681
#> 4 FR 34059936
#> 5 NO 24960097
#> 6 CA 23625498
#> 7 ZA 22449374
#> 8 NL 19061219
#> 9 DE 15180887
#> 10 GB 14919763
#>
#> $basisOfRecord
#> # A tibble: 9 × 2
#> name count
#> <chr> <int>
#> 1 HUMAN_OBSERVATION 406267079
#> 2 PRESERVED_SPECIMEN 119189526
#> 3 OBSERVATION 46093630
#> 4 UNKNOWN 35796627
#> 5 FOSSIL_SPECIMEN 6080382
#> 6 MACHINE_OBSERVATION 2963529
#> 7 LIVING_SPECIMEN 989833
#> 8 LITERATURE 504040
#> 9 MATERIAL_SAMPLE 225509
Paging for each faceted variable (here with limit = 0, so no occurrence records are returned, only facet data)
(x <- occ_search(
facet = c("country", "basisOfRecord", "hasCoordinate"),
country.facetLimit = 3,
basisOfRecord.facetLimit = 6,
limit = 0
))
#> Records found [618110155]
#> Records returned [0]
#> No. unique hierarchies [0]
#> No. media records [0]
#> No. facets [3]
#> Args [limit=0, offset=0, facet=country, facet=basisOfRecord,
#> facet=hasCoordinate, country.facetLimit=3,
#> basisOfRecord.facetLimit=6, fields=all]
x$facets
#> $country
#> # A tibble: 3 × 2
#> name count
#> <chr> <int>
#> 1 US 209871927
#> 2 SE 51537511
#> 3 AU 42157681
#>
#> $hasCoordinate
#> # A tibble: 2 × 2
#> name count
#> <chr> <int>
#> 1 true 548406002
#> 2 false 69704153
#>
#> $basisOfRecord
#> # A tibble: 6 × 2
#> name count
#> <chr> <int>
#> 1 HUMAN_OBSERVATION 406267079
#> 2 PRESERVED_SPECIMEN 119189526
#> 3 OBSERVATION 46093630
#> 4 UNKNOWN 35796627
#> 5 FOSSIL_SPECIMEN 6080382
#> 6 MACHINE_OBSERVATION 2963529
occ_data
Faceting won’t be available in the occ_data() function; that function focuses solely on getting occurrence data as fast as possible.
Feedback?
What do you think? Let us know!