rgbif - draft faceting tools

gbif
rgbif
Tags: gbif, rgbif

#1

I’ve been working on incorporating new faceting capabilities in rgbif, see e.g., http://lists.gbif.org/pipermail/api-users/2016-June/000344.html and http://www.gbif.org/developer/occurrence#search

Install the latest version from GitHub

devtools::install_github("ropensci/rgbif")
library("rgbif")

occ_facet

There’s a new function, occ_facet(), that uses the same API route
as occ_search() and occ_data(), but it only supports facet parameters
and only returns facet data.

Facet by country

occ_facet(facet = "country")
#> $country
#> # A tibble: 10 × 2
#>     name     count
#>    <chr>     <int>
#> 1     US 209871927
#> 2     SE  51537511
#> 3     AU  42157681
#> 4     FR  34059936
#> 5     NO  24960097
#> 6     CA  23625498
#> 7     ZA  22449374
#> 8     NL  19061219
#> 9     DE  15180887
#> 10    GB  14919763

Facet by country and basisOfRecord

occ_facet(facet = c("country", "basisOfRecord"))
#> $country
#> # A tibble: 10 × 2
#>     name     count
#>    <chr>     <int>
#> 1     US 209871927
#> 2     SE  51537511
#> 3     AU  42157681
#> 4     FR  34059936
#> 5     NO  24960097
#> 6     CA  23625498
#> 7     ZA  22449374
#> 8     NL  19061219
#> 9     DE  15180887
#> 10    GB  14919763
#> 
#> $basisOfRecord
#> # A tibble: 9 × 2
#>                  name     count
#>                 <chr>     <int>
#> 1   HUMAN_OBSERVATION 406267079
#> 2  PRESERVED_SPECIMEN 119189526
#> 3         OBSERVATION  46093630
#> 4             UNKNOWN  35796627
#> 5     FOSSIL_SPECIMEN   6080382
#> 6 MACHINE_OBSERVATION   2963529
#> 7     LIVING_SPECIMEN    989833
#> 8          LITERATURE    504040
#> 9     MATERIAL_SAMPLE    225509

Facet by country, basisOfRecord, and hasCoordinate, limiting country to
3 records and basisOfRecord to 6 records

occ_facet(
  facet = c("country", "basisOfRecord", "hasCoordinate"),
  country.facetLimit = 3,
  basisOfRecord.facetLimit = 6
)
#> $country
#> # A tibble: 3 × 2
#>    name     count
#>   <chr>     <int>
#> 1    US 209871927
#> 2    SE  51537511
#> 3    AU  42157681
#> 
#> $hasCoordinate
#> # A tibble: 2 × 2
#>    name     count
#>   <chr>     <int>
#> 1  true 548406002
#> 2 false  69704153
#> 
#> $basisOfRecord
#> # A tibble: 6 × 2
#>                  name     count
#>                 <chr>     <int>
#> 1   HUMAN_OBSERVATION 406267079
#> 2  PRESERVED_SPECIMEN 119189526
#> 3         OBSERVATION  46093630
#> 4             UNKNOWN  35796627
#> 5     FOSSIL_SPECIMEN   6080382
#> 6 MACHINE_OBSERVATION   2963529

occ_search

Faceting is also in occ_search()

Facet by country and basisOfRecord

(x <- occ_search(facet = c("country", "basisOfRecord"), limit = 10))
#> Records found [618110155] 
#> Records returned [10] 
#> No. unique hierarchies [7] 
#> No. media records [1] 
#> No. facets [2] 
#> Args [limit=10, offset=0, facet=country, facet=basisOfRecord, fields=all] 
#> # A tibble: 10 × 81
#>                         name        key decimalLatitude decimalLongitude
#>                        <chr>      <int>           <dbl>            <dbl>
#> 1              Vulpes vulpes 1253525903        53.78583         -9.23722
#> 2                 Lycopodium 1253526238       -21.89889        -41.91194
#> 3      Dioscorea campanulata 1253526327       -21.86694        -41.91333
#> 4                  Ericaceae 1253526332       -21.86333        -41.90667
#> 5                 Asteraceae 1253526347       -21.86333        -41.90667
#> 6                 Solenopsis 1253919861       -25.83243         30.00409
#> 7  Anoplolepis steingroeveri 1253919863       -28.97908         26.08894
#> 8  Anoplolepis steingroeveri 1253919874       -28.97908         26.08894
#> 9                 Solenopsis 1253919878       -25.83243         30.00409
#> 10                Solenopsis 1253919894       -25.83243         30.00409
#> # ... with 77 more variables: issues <chr>, datasetKey <chr>,
#> #   publishingOrgKey <chr>, publishingCountry <chr>, protocol <chr>,
#> #   lastCrawled <chr>, lastParsed <chr>, crawlId <int>, extensions <chr>,
#> #   basisOfRecord <chr>, taxonKey <int>, kingdomKey <int>,
#> #   phylumKey <int>, classKey <int>, orderKey <int>, familyKey <int>,
#> #   genusKey <int>, speciesKey <int>, scientificName <chr>, kingdom <chr>,
#> #   phylum <chr>, order <chr>, family <chr>, genus <chr>, species <chr>,
#> #   genericName <chr>, specificEpithet <chr>, taxonRank <chr>,
#> #   coordinateUncertaintyInMeters <dbl>, continent <chr>, year <int>,
#> #   month <int>, day <int>, eventDate <chr>, modified <chr>,
#> #   lastInterpreted <chr>, license <chr>, identifiers <chr>, facts <chr>,
#> #   relations <chr>, geodeticDatum <chr>, class <chr>, countryCode <chr>,
#> #   country <chr>, identifier <chr>, catalogNumber <chr>,
#> #   vernacularName <chr>, institutionCode <chr>, dynamicProperties <chr>,
#> #   georeferenceVerificationStatus <chr>, locality <chr>, datasetID <chr>,
#> #   gbifID <chr>, taxonomicStatus <chr>, collectionCode <chr>, type <chr>,
#> #   stateProvince <chr>, rightsHolder <chr>, recordNumber <chr>,
#> #   county <chr>, language <chr>, occurrenceID <chr>, recordedBy <chr>,
#> #   otherCatalogNumbers <chr>, occurrenceRemarks <chr>,
#> #   dateIdentified <chr>, identifiedBy <chr>, elevation <dbl>,
#> #   habitat <chr>, nomenclaturalCode <chr>, verbatimEventDate <chr>,
#> #   georeferenceRemarks <chr>, fieldNumber <chr>, preparations <chr>,
#> #   ownerInstitutionCode <chr>, samplingProtocol <chr>,
#> #   higherClassification <chr>

Get occurrence data

x$data
#> # A tibble: 10 × 81
#>                         name        key decimalLatitude decimalLongitude
#>                        <chr>      <int>           <dbl>            <dbl>
#> 1              Vulpes vulpes 1253525903        53.78583         -9.23722
#> 2                 Lycopodium 1253526238       -21.89889        -41.91194
#> 3      Dioscorea campanulata 1253526327       -21.86694        -41.91333
#> 4                  Ericaceae 1253526332       -21.86333        -41.90667
#> 5                 Asteraceae 1253526347       -21.86333        -41.90667
#> 6                 Solenopsis 1253919861       -25.83243         30.00409
#> 7  Anoplolepis steingroeveri 1253919863       -28.97908         26.08894
#> 8  Anoplolepis steingroeveri 1253919874       -28.97908         26.08894
#> 9                 Solenopsis 1253919878       -25.83243         30.00409
#> 10                Solenopsis 1253919894       -25.83243         30.00409
#> # ... with 77 more variables: issues <chr>, datasetKey <chr>,
#> #   publishingOrgKey <chr>, publishingCountry <chr>, protocol <chr>,
#> #   lastCrawled <chr>, lastParsed <chr>, crawlId <int>, extensions <chr>,
#> #   basisOfRecord <chr>, taxonKey <int>, kingdomKey <int>,
#> #   phylumKey <int>, classKey <int>, orderKey <int>, familyKey <int>,
#> #   genusKey <int>, speciesKey <int>, scientificName <chr>, kingdom <chr>,
#> #   phylum <chr>, order <chr>, family <chr>, genus <chr>, species <chr>,
#> #   genericName <chr>, specificEpithet <chr>, taxonRank <chr>,
#> #   coordinateUncertaintyInMeters <dbl>, continent <chr>, year <int>,
#> #   month <int>, day <int>, eventDate <chr>, modified <chr>,
#> #   lastInterpreted <chr>, license <chr>, identifiers <chr>, facts <chr>,
#> #   relations <chr>, geodeticDatum <chr>, class <chr>, countryCode <chr>,
#> #   country <chr>, identifier <chr>, catalogNumber <chr>,
#> #   vernacularName <chr>, institutionCode <chr>, dynamicProperties <chr>,
#> #   georeferenceVerificationStatus <chr>, locality <chr>, datasetID <chr>,
#> #   gbifID <chr>, taxonomicStatus <chr>, collectionCode <chr>, type <chr>,
#> #   stateProvince <chr>, rightsHolder <chr>, recordNumber <chr>,
#> #   county <chr>, language <chr>, occurrenceID <chr>, recordedBy <chr>,
#> #   otherCatalogNumbers <chr>, occurrenceRemarks <chr>,
#> #   dateIdentified <chr>, identifiedBy <chr>, elevation <dbl>,
#> #   habitat <chr>, nomenclaturalCode <chr>, verbatimEventDate <chr>,
#> #   georeferenceRemarks <chr>, fieldNumber <chr>, preparations <chr>,
#> #   ownerInstitutionCode <chr>, samplingProtocol <chr>,
#> #   higherClassification <chr>

Get facet data

x$facets
#> $country
#> # A tibble: 10 × 2
#>     name     count
#>    <chr>     <int>
#> 1     US 209871927
#> 2     SE  51537511
#> 3     AU  42157681
#> 4     FR  34059936
#> 5     NO  24960097
#> 6     CA  23625498
#> 7     ZA  22449374
#> 8     NL  19061219
#> 9     DE  15180887
#> 10    GB  14919763
#> 
#> $basisOfRecord
#> # A tibble: 9 × 2
#>                  name     count
#>                 <chr>     <int>
#> 1   HUMAN_OBSERVATION 406267079
#> 2  PRESERVED_SPECIMEN 119189526
#> 3         OBSERVATION  46093630
#> 4             UNKNOWN  35796627
#> 5     FOSSIL_SPECIMEN   6080382
#> 6 MACHINE_OBSERVATION   2963529
#> 7     LIVING_SPECIMEN    989833
#> 8          LITERATURE    504040
#> 9     MATERIAL_SAMPLE    225509

Paging for each faceted variable (with limit = 0, only facet data is returned, no occurrence records)

(x <- occ_search(
  facet = c("country", "basisOfRecord", "hasCoordinate"),
  country.facetLimit = 3,
  basisOfRecord.facetLimit = 6,
  limit = 0
))
#> Records found [618110155] 
#> Records returned [0] 
#> No. unique hierarchies [0] 
#> No. media records [0] 
#> No. facets [3] 
#> Args [limit=0, offset=0, facet=country, facet=basisOfRecord,
#>      facet=hasCoordinate, country.facetLimit=3,
#>      basisOfRecord.facetLimit=6, fields=all]
x$facets
#> $country
#> # A tibble: 3 × 2
#>    name     count
#>   <chr>     <int>
#> 1    US 209871927
#> 2    SE  51537511
#> 3    AU  42157681
#> 
#> $hasCoordinate
#> # A tibble: 2 × 2
#>    name     count
#>   <chr>     <int>
#> 1  true 548406002
#> 2 false  69704153
#> 
#> $basisOfRecord
#> # A tibble: 6 × 2
#>                  name     count
#>                 <chr>     <int>
#> 1   HUMAN_OBSERVATION 406267079
#> 2  PRESERVED_SPECIMEN 119189526
#> 3         OBSERVATION  46093630
#> 4             UNKNOWN  35796627
#> 5     FOSSIL_SPECIMEN   6080382
#> 6 MACHINE_OBSERVATION   2963529

occ_data

Faceting won’t be available in the occ_data() function — that function focuses solely on getting occurrence data as fast as possible.

Feedback?

What do you think? Let us know!