Skip to content

Instantly share code, notes, and snippets.

@sckott
Last active August 29, 2015 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sckott/5077c069be86ae6096a5 to your computer and use it in GitHub Desktop.
Save sckott/5077c069be86ae6096a5 to your computer and use it in GitHub Desktop.

The scrapenames() function in taxize finds and resolves scientific names in a web page, PDF, file, or text string using the Global Names Recognition and Discovery (GNRD) web service.

Install

Install the development version of taxize from GitHub to get the latest fixes to the function

devtools::install_github("ropensci/taxize")

Load the library

library("taxize")

From a website using its URL

res <- scrapenames(url = 'http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0080498')

Detailed metadata on the response

res$meta
## $token_url
## [1] "http://gnrd.globalnames.org/name_finder.json?token=J3YNA1xETsKykQeQOjFmJg"
## 
## $input_url
## [1] "http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0080498"
## 
## $file
## [1] ""
## 
## $status
## [1] 200
## 
## $engines
## [1] "TaxonFinder" "NetiNeti"   
## 
## $unique
## [1] FALSE
## 
## $verbatim
## [1] TRUE
## 
## $english
## [1] TRUE
## 
## $execution_time
## $execution_time$find_names_duration
## [1] 5.184621
## 
## $execution_time$total_duration
## [1] 11.38191
## 
## 
## $agent
## $agent$code
## [1] "200"
## 
## $agent$content_type
## [1] "text/html;charset=UTF-8"
## 
## $agent$filename
## [1] "article.html_id=10.1371_journal.pone.0080498"
## 
## 
## $created
## [1] "2015-08-06T00:37:42Z"
## 
## $total
## [1] 362

And the data itself: each name found, with its start and end character offsets

head(res$data)
##            verbatim    scientificName offsetStart offsetEnd
## 1 Cyatta abscondita Cyatta abscondita          12        28
## 2            Scopus            Scopus        2786      2791
## 3 Cyatta abscondita Cyatta abscondita        3036      3052
## 4          Curitiba          Curitiba        4489      4496
## 5          Paulista          Paulista        4938      4945
## 6 Cyatta abscondita Cyatta abscondita        5440      5456
##      identifiedName
## 1 Cyatta abscondita
## 2            Scopus
## 3 Cyatta abscondita
## 4          Curitiba
## 5          Paulista
## 6 Cyatta abscondita

From a PDF at a URL

url <- 'http://www.plosone.org/article/fetchObject.action?uri=info%3Adoi%2F10.1371%2Fjournal.pone.0058268&representation=PDF'
res2 <- scrapenames(url = url)
head(res2$data)
##               verbatim      scientificName offsetStart offsetEnd
## 1              Xylaria             Xylaria          24        30
## 2 Dendrobium  precious Dendrobium precious          37        56
## 3              Petrini             Petrini        1495      1501
## 4              Petrini             Petrini        1505      1511
## 5           Dendrobium          Dendrobium        2027      2036
## 6         Xylariaceae,         Xylariaceae        2967      2978
##        identifiedName
## 1             Xylaria
## 2 Dendrobium precious
## 3             Petrini
## 4             Petrini
## 5          Dendrobium
## 6         Xylariaceae

From a file

Contents of the file

speciesfile <- system.file("examples", "species.txt", package = "taxize")
readLines(speciesfile)
##  [1] "Achnatherum eminens"      "Achnatherum inebrians"   
##  [3] "Achnatherum lettermanii"  "Achnatherum lobatum"     
##  [5] "Achnatherum nelsonii"     "Achnatherum pinetorum"   
##  [7] "Achnatherum purpurascens" "Achnatherum robustum"    
##  [9] "Achnatherum sibiricum"    "Achnatherum speciosum"

Find names in the file (this is too easy, but you can imagine files with other text mixed in with the names)

res3 <- scrapenames(file = speciesfile)
head(res3$data)
##                  verbatim          scientificName offsetStart offsetEnd
## 1     Achnatherum eminens     Achnatherum eminens           0        18
## 2   Achnatherum inebrians   Achnatherum inebrians          20        40
## 3 Achnatherum lettermanii Achnatherum lettermanii          42        64
## 4     Achnatherum lobatum     Achnatherum lobatum          66        84
## 5    Achnatherum nelsonii    Achnatherum nelsonii          86       105
## 6   Achnatherum pinetorum   Achnatherum pinetorum         107       127
##            identifiedName
## 1     Achnatherum eminens
## 2   Achnatherum inebrians
## 3 Achnatherum lettermanii
## 4     Achnatherum lobatum
## 5    Achnatherum nelsonii
## 6   Achnatherum pinetorum

From a text string

res4 <- scrapenames(text = 'A spider named Pardosa moesta Banks, 1892')
head(res4$data)
##         verbatim scientificName offsetStart offsetEnd identifiedName
## 1 Pardosa moesta Pardosa moesta          15        28 Pardosa moesta
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment