Skip to content

Instantly share code, notes, and snippets.

@njtierney
Created September 1, 2022 04:10
Show Gist options
  • Save njtierney/0bfaa2c1fe356accdec563c2b2ec1be4 to your computer and use it in GitHub Desktop.
Save njtierney/0bfaa2c1fe356accdec563c2b2ec1be4 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rvest)
#> 
#> Attaching package: 'rvest'
#> The following object is masked from 'package:readr':
#> 
#>     guess_encoding
library(polite)

# Wikipedia page containing the all-time Olympic Games medal tables.
url <- "https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table"

# Fetch a web page via the polite package: open a session with bow() and
# then retrieve the page content with scrape().
#
# x: URL of the page to retrieve.
# Returns whatever scrape() produces for that session; this script hands the
# result straight to html_table().
bow_scrape <- function(x) {
  session <- bow(x)
  scrape(session)
}

# Download the page and keep the scraped result for parsing.
wiki_raw <- bow_scrape(url)

# Parse the scraped page with html_table(); the result is indexed by
# position in the steps that follow.
all_tables <- html_table(wiki_raw)
#> Warning in table_fill(cells, trim = trim): NAs introduced by coercion

#> Warning in table_fill(cells, trim = trim): NAs introduced by coercion

# Inspect the first parsed table (a one-column table headed "Olympic Games").
all_tables[[1]]
#> # A tibble: 6 × 1
#>   `Olympic Games`                                                               
#>   <chr>                                                                         
#> 1 ""                                                                            
#> 2 "Main topics"                                                                 
#> 3 "Bids\nBoycotts\nCeremonies\nCharter\nHost cities\nIFs\nIOC\nMedal\nMedal tab…
#> 4 "Games"                                                                       
#> 5 "Summer\nWinterYouthAfrican\nAsian\nEuropean\nPacific\nPan-AmericanAncient\nI…
#> 6 ".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.m…

# Pull a single element out of a list of tables.
#
# table: a list (in this script, the result of html_table()).
# which: position or name of the element to extract; handed to `[[`.
# Returns the selected element itself rather than a length-one list.
which_table <- function(table, which) {
  chosen <- table[[which]]
  chosen
}

# Inspect the third table, whose first column is `Team (IOC code)`.
which_table(all_tables, 3)
#> # A tibble: 76 × 4
#>    `Team (IOC code)`         `No. Summer` `No. Winter` `No. Games`
#>    <chr>                            <int>        <int>       <int>
#>  1 Albania (ALB)                        9            5          14
#>  2 American Samoa (ASA)                 9            2          11
#>  3 Andorra (AND)                       12           13          25
#>  4 Angola (ANG)                        10            0          10
#>  5 Antigua and Barbuda (ANT)           11            0          11
#>  6 Aruba (ARU)                          9            0           9
#>  7 Bangladesh (BAN)                    10            0          10
#>  8 Belize (BIZ) [BIZ]                  13            0          13
#>  9 Benin (BEN) [BEN]                   12            0          12
#> 10 Bhutan (BHU)                        10            0          10
#> # … with 66 more rows
#> # ℹ Use `print(n = ...)` to see more rows

# Extract the first parenthesised chunk from each string, parentheses
# included, e.g. "Albania (ALB)" -> "(ALB)".
#
# x: character vector to search.
# Returns a character vector the same length as x, with NA where a string
# contains no "(...)" group.
get_brackets <- function(x) {
  # Lazy quantifier: the match stops at the first closing parenthesis.
  bracket_pattern <- "\\((.*?)\\)"
  stringr::str_extract(string = x, pattern = bracket_pattern)
}

# Strip every "(" and ")" character from each string, e.g. "(ALB)" -> "ALB".
#
# x: character vector to clean.
# Returns a character vector the same length as x with all parentheses
# deleted; other characters are left untouched.
remove_brackets <- function(x) {
  paren_pattern <- "\\(|\\)"
  # Replacing with the empty string is what str_remove_all() does.
  stringr::str_replace_all(
    string = x,
    pattern = paren_pattern,
    replacement = ""
  )
}

# Check the two helpers on a single label before applying them to a table.
remove_brackets(get_brackets("Albania (ALB)"))
#> [1] "ALB"

# Take the first column of the third table, keep the parenthesised part of
# each entry, then drop the parentheses to leave the bare code.
remove_brackets(
  get_brackets(
    pull(which_table(all_tables, 3), 1)
  )
)
#>  [1] "ALB" "ASA" "AND" "ANG" "ANT" "ARU" "BAN" "BIZ" "BEN" "BHU" "BOL" "BIH"
#> [13] "IVB" "BRU" "CAM" "CPV" "CAY" "CAF" "CHA" "COM" "CGO" "COD" "COK" "DMA"
#> [25] "ESA" "SWZ" "GEQ" "GAM" "GUM" "GUI" "GBS" "HON" "KIR" "LAO" "LES" "LBR"
#> [37] "LBA" "MAD" "MAW" "MDV" "MLI" "MLT" "MHL" "MTN" "FSM" "MON" "MYA" "NRU"
#> [49] "NEP" "NCA" "OMA" "PLW" "PLE" "PNG" "RWA" "SKN" "LCA" "VIN" "STP" "SEY"
#> [61] "SLE" "SOL" "SOM" "SSD" "TLS" "TUV" "VAN" "YEM" "COR" "MAL" "NBO" "ROC"
#> [73] "SAA" "YAR" "YMD" "ROT"

Created on 2022-09-01 by the reprex package (v2.0.1)

Session info
# Print the R version, platform details and package versions for this run.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.0 (2022-04-22)
#>  os       macOS Monterey 12.3.1
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_AU.UTF-8
#>  ctype    en_AU.UTF-8
#>  tz       Australia/Perth
#>  date     2022-09-01
#>  pandoc   2.18 @ /Applications/RStudio.app/Contents/MacOS/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package       * version    date (UTC) lib source
#>  assertthat      0.2.1      2019-03-21 [1] CRAN (R 4.2.0)
#>  backports       1.4.1      2021-12-13 [1] CRAN (R 4.2.0)
#>  broom           1.0.0      2022-07-01 [1] CRAN (R 4.2.0)
#>  cachem          1.0.6      2021-08-19 [1] CRAN (R 4.2.0)
#>  cellranger      1.1.0      2016-07-27 [1] CRAN (R 4.2.0)
#>  cli             3.3.0.9000 2022-06-15 [1] Github (r-lib/cli@31a5db5)
#>  colorspace      2.0-3      2022-02-21 [1] CRAN (R 4.2.0)
#>  crayon          1.5.1      2022-03-26 [1] CRAN (R 4.2.0)
#>  curl            4.3.2      2021-06-23 [1] CRAN (R 4.2.0)
#>  DBI             1.1.3      2022-06-18 [1] CRAN (R 4.2.0)
#>  dbplyr          2.2.1      2022-06-27 [1] CRAN (R 4.2.0)
#>  digest          0.6.29     2021-12-01 [1] CRAN (R 4.2.0)
#>  dplyr         * 1.0.9      2022-04-28 [1] CRAN (R 4.2.0)
#>  ellipsis        0.3.2      2021-04-29 [1] CRAN (R 4.2.0)
#>  evaluate        0.16       2022-08-09 [1] CRAN (R 4.2.0)
#>  fansi           1.0.3      2022-03-24 [1] CRAN (R 4.2.0)
#>  fastmap         1.1.0      2021-01-25 [1] CRAN (R 4.2.0)
#>  forcats       * 0.5.1      2021-01-27 [1] CRAN (R 4.2.0)
#>  fs              1.5.2      2021-12-08 [1] CRAN (R 4.2.0)
#>  gargle          1.2.0      2021-07-02 [1] CRAN (R 4.2.0)
#>  generics        0.1.3      2022-07-05 [1] CRAN (R 4.2.0)
#>  ggplot2       * 3.3.6      2022-05-03 [1] CRAN (R 4.2.0)
#>  glue            1.6.2      2022-02-24 [1] CRAN (R 4.2.0)
#>  googledrive     2.0.0      2021-07-08 [1] CRAN (R 4.2.0)
#>  googlesheets4   1.0.1      2022-08-13 [1] CRAN (R 4.2.0)
#>  gtable          0.3.0      2019-03-25 [1] CRAN (R 4.2.0)
#>  haven           2.5.0      2022-04-15 [1] CRAN (R 4.2.0)
#>  highr           0.9        2021-04-16 [1] CRAN (R 4.2.0)
#>  hms             1.1.1      2021-09-26 [1] CRAN (R 4.2.0)
#>  htmltools       0.5.3      2022-07-18 [1] CRAN (R 4.2.0)
#>  httr            1.4.3      2022-05-04 [1] CRAN (R 4.2.0)
#>  jsonlite        1.8.0      2022-02-22 [1] CRAN (R 4.2.0)
#>  knitr           1.39       2022-04-26 [1] CRAN (R 4.2.0)
#>  lifecycle       1.0.1      2021-09-24 [1] CRAN (R 4.2.0)
#>  lubridate       1.8.0      2021-10-07 [1] CRAN (R 4.2.0)
#>  magrittr        2.0.3      2022-03-30 [1] CRAN (R 4.2.0)
#>  memoise         2.0.1      2021-11-26 [1] CRAN (R 4.2.0)
#>  mime            0.12       2021-09-28 [1] CRAN (R 4.2.0)
#>  modelr          0.1.8      2020-05-19 [1] CRAN (R 4.2.0)
#>  munsell         0.5.0      2018-06-12 [1] CRAN (R 4.2.0)
#>  pillar          1.8.0      2022-07-18 [1] CRAN (R 4.2.0)
#>  pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 4.2.0)
#>  polite        * 0.1.2      2022-08-09 [1] CRAN (R 4.2.0)
#>  purrr         * 0.3.4      2020-04-17 [1] CRAN (R 4.2.0)
#>  R.cache         0.16.0     2022-07-21 [1] CRAN (R 4.2.0)
#>  R.methodsS3     1.8.2      2022-06-13 [1] CRAN (R 4.2.0)
#>  R.oo            1.25.0     2022-06-12 [1] CRAN (R 4.2.0)
#>  R.utils         2.12.0     2022-06-28 [1] CRAN (R 4.2.0)
#>  R6              2.5.1      2021-08-19 [1] CRAN (R 4.2.0)
#>  ratelimitr      0.4.1      2018-10-07 [1] CRAN (R 4.2.0)
#>  Rcpp            1.0.9      2022-07-08 [1] CRAN (R 4.2.0)
#>  readr         * 2.1.2      2022-01-30 [1] CRAN (R 4.2.0)
#>  readxl          1.4.0      2022-03-28 [1] CRAN (R 4.2.0)
#>  reprex          2.0.1      2021-08-05 [1] CRAN (R 4.2.0)
#>  rlang           1.0.4      2022-07-12 [1] CRAN (R 4.2.0)
#>  rmarkdown       2.14       2022-04-25 [1] CRAN (R 4.2.0)
#>  robotstxt       0.7.13     2020-09-03 [1] CRAN (R 4.2.0)
#>  rstudioapi      0.13       2020-11-12 [1] CRAN (R 4.2.0)
#>  rvest         * 1.0.2      2021-10-16 [1] CRAN (R 4.2.0)
#>  scales          1.2.0      2022-04-13 [1] CRAN (R 4.2.0)
#>  sessioninfo     1.2.2      2021-12-06 [1] CRAN (R 4.2.0)
#>  spiderbar       0.2.4      2021-05-16 [1] CRAN (R 4.2.0)
#>  stringi         1.7.8      2022-07-11 [1] CRAN (R 4.2.0)
#>  stringr       * 1.4.0      2019-02-10 [1] CRAN (R 4.2.0)
#>  styler          1.7.0      2022-03-13 [1] CRAN (R 4.2.0)
#>  tibble        * 3.1.8      2022-07-22 [1] CRAN (R 4.2.0)
#>  tidyr         * 1.2.0      2022-02-01 [1] CRAN (R 4.2.0)
#>  tidyselect      1.1.2      2022-02-21 [1] CRAN (R 4.2.0)
#>  tidyverse     * 1.3.2      2022-07-18 [1] CRAN (R 4.2.0)
#>  tzdb            0.3.0      2022-03-28 [1] CRAN (R 4.2.0)
#>  usethis         2.1.6      2022-05-25 [1] CRAN (R 4.2.0)
#>  utf8            1.2.2      2021-07-24 [1] CRAN (R 4.2.0)
#>  vctrs           0.4.1      2022-04-13 [1] CRAN (R 4.2.0)
#>  withr           2.5.0      2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun            0.32.1     2022-08-11 [1] https://yihui.r-universe.dev (R 4.2.0)
#>  xml2            1.3.3      2021-11-30 [1] CRAN (R 4.2.0)
#>  yaml            2.3.5      2022-02-21 [1] CRAN (R 4.2.0)
#> 
#>  [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment