Skip to content

Instantly share code, notes, and snippets.

@jimapps
Last active August 29, 2015 14:19
Show Gist options
  • Save jimapps/094d8c054c8c5d5b86e9 to your computer and use it in GitHub Desktop.
Save jimapps/094d8c054c8c5d5b86e9 to your computer and use it in GitHub Desktop.
Download patent files from Google
## 0.1 get the zip file list
#' List the patent-grant zip archive links found on an index page
#'
#' Fetches `url`, parses the response as HTML and collects the `href` of every
#' anchor whose target matches `/grant_full_text/<digits>/<name>.zip`.
#'
#' @param url Address of the HTML index page to scan.
#' @return A character vector of the matching `href` values, in document
#'   order; `character(0)` when no anchor matches.
get_zip_file_list <- function(url="http://www.google.com/googlebooks/uspto-patents-grants-text.html"){
  # Namespaced calls replace the former library() calls, so calling this
  # function no longer alters the caller's search path.
  page_html <- RCurl::getURL(url)
  page_doc <- XML::htmlParse(page_html, encoding = "UTF-8")

  # Select only anchors that carry an href. With plain "//a", an anchor
  # without href yields NULL and sapply() then returned a list instead of a
  # character vector.
  anchors <- XML::getNodeSet(page_doc, "//a[@href]")
  hrefs <- vapply(
    anchors,
    XML::xmlGetAttr,
    character(1),
    name = "href",
    default = NA_character_
  )

  # grepl() is FALSE for NA, so any missing value is dropped here as well.
  hrefs[grepl("/grant_full_text/\\d+/.*\\.zip", hrefs)]
}
# 0.2 function to download each file into a per-year sub-folder of the download folder
# e.g. data/2010/
#' Download one patent zip archive into a per-year folder
#'
#' The year is taken from the numeric path segment of `url`, and the archive
#' is stored as `<download_dir>/<year>/<file name>`, e.g. `data/2010/`. A file
#' that is already present locally is not downloaded again.
#'
#' @param url Address of the zip archive, expected to look like
#'   `.../<year>/<name>.zip`.
#' @param download_dir Root folder for the downloads; it and the year folder
#'   are created when missing.
#' @param max_delay Upper bound, in seconds, of the random pause taken after a
#'   download attempt. The default gives the pause of 1/3 to 10/3 seconds in
#'   steps of 1/3; use `0` to skip the pause.
#' @return `TRUE` if the archive exists locally when the function returns,
#'   `FALSE` otherwise.
get_patent_zip_file <- function(url, download_dir="data", max_delay = 10 / 3) {
  year_dir <- gsub(".*/(\\d+)/.*\\.zip", "\\1", url)
  local_filename <- file.path(download_dir, year_dir, basename(url))

  # Only download the file if we don't already have a local copy
  if (!file.exists(local_filename)) {
    # recursive = TRUE: without it the year folder cannot be created while
    # `download_dir` itself is missing, and every download then fails.
    dir.create(dirname(local_filename), showWarnings = FALSE, recursive = TRUE)

    # A failed download must not abort a batch: report it and carry on.
    status <- tryCatch(
      download.file(url = url, destfile = local_filename, mode = "wb"),
      error = function(e) {
        warning(
          "Download failed for ", url, ": ", conditionMessage(e),
          call. = FALSE
        )
        1L
      }
    )

    # Remove whatever a failed transfer left behind; otherwise the partial
    # file would be reported as a success and never fetched again.
    if (!identical(as.integer(status), 0L) && file.exists(local_filename)) {
      unlink(local_filename)
    }

    # Random pause between requests to avoid hammering the server.
    if (max_delay > 0) {
      Sys.sleep(max_delay * sample(1:10, 1) / 10)
    }
  }

  # TRUE if the file exists locally, FALSE otherwise
  file.exists(local_filename)
}
# 0.3 download by year
#' Download every archive whose URL belongs to the requested year(s)
#'
#' The year of each URL is read from its numeric path segment
#' (`.../<year>/<name>.zip`). URLs whose year is in `year` are passed one by
#' one to `get_patent_zip_file()`.
#'
#' @param year Year or vector of years to download.
#' @param url.list Character vector (or list) of archive URLs.
#' @param download_dir Root folder handed to `get_patent_zip_file()`.
#' @return A data frame with one row per selected URL and the columns `url`
#'   (character) and `success` (logical). Both columns are present even when
#'   no URL is selected.
get_patent_zip_file_byYear <- function(year, url.list, download_dir="data"){
  # Accept a list of URLs as well as a character vector.
  url.list <- as.character(unlist(url.list, use.names = FALSE))

  url_year <- as.integer(gsub(".*/(\\d+)/.*\\.zip", "\\1", url.list))
  wanted <- url.list[url_year %in% year]

  # vapply() always yields a logical vector, logical(0) included, so the
  # `success` column survives when nothing matches. A NULL built up in a loop
  # was silently dropped by data.frame().
  succeeded <- vapply(
    wanted,
    function(one_url) {
      get_patent_zip_file(url = one_url, download_dir = download_dir)
    },
    logical(1),
    USE.NAMES = FALSE
  )

  data.frame(url = wanted, success = succeeded, stringsAsFactors = FALSE)
}
## 1. run
# Collect the archive links from the default index page, then download the
# archives whose URL year is 1977.
zip.url.list <- get_zip_file_list()
get_patent_zip_file_byYear(year = 1977, url.list = zip.url.list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment