Skip to content

Instantly share code, notes, and snippets.

@jeffThompson
Last active June 19, 2017 21:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffThompson/7e2d3fdf148614bf99bb4f3ce8b53d62 to your computer and use it in GitHub Desktop.
Save jeffThompson/7e2d3fdf148614bf99bb4f3ce8b53d62 to your computer and use it in GitHub Desktop.
A little Python script that automatically downloads images associated with a patent listing
'''
GET PATENT IMAGES
Jeff Thompson | 2017 | jeffreythompson.org
A little Python script that automatically downloads
images associated with a patent listing. Use Google's
Patent Search first, then the "Download (CSV)" button
to save a file with all the results.
'''
import csv, urllib, urllib2, re, os
csv_filename = 'search.csv'     # file to load from
image_folder = 'images'         # folder to save images to (will be created)

# Read the exported search results and keep, for every row, the three
# columns the rest of the script uses: id (column 0), date (column 7) and
# listing url (column 8).
# NOTE(review): the column positions presumably match the layout of Google
# Patents' "Download (CSV)" export -- confirm if that format changes.
print('extracting urls...')
listings = []
with open(csv_filename) as f:
    # The first two lines are not data: search details, then the csv header.
    # next(f, None) instead of f.next() so a truncated file does not raise.
    next(f, None)
    next(f, None)
    for row in csv.reader(f, quotechar='"'):
        # csv.reader yields an empty list for a blank line; skip anything
        # too short to hold the url column rather than dying on IndexError.
        if len(row) < 9:
            continue
        # Kept as a list (not a tuple): the download step appends the image
        # urls it finds to each entry.
        listings.append([row[0], row[7], row[8]])
print('- found ' + str(len(listings)))
# For every listing: fetch its page, pull out the full-size image urls,
# remember them on the listing (as a fourth element) and download each image
# into image_folder as "<id>-<NNN>.png".
print('getting images...')
if not os.path.exists(image_folder):
    os.mkdir(image_folder)

# use a "user agent" otherwise we'll get blocked by Google :)
# (built once, outside the loop -- it is the same for every request)
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
headers = {'User-Agent': user_agent}

# Matches the content attribute of each <meta itemprop="full" ...> tag.
image_pattern = re.compile('<meta itemprop="full" content="(.*?)"', re.M)

for listing in listings:
    patent_id = listing[0]
    url = listing[2]
    print('- ' + url)

    print(' - downloading page source...')
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # always release the connection, even if the read fails part-way
        response.close()

    images = image_pattern.findall(html)
    # stored on the listing so the summary file can list the image urls
    listing.append(images)

    print(' - downloading ' + str(len(images)) + ' images...')
    for i, image in enumerate(images):
        # zero-padded index keeps the files of one patent in order
        filename = patent_id + '-' + str(i).zfill(3) + '.png'
        urllib.urlretrieve(image, os.path.join(image_folder, filename))
# Write one summary line per listing to listings.csv: id, date, url, then
# all of that listing's image urls comma-joined inside one quoted field.
print('saving listing data to file...')
with open('listings.csv', 'w') as out:
    out.write('id,date,url,images\n')
    for entry in listings:
        image_field = '"' + ','.join(entry[3]) + '"'
        out.write(','.join([entry[0], entry[1], entry[2], image_field]) + '\n')
print('- all done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment