# jeffThompson/GetPatentImages.py

## GetPatentImages.py

'''
GET PATENT IMAGES
Jeff Thompson | 2017 | jeffreythompson.org

A little Python script that automatically downloads
images associated with a patent listing. Use Google's
Patent Search first, then the "Download (CSV)" button
to save a file with all the results.

'''

import csv, urllib, urllib2, re, os

csv_filename = 'search.csv'   # file to load from
image_folder = 'images'       # folder to save images to (will be created)


# Read the exported search results and collect [id, date, url] for each row.
print('extracting urls...')
listings = []
with open(csv_filename) as f:
	# the export starts with two non-data lines: the search details and the
	# csv header; next(f, None) also tolerates a file shorter than that
	next(f, None)
	next(f, None)

	for row in csv.reader(f, quotechar='"'):
		# skip blank or truncated rows instead of crashing on a missing column
		if len(row) < 9:
			continue
		# columns used: 0 = id, 7 = date, 8 = url
		listings.append([row[0], row[7], row[8]])
print('- found ' + str(len(listings)))


# Fetch each listing's page, find its full-size image urls and download them.
# Every listing gets its list of image urls appended as a fourth element
# (an empty list when the page could not be loaded).
print('getting images...')
if not os.path.exists(image_folder):
	os.makedirs(image_folder)

# use a "user agent" otherwise we'll get blocked by Google :)
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
headers = { 'User-Agent': user_agent }

# image urls are taken from the <meta itemprop="full"> tags in the page source
image_pattern = re.compile('<meta itemprop="full" content="(.*?)"', re.M)

for listing in listings:
	patent_id = listing[0]
	url = listing[2]
	print('- ' + url)
	print('  - downloading page source...')

	try:
		response = urllib2.urlopen(urllib2.Request(url, None, headers))
		try:
			html = response.read()
		finally:
			# always release the connection, even if reading fails
			response.close()
	except (IOError, ValueError) as e:
		# network/HTTP errors are IOErrors; a malformed url raises ValueError.
		# Keep going so one bad listing does not abort the whole run.
		print('  - could not load page, skipping (' + str(e) + ')')
		listing.append([])
		continue

	images = image_pattern.findall(html)
	listing.append(images)
	print('  - downloading ' + str(len(images)) + ' images...')
	for i, image in enumerate(images):
		# files are named <id>-000.png, <id>-001.png, ...
		filename = patent_id + '-' + str(i).zfill(3) + '.png'
		try:
			urllib.urlretrieve(image, os.path.join(image_folder, filename))
		except IOError as e:
			print('    - could not download ' + image + ' (' + str(e) + ')')


# Write one row per listing: id, date, url and all of its image urls.
print('saving listing data to file...')
with open('listings.csv', 'w') as f:
	# let the csv module do the quoting so a comma or quote inside a field
	# cannot corrupt the row; '\n' keeps the line endings written before
	writer = csv.writer(f, lineterminator='\n')
	writer.writerow(['id', 'date', 'url', 'images'])
	for entry in listings:
		# all image urls of a listing share a single comma-separated field
		writer.writerow([entry[0], entry[1], entry[2], ','.join(entry[3])])
print('- all done')

	'''
	GET PATENT IMAGES
	Jeff Thompson | 2017 | jeffreythompson.org

	A little Python script that automatically downloads
	images associated with a patent listing. Use Google's
	Patent Search first, then the "Download (CSV)" button
	to save a file with all the results.

	'''

	import csv, urllib, urllib2, re, os

	csv_filename = 'search.csv'   # file to load from
	image_folder = 'images'       # folder to save images to (will be created)


	# Read the exported search results and collect [id, date, url] for each row.
	print('extracting urls...')
	listings = []
	with open(csv_filename) as f:
		# the export starts with two non-data lines: the search details and the
		# csv header; next(f, None) also tolerates a file shorter than that
		next(f, None)
		next(f, None)

		for row in csv.reader(f, quotechar='"'):
			# skip blank or truncated rows instead of crashing on a missing column
			if len(row) < 9:
				continue
			# columns used: 0 = id, 7 = date, 8 = url
			listings.append([row[0], row[7], row[8]])
	print('- found ' + str(len(listings)))


	# Fetch each listing's page, find its full-size image urls and download them.
	# Every listing gets its list of image urls appended as a fourth element
	# (an empty list when the page could not be loaded).
	print('getting images...')
	if not os.path.exists(image_folder):
		os.makedirs(image_folder)

	# use a "user agent" otherwise we'll get blocked by Google :)
	user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
	headers = { 'User-Agent': user_agent }

	# image urls are taken from the <meta itemprop="full"> tags in the page source
	image_pattern = re.compile('<meta itemprop="full" content="(.*?)"', re.M)

	for listing in listings:
		patent_id = listing[0]
		url = listing[2]
		print('- ' + url)
		print('  - downloading page source...')

		try:
			response = urllib2.urlopen(urllib2.Request(url, None, headers))
			try:
				html = response.read()
			finally:
				# always release the connection, even if reading fails
				response.close()
		except (IOError, ValueError) as e:
			# network/HTTP errors are IOErrors; a malformed url raises ValueError.
			# Keep going so one bad listing does not abort the whole run.
			print('  - could not load page, skipping (' + str(e) + ')')
			listing.append([])
			continue

		images = image_pattern.findall(html)
		listing.append(images)
		print('  - downloading ' + str(len(images)) + ' images...')
		for i, image in enumerate(images):
			# files are named <id>-000.png, <id>-001.png, ...
			filename = patent_id + '-' + str(i).zfill(3) + '.png'
			try:
				urllib.urlretrieve(image, os.path.join(image_folder, filename))
			except IOError as e:
				print('    - could not download ' + image + ' (' + str(e) + ')')


	# Write one row per listing: id, date, url and all of its image urls.
	print('saving listing data to file...')
	with open('listings.csv', 'w') as f:
		# let the csv module do the quoting so a comma or quote inside a field
		# cannot corrupt the row; '\n' keeps the line endings written before
		writer = csv.writer(f, lineterminator='\n')
		writer.writerow(['id', 'date', 'url', 'images'])
		for entry in listings:
			# all image urls of a listing share a single comma-separated field
			writer.writerow([entry[0], entry[1], entry[2], ','.join(entry[3])])
	print('- all done')