Skip to content

Instantly share code, notes, and snippets.

@artlogic
Created August 25, 2016 18:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save artlogic/61f54ba6bad505b81ecca94d1a08d376 to your computer and use it in GitHub Desktop.
Save artlogic/61f54ba6bad505b81ecca94d1a08d376 to your computer and use it in GitHub Desktop.
GPS Scraper
from io import BytesIO
from bs4 import BeautifulSoup
import exifread
import requests
# Scrape one tinypic browse page and print the EXIF GPS tags of every
# image linked from it.
#
# the url to rip from... you could easily use .format to replace 1
# with any page you like, or even loop through many pages
url = 'http://tinypic.com/images.php?page=1'
# seconds to wait on any single HTTP request - requests has no default
# timeout, so without this a stalled server would hang the script forever
timeout = 30

# retrieve the URL and fail loudly on an HTTP error status rather than
# trying to parse an error page
summary_req = requests.get(url, timeout=timeout)
summary_req.raise_for_status()
# parse the text of the request into HTML
summary_html = BeautifulSoup(summary_req.text, 'html.parser')
# find the <div class="browse">...</div>
browse_div = summary_html.find('div', class_='browse')
# find() returns None when nothing matches - stop with a readable message
# instead of an AttributeError on the next line
if browse_div is None:
    raise SystemExit('No <div class="browse"> found at {}'.format(url))
# find all the <a href> tags inside the browse div; href=True skips
# anchors that have no href attribute at all
browse_links = browse_div.find_all('a', href=True)

# loop over the browse links
for link in browse_links:
    # the URL for the individual image page
    image_page = link['href']
    # retrieve the image page - a failure here only skips this entry,
    # it does not abort the whole run
    try:
        image_page_req = requests.get(image_page, timeout=timeout)
        image_page_req.raise_for_status()
    except requests.RequestException as exc:
        print('Skipping {}: {}'.format(image_page, exc))
        print()
        continue
    # parse the image page HTML
    image_page_html = BeautifulSoup(image_page_req.text, 'html.parser')
    # find the <div id="imgFrame">...</div>
    img_frame_div = image_page_html.find('div', id='imgFrame')
    # find the first <a href> inside the imgFrame div (if there is one)
    img_frame_anchor = None
    if img_frame_div is not None:
        img_frame_anchor = img_frame_div.find('a', href=True)
    if img_frame_anchor is None:
        print('Skipping {}: no image link found'.format(image_page))
        print()
        continue
    # the value of href is the URL of the actual image
    img_frame_link = img_frame_anchor['href']
    # request the actual image (jpg)
    try:
        image_req = requests.get(img_frame_link, timeout=timeout)
        image_req.raise_for_status()
    except requests.RequestException as exc:
        print('Skipping {}: {}'.format(img_frame_link, exc))
        print()
        continue
    # convert the response into an in memory file-like object
    image_file = BytesIO(image_req.content)
    # look for EXIF information
    tags = exifread.process_file(image_file)
    # print the link to the image
    print(img_frame_link)
    # print the GPS info - we use tags.get('blah') as opposed to
    # tags['blah'] because if 'blah' doesn't exist, get returns None
    # instead of giving us an error
    print('Long: {}'.format(tags.get('GPS GPSLongitude')))
    print('Long Ref: {}'.format(tags.get('GPS GPSLongitudeRef')))
    print('Lat: {}'.format(tags.get('GPS GPSLatitude')))
    print('Lat Ref: {}'.format(tags.get('GPS GPSLatitudeRef')))
    # print a blank line before the next entry
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment