Created
April 27, 2018 04:39
-
-
Save limitpointinf0/0c31b029dafac1688495b3c67e4a2ae0 to your computer and use it in GitHub Desktop.
Scrape Assistant
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import re
import string
import sys
import time
import traceback
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
""" | |
functions for retreiving and navigating through the soup by level. | |
""" | |
def get_soup(u):
    """Fetch the URL *u* and return its parsed BeautifulSoup.

    Parameters
    ----------
    u : str
        URL to fetch.

    Returns
    -------
    BeautifulSoup or None
        Parsed document on success; ``None`` if the request or parse failed
        (the traceback is printed, matching this file's error-reporting style).
    """
    try:
        # timeout prevents a dead server from hanging the scraper forever
        r = requests.get(u, timeout=30)
        soup = BeautifulSoup(r.text, 'html5lib')
        return soup
    except Exception:
        # except Exception (not bare except:) so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
        return None
def nav_tags(int_soup, tags=None, level=1):
    """Descend through *int_soup* one tag name per level.

    ``tags[0]`` is searched at level 1, ``tags[1]`` inside those results at
    level 2, and so on; ``len(tags)`` must equal *level*.

    Parameters
    ----------
    int_soup : BeautifulSoup or Tag
        Soup (or any object exposing ``findAll``) to navigate.
    tags : list[str] or None
        Tag name for each level. Defaults to an empty list.
    level : int
        Number of levels to descend.

    Returns
    -------
    list or None
        Flat list of matches at the deepest level, or ``None`` when the
        tag count does not match *level* or an error occurred.
    """
    # None default instead of a mutable [] default shared across calls
    tags = [] if tags is None else tags
    if len(tags) != level:
        print('length of tags does not equal number of levels')
        return None
    try:
        int_soup = int_soup.findAll(tags[0])
        for i in range(1, level):
            # flatten one level per step; comprehension avoids the
            # quadratic sum(list_of_lists, []) idiom
            int_soup = [hit for node in int_soup for hit in node.findAll(tags[i])]
        return int_soup
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
        return None
def get_text(soups=None):
    """Collect all text nodes from every soup in *soups*.

    Parameters
    ----------
    soups : list or None
        List of soup/tag objects exposing ``findAll(text=True)``.
        Defaults to an empty list.

    Returns
    -------
    list or None
        Flat list of text nodes in document order, or ``None`` on error
        (traceback printed, matching the file's style).
    """
    # None default instead of a mutable [] default shared across calls
    soups = [] if soups is None else soups
    try:
        # comprehension flattens without the quadratic sum(lists, []) trick
        return [text for soup in soups for text in soup.findAll(text=True)]
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
        return None
""" | |
The following functions are useful for scraping image src | |
and saving to file. | |
""" | |
def get_tag_attr(url, tag='img', attr='src'):
    """Fetch *url* and return the value of *attr* for every *tag* element.

    Parameters
    ----------
    url : str
        Page to fetch.
    tag : str
        Tag name to search for (default ``'img'``).
    attr : str
        Attribute to extract from each tag (default ``'src'``); entries are
        ``None`` for tags missing the attribute, matching ``Tag.get``.

    Returns
    -------
    list or None
        One entry per matching tag, or ``None`` on error (traceback printed).
    """
    try:
        # timeout prevents a dead server from hanging the scraper forever
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.text, 'html5lib')
        content_list = [element.get(attr) for element in soup.findAll(tag)]
        for attr_content in content_list:
            print(attr_content)
        return content_list
    except Exception:
        # except Exception (not bare except:) so KeyboardInterrupt/SystemExit
        # still propagate
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
        return None
def iterate_img_list(base_url, imgs=None, timeout=(5, 10), save=False):
    """Iterate over image paths from a page, optionally downloading each.

    Sleeps a random number of seconds (uniform over *timeout*) between
    items to throttle requests.

    Parameters
    ----------
    base_url : str
        Site root the image paths are relative to.
    imgs : list[str] or None
        Image src paths (e.g. from :func:`get_tag_attr`). Defaults to [].
    timeout : tuple[int, int]
        Inclusive (low, high) bounds, in seconds, for the random sleep.
    save : bool
        When True, download each image to the current directory under its
        basename.

    Returns
    -------
    list[str] or None
        Basenames of the images processed, or ``None`` on error
        (traceback printed).
    """
    # None default instead of a mutable [] default shared across calls
    imgs = [] if imgs is None else imgs
    try:
        image_names = []
        for src in imgs:
            time.sleep(random.randint(*timeout))
            img_name = src.split('/')[-1]
            image_names.append(img_name)
            if save:
                # Python 3 fix: urlretrieve moved to urllib.request
                # (urllib.urlretrieve was Python 2 only)
                urllib.request.urlretrieve(base_url + '/{}'.format(src), img_name)
                print('saved img as', img_name)
        return image_names
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment