@limitpointinf0
Created April 27, 2018 04:39
Scrape Assistant
from bs4 import BeautifulSoup
import sys
import requests
import re
import string
import time
import random
import urllib.request  # urlretrieve lives in urllib.request under Python 3
import traceback
"""
functions for retreiving and navigating through the soup by level.
"""
def get_soup(u):
    """return the soup"""
    try:
        r = requests.get(u)
        html = r.text
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
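# Usage sketch for get_soup. The URL below is a placeholder for illustration,
# not something taken from the gist; get_soup returns None when the request or
# parsing fails, so callers should check for that.
def _demo_get_soup():
    soup = get_soup('https://example.com')
    if soup is not None:
        print(soup.title.string if soup.title else 'no <title> element')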
def nav_tags(int_soup, tags=[], level=1):
    """Navigate the soup level by level; tags must name one tag per level."""
    try:
        if len(tags) == level:
            int_soup = int_soup.findAll(tags[0])
            if level > 1:
                for i in range(1, level):
                    # Flatten the matches from the previous level before descending.
                    int_soup = sum([x.findAll(tags[i]) for x in int_soup], [])
            return int_soup
        else:
            print('length of tags does not equal number of levels')
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
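# Usage sketch for nav_tags. It drills down one level per entry in tags, so the
# list length must equal level; the tag names below ('div', 'a') are
# illustrative assumptions, not selectors from the original gist.
def _demo_nav_tags():
    soup = get_soup('https://example.com')
    if soup is not None:
        # Collect every <a> that sits inside a <div>, two levels deep.
        links = nav_tags(soup, tags=['div', 'a'], level=2)
        print(len(links) if links is not None else 'no match')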
def get_text(soups=[]):
    """Return all text nodes found inside a list of soup elements."""
    try:
        return sum([x.findAll(text=True) for x in soups], [])
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
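# Usage sketch combining nav_tags and get_text: pull the text nodes out of the
# elements found by nav_tags. The 'p' tag and URL are illustrative assumptions.
def _demo_get_text():
    soup = get_soup('https://example.com')
    if soup is not None:
        paragraphs = nav_tags(soup, tags=['p'], level=1)
        if paragraphs:
            print(get_text(soups=paragraphs))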
"""
The following functions are useful for scraping image src
and saving to file.
"""
def get_tag_attr(url, tag='img', attr='src'):
    """return the text from tag attributes"""
    try:
        content_list = []
        r = requests.get(url)
        html = r.text
        soup = BeautifulSoup(html, 'html5lib')
        for i in soup.findAll(tag):
            attr_content = i.get(attr)
            content_list.append(attr_content)
            print(attr_content)
        return content_list
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
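# Usage sketch for get_tag_attr: list every image src on a page. The same call
# works for other attributes, e.g. tag='a', attr='href'. The URL is a
# placeholder assumption.
def _demo_get_tag_attr():
    srcs = get_tag_attr('https://example.com', tag='img', attr='src')
    if srcs:
        print('found {} image sources'.format(len(srcs)))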
def iterate_img_list(base_url, imgs=[], timeout=(5, 10), save=False):
    """Iterate over a list of image paths from the webpage and optionally save
    each one, sleeping a random number of seconds (within the timeout range)
    between downloads.
    """
    try:
        image_names = []
        for c in imgs:
            time.sleep(random.randint(*timeout))
            img_name = c.split('/')[-1]
            image_names.append(img_name)
            if save:
                urllib.request.urlretrieve(base_url + '/{}'.format(c), img_name)
                print('saved img as', img_name)
        return image_names
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ex = traceback.format_exception(exc_type, exc_value, exc_traceback)
        print('///'.join(ex))
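# Usage sketch chaining get_tag_attr and iterate_img_list: collect the image
# src values from a page, then download each one with a random pause between
# requests. The base URL is a placeholder assumption; set save=True only when
# you actually want files written to the working directory.
def _demo_iterate_img_list():
    base = 'https://example.com'
    srcs = get_tag_attr(base, tag='img', attr='src')
    if srcs:
        names = iterate_img_list(base, imgs=srcs, timeout=(1, 3), save=False)
        print(names)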