@vinovator
Created April 29, 2016 13:50
Crawl a page and recursively extract all URLs within the same domain
# crawler.py
# Python 2.7.6
"""
Crawl a page and extract all urls recursively within same domain
"""
from BeautifulSoup import BeautifulSoup
from selenium import webdriver
from urlparse import urlparse
import pandas as pd
import openpyxl as op
parent_url = "https://automatetheboringstuff.com/"
domain = urlparse(parent_url).netloc
xl_name = "Crawler.xlsx"
sheet_name = "URLs"
def crawl_urls(url_list, crawled_urls, driver, url):
    """ Get a set of urls and crawl each url recursively """
    # Once the url is parsed, add it to the crawled url list
    crawled_urls.append(url)

    driver.get(url)
    html = driver.page_source.encode("utf-8")
    soup = BeautifulSoup(html)
    urls = soup.findAll("a")

    # Even if a url is not part of the same domain, it is still collected,
    # but urls outside the domain are never parsed
    for a in urls:
        if (a.get("href")) and (a.get("href") not in url_list):
            url_list.append(a.get("href"))

    # Recursively parse each url within the same domain
    for page in set(url_list):  # set to remove duplicates
        # Check if the url belongs to the same domain
        # and ignore it if it has already been parsed
        if (urlparse(page).netloc == domain) and (page not in crawled_urls):
            # print page
            crawl_urls(url_list, crawled_urls, driver, page)

    # Once all urls are crawled, return both lists to the calling function
    return crawled_urls, url_list
def load_to_excel(lst):
    """
    Load the list into an excel file using pandas
    """
    # Load the list into a dataframe
    df = pd.DataFrame(lst)
    df.index += 1  # So that the index column in excel starts from 1

    # Write the dataframe to excel
    xlw = pd.ExcelWriter(xl_name)
    df.to_excel(xlw, sheet_name=sheet_name, index_label="#", header=["URL"])
    xlw.save()
def format_excel(xl, sheet="Sheet1"):
    """
    Get the excel file path and format the file.
    If no sheet name is passed, Sheet1 is used by default.
    """
    # Open the excel file
    wb = op.load_workbook(xl)
    ws = wb.get_sheet_by_name(sheet)

    # Freeze panes
    ws.freeze_panes = "B2"

    # Adjust column widths
    cols = ("A", "B")
    widths = (5, 80)
    for combo in zip(cols, widths):
        ws.column_dimensions[combo[0]].width = combo[1]

    # Define colour formatting for the header row
    blue_fill = op.styles.PatternFill(start_color="00aadd",
                                      fill_type='solid')
    # Define border style
    thin_border = op.styles.borders.Border(left=op.styles.Side(style='thin'),
                                           right=op.styles.Side(style='thin'),
                                           top=op.styles.Side(style='thin'),
                                           bottom=op.styles.Side(style='thin'))
    # Define text wrap
    text_wrap = op.styles.Alignment(wrap_text=True)

    # Format the header row
    for row in range(1, 2):  # Loop only over the 1st row
        for col in range(1, ws.max_column + 1):  # Loop through all columns
            ws.cell(row=row, column=col).fill = blue_fill

    # Format all cells
    for row in ws.iter_rows():
        for cell in row:
            # Draw borders
            cell.border = thin_border
            # Wrap text in all columns
            cell.alignment = text_wrap

    # Save back under the same file name
    wb.save(xl)
if __name__ == "__main__":
    """ Starting block """
    driver = webdriver.Firefox()

    url_list = list()
    crawled_urls = list()
    url_list.append(parent_url)

    # Initiate the crawling by passing the beginning url
    crawled_urls, url_list = crawl_urls(url_list, crawled_urls,
                                        driver, parent_url)
    # Finally quit the browser
    driver.quit()

    print "FULL URLs LIST"
    print len(set(url_list))

    print "CRAWLED URLs LIST"
    print len(set(crawled_urls))

    # Load the url list to excel
    load_to_excel(url_list)

    # Format the excel file
    format_excel(xl_name, sheet_name)
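A quick way to sanity-check the workbook after a run (a hypothetical snippet, not part of the gist; it assumes pandas can read back the Crawler.xlsx file the script just wrote):

# check_output.py (hypothetical helper, Python 2 to match the gist)
import pandas as pd

# Read back the sheet the crawler wrote and print a quick summary
df = pd.read_excel("Crawler.xlsx", "URLs")
print df.head()
print "%d URLs written" % len(df)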
@Treebug842

When I ran it, it couldn't import BeautifulSoup. Where would you download this?

@sephi0485

When I ran it, it couldn't import BeautifulSoup. Where would you download this?

https://pypi.org/project/beautifulsoup4/
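For reference, the gist imports the legacy BeautifulSoup 3 package (from BeautifulSoup import BeautifulSoup). With beautifulsoup4 installed via pip install beautifulsoup4, the equivalent calls would look roughly like this (a minimal sketch of the bs4 API, not a change made by the gist author; the html string is sample markup, not taken from the crawler):

# bs4 variant (sketch)
from bs4 import BeautifulSoup

html = "<a href='https://automatetheboringstuff.com/'>home</a>"
soup = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
links = soup.find_all("a")                 # bs4 spelling of BS3's findAll
print [a.get("href") for a in links]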
