Skip to content

Instantly share code, notes, and snippets.

@StarJade-Park
Created May 4, 2017 05:14
Show Gist options
  • Save StarJade-Park/d9c6382149d45ff8241edd8605fb3171 to your computer and use it in GitHub Desktop.
Save StarJade-Park/d9c6382149d45ff8241edd8605fb3171 to your computer and use it in GitHub Desktop.
roughly crawler
from bs4 import BeautifulSoup
from progressbar import Bar, SimpleProgress, Percentage, ProgressBar
import urllib
# Base URL of the blog; spider() appends the page number to build each URL.
blog1 = "blog_url"
# Temporary hard-coded inputs: first page to crawl and the value passed to
# spider() as its last page.
input1, input2 = 1, 41
def replaceTxt(f, txt):
    """Write *txt* to *f* in escaped form if it contains a ``#``.

    When ``txt`` contains ``#``, every underscore in it is backslash-escaped,
    a single backslash is prepended to the whole string, and the result is
    written to ``f``. Text without ``#`` is not written here; the caller is
    expected to handle it.

    Args:
        f: Writable text stream (any object with a ``write(str)`` method).
        txt: Text fragment to inspect.

    Returns:
        True if the escaped text was written to ``f``, False if nothing was
        written.
    """
    # Membership test instead of `find(...) is not -1`: identity comparison
    # against an int literal only works by accident of small-int caching.
    if "#" not in txt:
        return False
    f.write("\\" + txt.replace("_", "\\_"))
    return True
def _write_page(f, soup):
    """Write the titles and paragraph text found in *soup* to *f*."""
    # title
    for link in soup.select('h2 > a'):
        title = link.string
        if title is None:
            # .string is None when the anchor contains nested tags; fall back
            # to the concatenated text instead of failing on "#" + None.
            title = link.get_text()
        f.write("#" + title + "\n")
    # paragraph
    for paragraph in soup.select('p'):
        # case: p > span
        for span in paragraph.select('span'):
            txt = span.string
            if txt is None:
                continue
            if not replaceTxt(f, txt):
                f.write(txt)
        # NOTE(review): the original indentation was lost; the newline is
        # assumed to be written once per paragraph, after its spans - confirm.
        f.write("\n")
        # case: p
        # NOTE(review): for a <p> wrapping a single <span>, .string yields the
        # span's text again, so it is written twice - confirm this is intended.
        txt = paragraph.string
        if txt is None:
            continue
        if not replaceTxt(f, txt):
            f.write(txt)


def spider(max_pages):
    """Crawl blog pages ``input1`` .. ``max_pages`` and save each as Markdown.

    For every page number the URL ``blog1 + str(page)`` is fetched, parsed
    with BeautifulSoup, and its titles (``h2 > a``) and paragraph text are
    written to ``<page>.md`` in the current working directory. Pages that
    cannot be fetched are collected and printed at the end instead of
    aborting the crawl.

    Args:
        max_pages: Last page number to crawl (inclusive).
    """
    # Imported locally so the function does not rely on ``import urllib``
    # having loaded the ``urllib.request`` submodule as a side effect.
    import http.client
    import urllib.request

    # virtual headers
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1;)'
    headers = {'User-Agent': user_agent}
    # progress bar widgets
    widgets = ['Running: ', Percentage(), ' ',
               Bar(marker='#', left='[', right=']'),
               ' ', SimpleProgress()]
    fail_list = []
    pbar = ProgressBar(widgets=widgets, maxval=max_pages).start()
    for page in range(input1, max_pages + 1):
        url = blog1 + str(page)
        try:
            # Building the request can itself raise ValueError on a malformed
            # URL, so it belongs inside the try as well.
            request_url = urllib.request.Request(url, None, headers)
            with urllib.request.urlopen(request_url) as url_open:
                source_code = url_open.read()
        except (OSError, ValueError, http.client.HTTPException):
            # Best-effort crawl: remember the failure and move on. Unlike a
            # bare ``except``, this lets Ctrl-C / SystemExit stop the run.
            fail_list.append(url)
        else:
            soup = BeautifulSoup(source_code, 'lxml')
            # Explicit UTF-8 so page text outside the platform's default
            # encoding does not make the write fail.
            with open(str(page) + ".md", "w", encoding="utf-8") as f:
                _write_page(f, soup)
        # Advance the bar for failed pages too, so it reflects pages handled.
        pbar.update(page)
    print("\ncrawling is complete.")
    print("fail pages:", fail_list)
# Script entry point: crawl pages input1 through input2. This call sits at
# module level, so it also runs when the file is imported.
spider(input2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment