Skip to content

Instantly share code, notes, and snippets.

@felix-d
Created March 14, 2015 17:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save felix-d/bf0e68a4fa0e9c17d95b to your computer and use it in GitHub Desktop.
Mathieu Bock Cote Scraper
import requests
import threading
from bs4 import BeautifulSoup
from Queue import Queue
import re
import time
# Number of blog index pages to crawl (queued as 0 .. max_page-1).
max_page = 230
# Worker threads pulling page indices off the queue.
num_workers = 200
# Splits text into sentence-like chunks ending in . ! or ?.
# NOTE(review): the character class [^.!?(\n)+] also excludes literal
# '(', ')' and '+' — presumably r'[^.!?\n]+' was intended; confirm
# against sample output before changing.
sentence_regex = re.compile(r'([^.!?(\n)+]+[.!?]+)')
# Base URL of the paginated blog index; page number is appended.
root = r'http://www.journaldemontreal.com/blogues/mathieu-bock-cote/page/'
# Shared results: one dict (length/url/sen) per article's longest sentence.
longests = []
# Work queue of page indices; workers call task_done() per item.
q = Queue()
# Serializes console output across worker threads.
lock = threading.Lock()
def get_longest():
    """Worker loop: crawl blog index pages and record each linked
    article's longest sentence.

    Pulls page indices from the shared queue ``q`` forever; for every
    'read-more' link on an index page, fetches the article, strips the
    reader-comment divs, and appends a dict with keys ``length`` (str),
    ``url`` and ``sen`` to the shared ``longests`` list.  Runs as a
    daemon thread and never returns; errors are reported and the queue
    item is marked done regardless.
    """
    i = None
    while True:
        try:
            i = q.get()
            with lock:  # 'with' releases the lock even if print raises
                print("Crawling page {}".format(i))
            url = root + str(i)
            current_page_text = requests.get(url).text
            soup = BeautifulSoup(current_page_text, 'html5')
            for link in soup.find_all('a', attrs={'class': 'read-more'}):
                article = requests.get(link['href'])
                article_soup = BeautifulSoup(article.text, 'html5')
                article_main_text = article_soup.find('article')
                # Drop reader comments so they can't win "longest sentence";
                # a plain loop replaces the side-effect-only comprehension.
                for c in article_main_text('div',
                                           attrs={'class': 'wp-comment-body'}):
                    c.extract()
                matches = sentence_regex.findall(article_main_text.get_text())
                max_s = max(matches, key=len)
                longests.append(dict(length=str(len(max_s)),
                                     url=link['href'],
                                     sen=max_s))
        except Exception:
            # Was a bare 'except:', which also swallowed SystemExit and
            # KeyboardInterrupt; narrow it so the process stays killable.
            with lock:
                print("Error while crawling page {}".format(i))
        finally:
            q.task_done()
def main():
    """Spawn the worker pool, enqueue all page indices, wait for the
    crawl to finish, then write the results (sorted by sentence length)
    to bock.txt.
    """
    global longests
    # Daemon threads die with the main thread once the queue is drained.
    for _ in range(num_workers):
        t = threading.Thread(target=get_longest)
        t.daemon = True
        t.start()
    for i in range(max_page):
        time.sleep(0.04)  # stagger requests slightly to be polite
        q.put(i)
    q.join()
    # BUG FIX: 'length' is stored as a string, so the original sort was
    # lexicographic ("9" sorted after "100"); compare numerically.
    longests = map(lambda x: "LENGTH: {}\nURL: {}\n{}"
                   .format(
                       x['length'],
                       x['url'],
                       x['sen'].encode('utf-8')),
                   sorted(longests, key=lambda x: int(x['length'])))
    # 'with' guarantees the file is closed even if the write fails.
    with open('bock.txt', 'w') as f:
        f.write("\n\n".join(longests))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment