# -*- coding: utf-8 -*-
import urllib.request
import urllib.error
import time
import csv
import re
import os.path

from random import randint
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

# unused in this script; kept for reference
# import json
# from tabulate import tabulate
# from proxy_parser import get_proxy

# autolaunching script, prints program work time
# TODO add it here, it's not a default program
# import timing
class AppURLopener(urllib.request.FancyURLopener):
    # custom User-Agent string; note that urlopen() below does not use this
    # opener, so the class is currently unused
    version = "Chrome/55.0"

class Parser:
    def __init__(self, time_await):
        # maximum random delay (in seconds) between requests
        self.time = time_await
        self.script_path = os.path.dirname(__file__)

    def open_url(self, url):
        # random delay to avoid hammering the server
        time.sleep(randint(1, self.time))
        try:
            page = urllib.request.urlopen(url).read()
            print("getting %s" % url[24:])
            return page
        except urllib.error.HTTPError:
            # retry once after another random delay
            time.sleep(randint(1, self.time))
            print("%s not found, trying one more time" % url)
            try:
                page = urllib.request.urlopen(url).read()
                print("getting %s" % url[24:])
                return page
            except urllib.error.HTTPError:
                print("second try failed")
                return None

    def threading_url_open(self, url_array):
        # download all URLs concurrently, dropping the ones that failed
        pool = ThreadPool(len(url_array))
        results = pool.map(self.open_url, url_array)
        results = [x for x in results if x is not None]
        pool.close()
        pool.join()
        return results

class Habr(Parser):
    def __init__(self, url_arr, time_await):
        super().__init__(time_await)
        # dict of hubs: {hub_name: {"link": hub_url}, ...}
        self.main_hubs = url_arr

    def get_page_amount(self, hub):
        # find the last page number in the hub's pagination block
        url = self.main_hubs[hub]["link"]
        soup = BeautifulSoup(self.open_url(url), "html.parser")
        nav_pages = soup.find("div", class_="page-nav")
        ul = nav_pages.find_all("ul")[1]
        # last <li> in the pagination list
        li = ul.find_all("li")[-1]
        # href of the <a> inside it points at the last page
        a = li.find("a").get("href")
        # re.findall returns a list of all integers in the href;
        # the first one is the last page number
        parse_int = re.findall(r'\d+', a)
        page_amount = int(parse_int[0])
        self.main_hubs[hub].update({
            "page_amount": page_amount
        })

    def threading_page_amount(self):
        # resolve the page count of every hub concurrently
        pool = ThreadPool(len(self.main_hubs))
        pool.map(self.get_page_amount, self.main_hubs)
        pool.close()
        pool.join()

    def parse_page(self, page):
        # extract title, link, publish time, view count and rating
        # from every article teaser on the page
        soup = BeautifulSoup(page, "html.parser")
        article_blocks = soup.find_all(
            'div', class_="post post_teaser shortcuts_item"
        )
        articles = []
        for article in article_blocks:
            a_r = ""
            rating = article.find("span", class_="voting-wjt__counter-score")
            if rating:
                a_r = rating.text
            articles.append({
                "article_title": article.find("a", class_="post__title_link").text,
                "article_link": article.find("a", class_="post__title_link").get("href"),
                "time_published": article.find("span", class_="post__time_published").text,
                "article_watches": article.find("div", class_="views-count_post").text,
                "article_rating": a_r
            })
        return articles

    def threading_parse_page(self, pages):
        # parse all downloaded pages concurrently
        pool = ThreadPool(len(pages))
        results = pool.map(self.parse_page, pages)
        results = [x for x in results if x is not None]
        pool.close()
        pool.join()
        return results

    def save(self, pages, name):
        # write one CSV per hub into the articles/ directory next to the script
        p = os.path.join(self.script_path, 'articles/')
        os.makedirs(p, exist_ok=True)
        with open("%s%s.csv" % (p, name), 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(('Article title', 'Link', 'Habr rating', 'View count', 'Date published'))
            for x in pages:
                writer.writerows(
                    (article["article_title"],
                     article["article_link"],
                     article['article_rating'],
                     article["article_watches"],
                     article["time_published"]) for article in x
                )

    def start_parsing(self):
        # resolve page counts, then download, parse and save every hub
        self.threading_page_amount()
        for hub in self.main_hubs:
            # hub pages are addressed as <hub_link>page1 ... page<page_amount>
            urls = ["%spage%d" % (self.main_hubs[hub]["link"], n)
                    for n in range(1, self.main_hubs[hub]["page_amount"] + 1)]
            pages = self.threading_url_open(urls)
            parsed_content = self.threading_parse_page(pages)
            self.save(parsed_content, hub)
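

# Example usage: a minimal sketch of how Habr appears to be meant to be driven.
# The hub name and URL below are assumed placeholders; the original script does
# not show how the class is instantiated.
if __name__ == "__main__":
    hubs = {
        # hub name -> dict with the hub's front-page URL;
        # "page_amount" is filled in later by get_page_amount()
        "python": {"link": "https://habrahabr.ru/hub/python/"},
    }
    habr = Habr(hubs, time_await=5)
    habr.start_parsing()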