# Habr hub scraper, a gist by @lloss (created April 3, 2018)
# -*- coding: utf-8 -*-
import urllib.request
import urllib.error
import time
import csv
import re
import json
from tabulate import tabulate
from random import randint
from proxy_parser import get_proxy
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
import os.path
# json, tabulate and get_proxy are imported but never used below

# autolaunching script, prints program work time
# TODO add it here; it's not a standard-library module
# import timing


class AppURLopener(urllib.request.FancyURLopener):
    version = "Chrome/55.0 "
class Parser():
    def __init__(self, time_await):
        # upper bound in seconds for the random pause before each request
        self.time = time_await
        self.script_path = os.path.dirname(__file__)

    def open_url(self, url):
        time.sleep(randint(1, self.time))
        try:
            page = urllib.request.urlopen(url)
            page = page.read()
            # url[24:] strips the fixed scheme-and-host prefix to shorten the log line
            print("getting %s" % url[24:])
            return page
        except urllib.error.HTTPError:
            # back off and retry once before giving up on the URL
            time.sleep(randint(1, self.time))
            print("%s not found, trying one more time" % url)
            try:
                page = urllib.request.urlopen(url)
                page = page.read()
                print("getting %s" % url[24:])
                return page
            except urllib.error.HTTPError:
                print("second try failed")

    def threading_url_open(self, url_array):
        # one worker thread per URL; failed downloads come back as None
        pool = ThreadPool(len(url_array))
        results = pool.map(self.open_url, url_array)
        results = [x for x in results if x is not None]
        pool.close()
        pool.join()
        return results
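
# A hypothetical usage sketch for Parser on its own (the URLs are placeholders,
# not from the original gist): fetch a few pages concurrently, dropping failures.
#
#     parser = Parser(time_await=3)
#     pages = parser.threading_url_open([
#         "https://habrahabr.ru/hub/python/page1",
#         "https://habrahabr.ru/hub/python/page2",
#     ])
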
class Habr(Parser):
    def __init__(self, url_arr, time_await):
        super(Habr, self).__init__(time_await)
        self.main_hubs = url_arr

    def get_page_amount(self, hub):
        url = self.main_hubs[hub]["link"]
        soup = BeautifulSoup(self.open_url(url), "html.parser")
        nav_pages = soup.find("div", class_="page-nav")
        # the second <ul> in the nav block holds the numbered page links
        ul = nav_pages.find_all("ul")[1]
        # the last <li> links to the last page of the hub
        li = ul.find_all("li")[-1]
        a = li.find("a").get("href")
        # pull the page number out of the href; re.findall returns a list of
        # strings, e.g. ".../page95/" -> ["95"]
        parse_int = re.findall(r'\d+', a)
        page_amount = int(parse_int[0])
        self.main_hubs[hub].update({
            "page_amount": page_amount
        })

    def threading_page_amount(self):
        pool = ThreadPool(len(self.main_hubs))
        pool.map(self.get_page_amount, self.main_hubs)
        pool.close()
        pool.join()

    def parse_page(self, page):
        soup = BeautifulSoup(page, "html.parser")
        article_blocks = soup.find_all(
            'div', class_="post post_teaser shortcuts_item"
        )
        articles = []
        for article in article_blocks:
            # the rating counter is missing on some posts, so default to ""
            a_r = ""
            if article.find("span", class_="voting-wjt__counter-score"):
                a_r = article.find("span", class_="voting-wjt__counter-score").text
            articles.append({
                "article_title": article.find("a", class_="post__title_link").text,
                "article_link": article.find("a", class_="post__title_link").get("href"),
                "time_published": article.find("span", class_="post__time_published").text,
                "article_watches": article.find("div", class_="views-count_post").text,
                "article_rating": a_r
            })
        return articles
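
    # Each dict produced by parse_page has this shape (values illustrative,
    # inferred from the extraction above, not taken from a real page):
    #     {"article_title": "...", "article_link": "...",
    #      "time_published": "...", "article_watches": "...",
    #      "article_rating": "+12"}
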
    def threading_parse_page(self, pages):
        pool = ThreadPool(len(pages))
        results = pool.map(self.parse_page, pages)
        results = [x for x in results if x is not None]
        pool.close()
        pool.join()
        return results

    def save(self, pages, name):
        # assumes an articles/ directory exists next to the script
        p = os.path.join(self.script_path, 'articles/')
        # newline='' and an explicit encoding keep Python 3's csv module from
        # inserting blank rows or choking on the non-ASCII header
        with open("%s%s.csv" % (p, name), 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            # header row: article title, link, Habr rating, view count, publication date
            writer.writerow(('Название статьи', "Ссылка", "Рейтинг хабра",
                             'Кол-во просмотров', 'Дата написания'))
            for x in pages:
                writer.writerows(
                    (article["article_title"],
                     article["article_link"],
                     article['article_rating'],
                     article["article_watches"],
                     article["time_published"]) for article in x
                )

    def start_parsing(self):
        self.threading_page_amount()
        for hub in self.main_hubs:
            # page_amount is the number of the last page, so range has to run
            # up to page_amount + 1 or the last page is skipped
            urls = ["%spage%d" % (self.main_hubs[hub]["link"], n)
                    for n in range(1, self.main_hubs[hub]["page_amount"] + 1)]
            pages = self.threading_url_open(urls)
            parsed_content = self.threading_parse_page(pages)
            self.save(parsed_content, hub)
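

# A minimal driver sketch (not part of the original gist). The hub dict shape
# is inferred from start_parsing: each entry needs a "link" that a page number
# can be appended to. The URL is an assumption, and an articles/ directory is
# expected to exist next to the script.
if __name__ == "__main__":
    hubs = {
        "python": {"link": "https://habrahabr.ru/hub/python/"},
    }
    Habr(hubs, time_await=3).start_parsing()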