Skip to content

Instantly share code, notes, and snippets.

@kirussian911
Created September 12, 2018 21:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kirussian911/35b25890d6cd30faf8e69ed2a41665c1 to your computer and use it in GitHub Desktop.
Для форума
import csv
import os
from datetime import datetime
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
#https://gist.github.com/kirussian911/b87287fda5a112a8a35e38cfd8d55c2a
def get_html(url):
    """Download *url* and return the raw response body as bytes.

    Raises requests.HTTPError on 4xx/5xx responses so callers do not
    silently scrape an error page, and requests.Timeout if the server
    stalls (a hung connection would otherwise block a Pool worker
    forever).
    """
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.content
def get_page_count(html):
    """Return the highest page number shown in the shop's pagination bar."""
    soup = BeautifulSoup(html, 'html.parser')
    page_links = soup.find('div', class_='nav-links').find_all('a', class_='page-numbers')
    # The last "page-numbers" anchor is the "next page" arrow; the one
    # before it carries the highest page label, e.g. "Page 12".
    last_label = page_links[-2].text.replace('Page ', '')
    return int(last_label)
def get_all_links(html):
    """Collect the product-page URLs from one shop listing page."""
    soup = BeautifulSoup(html, 'lxml')
    grid = soup.find('div', {'class': 'blog-grid-wrap'})
    # Each product card exposes its link inside an <h2 class="entry-title">,
    # e.g. 'https://aliholic.com/product/wooden-pillow/'.
    return [heading.find('a').get('href')
            for heading in grid.find_all('h2', class_='entry-title')]
def get_page_data(html):
    """Scrape one product page into a flat dict of strings.

    Missing page elements degrade to '' instead of aborting the whole
    crawl. Example result:
    {'title': 'Vintage-style leather backpack', 'old_price': '35.99$',
     'now_price': '31.67$', 'links_ali': 'http://ali.ski/Jqhbc2',
     'categories': "Men's Fashion, All, PREMIUM"}
    """
    soup = BeautifulSoup(html, 'lxml')

    # Catch only AttributeError (raised when .find() returns None and we
    # chain .text / .get on it) — the original bare `except:` would also
    # have swallowed KeyboardInterrupt and real bugs.
    try:
        title = soup.find('h1', class_='product_title').text.strip()
    except AttributeError:
        title = ''

    try:
        old_price = soup.find('p', class_='price').find('del').text.strip()
    except AttributeError:
        old_price = ''

    # Discounted items show the old price in <del> and the new one in
    # <ins>; full-price items only have the plain amount span.
    try:
        now_price = soup.find('p', class_='price').find('ins').text.strip()
    except AttributeError:
        try:
            now_price = soup.find('p', class_='price').find(
                'span', class_='woocommerce-Price-amount').text.strip()
        except AttributeError:
            now_price = ''

    try:
        links_ali = soup.find('form', class_='cart').get('action')
    except AttributeError:
        links_ali = ''

    # The original first attempt (`find_all('rel').text`) always raised —
    # ResultSet has no .text — so categories were in practice always taken
    # from the plain span text. Keep that behavior, but return a str
    # instead of ascii-encoded bytes so the CSV no longer contains
    # b'...' literals.
    try:
        categories = (soup.find('span', class_='posted_in').text
                      .replace('Categories: ', '')
                      .replace('Category: ', '')
                      .strip())
    except AttributeError:
        categories = ''

    return {'title': title,
            'old_price': old_price,
            'now_price': now_price,
            'links_ali': links_ali,
            'categories': categories}
def write_csv(data):
    """Append one product row to ali_file.csv.

    Fixes over the original: the header row is written only when the
    file is new or empty (previously it was repeated before EVERY data
    row), and the file is opened with utf-8 + newline='' as the csv
    module requires, so the Russian header survives on any platform.
    """
    path = 'ali_file.csv'
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if need_header:
            writer.writerow(('Название', 'Старая цена', 'Цена', 'Ссылка', 'Категории'))
        writer.writerow((data['title'],
                         data['old_price'],
                         data['now_price'],
                         data['links_ali'],
                         data['categories']))
    # Progress indicator for the console while the pool is running.
    print(data['title'])
def make_all(url):
    """Full pipeline for a single product URL: fetch, parse, persist."""
    write_csv(get_page_data(get_html(url)))
def main():
    """Scrape every product linked from one shop listing page and time it.

    NOTE(review): only page 2 of the catalogue is crawled here;
    get_page_count exists but is never used to iterate all pages —
    presumably intentional for the forum example, confirm before reuse.
    """
    start = datetime.now()
    url = 'https://aliholic.com/shop/page/2/'
    all_links = get_all_links(get_html(url))
    # Two worker processes; each one fetches, parses and saves one product.
    with Pool(2) as p:
        p.map(make_all, all_links)
    # Report total wall-clock time for the crawl.
    print(str(datetime.now() - start))
# Run the crawler only when executed as a script (also required so the
# multiprocessing workers can re-import this module safely).
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment