Skip to content

Instantly share code, notes, and snippets.

@allatambov
Last active March 15, 2022 15:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allatambov/a9853d3deff299fac9d706c960a5e8f7 to your computer and use it in GitHub Desktop.
Save allatambov/a9853d3deff299fac9d706c960a5e8f7 to your computer and use it in GitHub Desktop.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Download the front page and collect the links to its news articles.
# The timeout keeps the script from hanging forever on an unresponsive
# server; raise_for_status stops us from parsing an HTTP error page.
page = requests.get("http://nplus1.ru/", timeout=30)
page.raise_for_status()

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed and emits GuessedAtParserWarning.
soup = BeautifulSoup(page.text, "html.parser")

# href=True skips anchors that carry no href attribute; indexing such a
# tag with link["href"] would raise KeyError.
links_raw = soup.find_all("a", href=True)
links_all = [link["href"] for link in links_raw]

# Keep only the links whose address contains the news section path.
news = [link for link in links_all if "/news/" in link]

# Resolve every link against the site root. urljoin prefixes site-relative
# paths and leaves already-absolute URLs untouched, so an absolute href is
# not mangled into "https://nplus1.ruhttps://...".
news_full = [urljoin("https://nplus1.ru", n) for n in news]
# Worked example: download the first collected news article and pull its
# fields apart step by step.
link0 = news_full[0]

# Bound the wait with a timeout and refuse to parse an HTTP error page.
page0 = requests.get(link0, timeout=30)
page0.raise_for_status()

# Explicit parser: avoids GuessedAtParserWarning and keeps the parse tree
# the same regardless of which optional parsers are installed.
soup0 = BeautifulSoup(page0.text, "html.parser")

# Page title and the values stored in <meta> tags.
title = soup0.find_all("title")[0].text
date = soup0.find("meta", {"itemprop": "datePublished"})["content"]
author = soup0.find("meta", {"name": "mediator_author"})["content"]
desc = soup0.find("meta", {"name": "description"})["content"]

# The <p class="table"> elements are read by position:
# [0] holds the rubric links, [1] the time, [2] the difficulty value.
tabs = soup0.find_all("p", {"class": "table"})
rubs_raw = tabs[0].find_all("a")
rubs_str = [r.text for r in rubs_raw]
rubrics = " ".join(rubs_str)
ntime = tabs[1].find("span").text
diffc = tabs[2].find("span", {"class": "difficult-value"}).text

# The article body is taken from the <p> tags that have no class attribute.
pars_raw = soup0.find_all("p", {"class": None})
pars_str = [p.text for p in pars_raw]
text = " ".join(pars_str)

# Replace non-breaking spaces with ordinary ones.
text = text.replace("\xa0", " ")

# Drop everything from the "Found a typo?" footer marker onwards.
text_final = text.split("Нашли опечатку?")[0]
def _child_text(parent, name, attrs=None):
    """Return the text of the first matching descendant of *parent*, or None."""
    if parent is None:
        return None
    child = parent.find(name, attrs or {})
    return child.text if child is not None else None


def _meta_content(soup, attrs):
    """Return the ``content`` of the first matching <meta> tag, or None."""
    tag = soup.find("meta", attrs)
    return tag.get("content") if tag is not None else None


def get_info(link0):
    """Download one news article page and extract its metadata and text.

    The selectors match the markup this script scrapes from nplus1.ru.

    Args:
        link0: Absolute URL of the article page.

    Returns:
        A tuple ``(title, author, desc, date, ntime, rubrics, diffc,
        text_final)``. A field whose tag is absent from the page is
        returned as ``None`` instead of raising; ``text_final`` is always
        a string (possibly empty).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Bound the wait with a timeout and refuse to parse an HTTP error page.
    page0 = requests.get(link0, timeout=30)
    page0.raise_for_status()

    # Explicit parser: avoids GuessedAtParserWarning and keeps the parse
    # tree the same regardless of which optional parsers are installed.
    soup0 = BeautifulSoup(page0.text, "html.parser")

    title = _child_text(soup0, "title")
    date = _meta_content(soup0, {"itemprop": "datePublished"})
    author = _meta_content(soup0, {"name": "mediator_author"})
    desc = _meta_content(soup0, {"name": "description"})

    # The <p class="table"> elements are read by position: rubric links,
    # then time, then difficulty. Pad with None so a page with fewer than
    # three of them yields None fields rather than an IndexError.
    tabs = list(soup0.find_all("p", {"class": "table"})[:3])
    tabs += [None] * (3 - len(tabs))
    rubrics_tab, time_tab, diff_tab = tabs

    if rubrics_tab is None:
        rubrics = None
    else:
        rubrics = " ".join(a.text for a in rubrics_tab.find_all("a"))
    ntime = _child_text(time_tab, "span")
    diffc = _child_text(diff_tab, "span", {"class": "difficult-value"})

    # The article body is taken from the <p> tags that have no class
    # attribute; non-breaking spaces are replaced with ordinary ones.
    paragraphs = soup0.find_all("p", {"class": None})
    text = " ".join(p.text for p in paragraphs).replace("\xa0", " ")

    # Drop everything from the "Found a typo?" footer marker onwards.
    text_final = text.split("Нашли опечатку?")[0]

    return title, author, desc, date, ntime, rubrics, diffc, text_final
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment