Skip to content

Instantly share code, notes, and snippets.

@nsfyn55
Created November 10, 2018 22:44
Show Gist options
  • Save nsfyn55/f63ebb64744bf76e16469fe1243e3015 to your computer and use it in GitHub Desktop.
Save nsfyn55/f63ebb64744bf76e16469fe1243e3015 to your computer and use it in GitHub Desktop.
Scrapes all of Media Bias Fact Check's links and outputs a CSV
import requests
import re
from bs4 import BeautifulSoup
import time
# Pages whose bias icon cannot be scraped from their markup; keyed by site name.
_BIAS_ICON_OVERRIDES = {
    "NDTV": "leftcenter06",
    "Knoxville News Sentinel": "leftcenter06",
    "Free Wheel Media": "leftcenter06",
    "Egyptian Streets": "leftcenter06",
    "The Times and Democrat": "leastbiased011",
}

# Pages whose source domain cannot be scraped from their markup; keyed by site name.
_DOMAIN_OVERRIDES = {
    "Elko Daily Free Press": "https://elkodaily.com/",
    "American Enterprise Institute": "https://www.aei.org/",
    "Act.TV": "http://act.tv",
    "The Fucking News": "http://thefingnews.com/",
    "Kyiv Post": "https://www.kyivpost.com/",
    "Philadelphia Tribune": "http://www.phillytrib.com/",
    "Reuters": "http://www.reuters.com/",
    "South Bend Tribune": "https://www.southbendtribune.com/",
    "ConservativeOpinion.com": "https://conservativeopinion.com/",
}

# Icon slug from the image URL, e.g. ".../leftcenter06.png" -> "leftcenter06".
# Fixed from the original "[a-zA-z0-9]", which also matched '[', '\', ']',
# '^', '_' and '`' (the ASCII range between 'Z' and 'a').
_ICON_RE = re.compile(r"http.*/([a-zA-Z0-9]*)\.png")


def _emphasized_text(container):
    """Return the stripped text of *container*'s first <strong>, <b>, or
    <span> child, or None when it has none of those tags."""
    for tag_name in ("strong", "b", "span"):
        found = container.find_all(tag_name)
        if found:
            return found[0].text.strip('\n\r')
    return None


def _find_rating(soup):
    """Find the factual-reporting rating on the page.

    Scans <p> tags first, then <div> tags, for a "Factual News:" /
    "Factual Reporting:" label; within a pass the last match wins
    (preserving the original scraper's behavior). Returns None if absent.
    """
    for tag_name in ("p", "div"):
        rating = None
        for container in soup.find_all(tag_name):
            text = container.text
            if "Factual News:" in text or "Factual Reporting:" in text:
                value = _emphasized_text(container)
                if value is not None:
                    rating = value
        if rating:
            return rating
    return None


def _find_domain(soup, site_name):
    """Return the outbound source link from the page body, or None.

    Only anchors whose parent text mentions "Sources:"/"Source:"/"Notes:"
    qualify; wikipedia links are skipped except on the Wikipedia page
    itself. The last qualifying href wins (original behavior).
    """
    domain = None
    content = soup.find_all("div", class_="main")[0]
    for anchor in content.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        if site_name != "Wikipedia" and "wikipedia" in href:
            continue
        parent_text = anchor.parent.text
        if ("Sources:" in parent_text or "Notes:" in parent_text
                or "Source:" in parent_text):
            domain = href
    return domain


def parse_page(link):
    """Scrape one mediabiasfactcheck.com page.

    Args:
        link: URL of the page to fetch.

    Returns:
        (site_name, domain, bias_icon, factual_reporting_rating) tuple of
        strings, or None when the page returns HTTP 404.

    Raises:
        Exception: when no factual-reporting rating is found ("no mas",
            preserving the original message), or when no source domain
            link is present (the original code hit a NameError here).
    """
    resp = requests.get(link)
    if resp.status_code == 404:
        return None
    soup = BeautifulSoup(resp.content, 'html.parser')

    site_name = soup.find_all("h1", class_="page-title")[0].text.strip(' \t\n\r')

    bias_icon = _BIAS_ICON_OVERRIDES.get(site_name)
    if bias_icon is None:
        # The first <img> inside the main container is the bias-rating icon.
        imgs = soup.find_all("div", class_="container mh-mobile")[0].find_all("img")[:1]
        bias_icon = _ICON_RE.match(imgs[0]["src"]).group(1)

    factual_reporting_rating = _find_rating(soup)
    if not factual_reporting_rating:
        raise Exception("no mas")

    domain = _DOMAIN_OVERRIDES.get(site_name)
    if domain is None:
        domain = _find_domain(soup, site_name)
        if domain is None:
            # Originally this fell through to an unbound-variable NameError;
            # fail with an explicit, diagnosable message instead.
            raise Exception("no source domain found for %s" % site_name)

    return site_name, domain, bias_icon, factual_reporting_rating
# Driver: read one page URL per line from "all-links" and append one CSV
# row per successfully scraped page to "final-result". Pages that 404
# (parse_page returns None) are skipped.
with open("all-links", "r") as links_file, open("final-result", "a") as final_result:
    for link in links_file:
        res = parse_page(link.strip('\t\n\r'))
        if res:
            print(res)
            final_result.write(",".join(res) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment