Created
May 22, 2020 15:11
-
-
Save colmjude/7285eec6c9ed7ecd3c2dc04aae1884b7 to your computer and use it in GitHub Desktop.
Loop through a list of local authority websites and list the links to Twitter found on each site.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import json | |
from urllib.request import Request, urlopen | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
def parse_page_for_twitter_links(url, timeout=30):
    """Fetch a web page and return the hrefs of its links that mention Twitter.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            single unresponsive host cannot stall the caller indefinitely.

    Returns:
        A list with the ``href`` value of every ``<a>`` element whose href
        contains the substring ``"twitter"``. The list is empty when the page
        cannot be fetched; in that case the error is printed, not raised.
    """
    try:
        # Build the request inside the ``try``: ``Request`` itself raises
        # ValueError for a URL without a scheme (e.g. "www.example.com"),
        # and one malformed address should not abort the caller.
        # NOTE(review): the browser-like User-Agent is presumably there
        # because some sites reject urllib's default agent -- confirm.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response once it is read.
        with urlopen(req, timeout=timeout) as response:
            html = response.read()
    except Exception as ex:
        # Deliberately broad: this is a best-effort scrape, so any fetch
        # failure is reported and treated as "no links found".
        print(ex)
        return []

    soup = BeautifulSoup(html, 'html.parser')
    return [
        a["href"]
        for a in soup.find_all("a", href=True)
        if "twitter" in a["href"]
    ]
def make_wikidata_url(organisation):
    """Return the wikidata.org page URL for an organisation record.

    Args:
        organisation: Mapping that has a ``"wikidata"`` entry; its value is
            appended to the Wikidata base address.

    Returns:
        The URL as a string.

    Raises:
        KeyError: If the record has no ``"wikidata"`` key.
    """
    wikidata_id = organisation["wikidata"]
    return 'https://wikidata.org/wiki/{}'.format(wikidata_id)
# Where to read the organisation list from; the "organisation_csv"
# environment variable overrides the published dataset.
organisation_csv = os.getenv(
    "organisation_csv",
    "https://raw.githubusercontent.com/digital-land/organisation-dataset/master/collection/organisation.csv",
)
org_pd = pd.read_csv(organisation_csv, sep=",")
# Round-trip through JSON to get a list of plain dicts, one per CSV row.
# NOTE(review): presumably also relied on to turn empty cells into None so
# the ``is None`` checks below work -- confirm against pandas' to_json.
org_data = json.loads(org_pd.to_json(orient='records'))

collected = []
count = 0
for organisation in org_data:
    # Only the first 50 organisations that need scraping are processed;
    # nothing is printed or collected after that, so stop iterating.
    if not count < 50:
        break

    # Organisations that already have a twitter value are reported and skipped.
    if organisation.get('twitter') is not None:
        print(f'Already got it for {organisation["organisation"]}: {organisation["twitter"]}')
        continue

    # Without a website there is no page to scrape.
    if organisation.get('website') is None:
        continue

    print(count, organisation.get('name'))
    entry = {
        'name': organisation.get('name'),
        'wikidata': make_wikidata_url(organisation),
        'twitters': parse_page_for_twitter_links(organisation['website']),
    }
    collected.append(entry)
    count += 1

# Emit everything that was gathered as pretty-printed JSON on stdout.
print(json.dumps(collected, indent=4, sort_keys=True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment