@colmjude
Created May 22, 2020 15:11
Loop through a list of local authority sites and list any links to Twitter found on each.
#!/usr/bin/env python3
import os
import json
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd

def parse_page_for_twitter_links(url):
    """Fetch a page and return the hrefs of any anchors that mention Twitter."""
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        html = urlopen(req).read()
    except Exception as ex:
        # If the site is unreachable, carry on with an empty page
        html = ""
        print(ex)
    soup = BeautifulSoup(html, 'html.parser')
    return [a["href"] for a in soup.find_all("a", href=True) if "twitter" in a["href"]]
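
# Example (hypothetical URL and result, purely illustrative):
#   parse_page_for_twitter_links("https://www.example-council.gov.uk")
# might return ["https://twitter.com/ExampleCouncil"], or [] if the page
# is unreachable or carries no Twitter links.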

def make_wikidata_url(organisation):
    """Build the Wikidata page URL for an organisation record."""
    return f'https://wikidata.org/wiki/{organisation["wikidata"]}'
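
# Example (placeholder Wikidata ID, not taken from the register):
#   make_wikidata_url({'wikidata': 'Q12345'})  ->  'https://wikidata.org/wiki/Q12345'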

# Source register of organisations; can be overridden via the environment
organisation_csv = os.environ.get("organisation_csv", "https://raw.githubusercontent.com/digital-land/organisation-dataset/master/collection/organisation.csv")
org_pd = pd.read_csv(organisation_csv, sep=",")
# Round-trip through JSON so missing values become None rather than NaN
org_data = json.loads(org_pd.to_json(orient='records'))

# Collect Twitter links for the first 50 organisations that have a
# website but no Twitter handle recorded in the register
collected = []
count = 0
for organisation in org_data:
    if count < 50:
        if organisation.get('twitter') is None:
            if organisation.get('website') is not None:
                print(count, organisation.get('name'))
                collected.append({
                    'name': organisation.get('name'),
                    'wikidata': make_wikidata_url(organisation),
                    'twitters': parse_page_for_twitter_links(organisation['website'])
                })
                count = count + 1
        else:
            print(f'Already got it for {organisation["organisation"]}: {organisation["twitter"]}')

print(json.dumps(collected, indent=4, sort_keys=True))
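
# Usage note (illustrative; the script name and path are placeholders):
#   organisation_csv=local/organisation.csv python3 list_twitter_links.py
# By default the digital-land organisation.csv on GitHub is fetched.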