colmjude/twitter.py

## twitter.py
#!/usr/bin/env python3

import os
import json
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd


def parse_page_for_twitter_links(url, timeout=30):
    """Fetch ``url`` and return the link targets on it that mention "twitter".

    The download is best-effort: if anything goes wrong while fetching, the
    exception is printed and the page is treated as empty, so the result is
    an empty list rather than an error.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the network before giving up, so that a
            single unresponsive site cannot stall the caller indefinitely.

    Returns:
        A list with the ``href`` value of every ``<a>`` element whose
        ``href`` contains the (case-sensitive) substring ``"twitter"``.
    """
    # A browser-like User-Agent is sent with the request
    # (presumably because some sites reject urllib's default one -- confirm).
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The context manager closes the HTTP response (and its socket) as
        # soon as the body has been read instead of leaking it.
        with urlopen(req, timeout=timeout) as response:
            html = response.read()
    except Exception as ex:
        # Deliberately broad: one unreachable or broken site must not abort
        # a batch run over many organisations.
        html = ""
        print(ex)

    soup = BeautifulSoup(html, 'html.parser')
    return [a["href"] for a in soup.find_all("a", href=True) if "twitter" in a["href"]]


def make_wikidata_url(organisation):
    """Build the wikidata.org page address for an organisation record.

    The record's ``"wikidata"`` value is interpolated as-is, so a value of
    ``None`` produces a URL ending in ``None``. A record without a
    ``"wikidata"`` key raises ``KeyError``.
    """
    wikidata_id = organisation["wikidata"]
    return 'https://wikidata.org/wiki/{}'.format(wikidata_id)


# Location of the organisation dataset; can be overridden through the
# "organisation_csv" environment variable.
organisation_csv = os.environ.get("organisation_csv", "https://raw.githubusercontent.com/digital-land/organisation-dataset/master/collection/organisation.csv")

# Upper bound on how many organisation websites are scraped in a single run.
MAX_LOOKUPS = 50

org_pd = pd.read_csv(organisation_csv, sep=",")
# Round-trip through JSON to get a list of plain dicts, one per CSV row.
# The `is None` checks below rely on empty cells being serialised as JSON
# null by `to_json`.
org_data = json.loads(org_pd.to_json(orient='records'))

# One entry per scraped organisation: its name, wikidata URL and the
# twitter-looking links found on its website.
collected = []

count = 0
for organisation in org_data:
    if count >= MAX_LOOKUPS:
        # Nothing is printed or collected once the limit is hit, so there is
        # no point walking the rest of the dataset.
        break

    if organisation.get('twitter') is not None:
        print(f'Already got it for {organisation["organisation"]}: {organisation["twitter"]}')
        continue

    if organisation.get('website') is None:
        # No website to scrape for this organisation.
        continue

    print(count, organisation.get('name'))
    collected.append({
        'name': organisation.get('name'),
        'wikidata': make_wikidata_url(organisation),
        'twitters': parse_page_for_twitter_links(organisation['website'])
    })
    count += 1

print(json.dumps(collected, indent=4, sort_keys=True))
	#!/usr/bin/env python3

	import os
	import json
	from urllib.request import Request, urlopen
	from bs4 import BeautifulSoup
	import pandas as pd


	def parse_page_for_twitter_links(url, timeout=30):
	    """Fetch ``url`` and return the link targets on it that mention "twitter".

	    The download is best-effort: if anything goes wrong while fetching, the
	    exception is printed and the page is treated as empty, so the result is
	    an empty list rather than an error.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait on the network before giving up, so that a
	            single unresponsive site cannot stall the caller indefinitely.

	    Returns:
	        A list with the ``href`` value of every ``<a>`` element whose
	        ``href`` contains the (case-sensitive) substring ``"twitter"``.
	    """
	    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
	    try:
	        # The context manager closes the HTTP response (and its socket) as
	        # soon as the body has been read instead of leaking it.
	        with urlopen(req, timeout=timeout) as response:
	            html = response.read()
	    except Exception as ex:
	        # Deliberately broad: one unreachable or broken site must not abort
	        # a batch run over many organisations.
	        html = ""
	        print(ex)

	    soup = BeautifulSoup(html, 'html.parser')
	    return [a["href"] for a in soup.find_all("a", href=True) if "twitter" in a["href"]]


	def make_wikidata_url(organisation):
	    """Return the wikidata.org page URL for an organisation record.

	    The record's ``"wikidata"`` value is interpolated as-is; a record
	    without that key raises ``KeyError``.
	    """
	    return f'https://wikidata.org/wiki/{organisation["wikidata"]}'


	# Location of the organisation dataset; can be overridden through the
	# "organisation_csv" environment variable.
	organisation_csv = os.environ.get("organisation_csv", "https://raw.githubusercontent.com/digital-land/organisation-dataset/master/collection/organisation.csv")

	# Upper bound on how many organisation websites are scraped in a single run.
	MAX_LOOKUPS = 50

	org_pd = pd.read_csv(organisation_csv, sep=",")
	# Round-trip through JSON to get a list of plain dicts, one per CSV row.
	# The `is None` checks below rely on empty cells being serialised as JSON
	# null by `to_json`.
	org_data = json.loads(org_pd.to_json(orient='records'))

	# One entry per scraped organisation: its name, wikidata URL and the
	# twitter-looking links found on its website.
	collected = []

	count = 0
	for organisation in org_data:
	    if count >= MAX_LOOKUPS:
	        # Nothing is printed or collected once the limit is hit, so there
	        # is no point walking the rest of the dataset.
	        break

	    if organisation.get('twitter') is not None:
	        print(f'Already got it for {organisation["organisation"]}: {organisation["twitter"]}')
	        continue

	    if organisation.get('website') is None:
	        # No website to scrape for this organisation.
	        continue

	    print(count, organisation.get('name'))
	    collected.append({
	        'name': organisation.get('name'),
	        'wikidata': make_wikidata_url(organisation),
	        'twitters': parse_page_for_twitter_links(organisation['website'])
	    })
	    count += 1

	print(json.dumps(collected, indent=4, sort_keys=True))