Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active September 6, 2020 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/28fc72988029f1aa560af32772ac9655 to your computer and use it in GitHub Desktop.
Save edsu/28fc72988029f1aa560af32772ac9655 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
import csv
import sys
import random
import collections
import requests_html
# Shared requests_html session, reused for every page fetch in this run.
http = requests_html.HTMLSession()
# Columns appended to the output CSV; get_social() returns a dict with
# exactly these keys.
new_fields = ["twitter", "facebook", "instagram", "youtube", "rss"]
def main():
    """Augment a CSV of sites with social media profile columns.

    Usage: script.py <input.csv> <url_column_name>

    Reads the input CSV, looks up social media links for the URL found in
    the named column of each row, and writes an augmented copy next to the
    input as <input>-new.csv.
    """
    if len(sys.argv) != 3:
        sys.exit("usage: {} <csv_file> <url_column>".format(sys.argv[0]))
    csv_file = sys.argv[1]
    url_col = sys.argv[2]
    new_csv_file = csv_file.replace('.csv', '-new.csv')
    # Context managers guarantee both files are closed even if a lookup
    # raises; newline='' is what the csv module expects so it can manage
    # line endings itself.
    with open(csv_file, newline='') as infile, \
         open(new_csv_file, "w", newline='') as outfile:
        in_csv = csv.DictReader(infile)
        fieldnames = in_csv.fieldnames + new_fields
        out_csv = csv.DictWriter(outfile, fieldnames=fieldnames)
        out_csv.writeheader()
        for row in in_csv:
            url = row[url_col]
            info = get_social(url)
            row.update(info)
            out_csv.writerow(row)
            # Echo progress so long runs are observable.
            print(row)
def get_social(url):
    """Fetch *url* and extract the site's social media profile links.

    Returns a dict keyed by ``new_fields`` ("twitter", "facebook",
    "instagram", "youtube", "rss"); each value is a profile/feed URL or
    None when not found or when the page could not be fetched.
    """
    no_result = {k: None for k in new_fields}
    # BUG FIX: the emptiness check must run BEFORE the scheme is prepended;
    # previously an empty cell became the bogus URL "http://" and the
    # "if not url" guard was dead code.
    if not url:
        return no_result
    if not url.startswith('http'):
        url = 'http://' + url
    try:
        doc = http.get(url, timeout=10)
    except Exception:
        # Deliberately best-effort: any network/parse failure for one site
        # yields empty columns rather than aborting the whole run.
        return no_result
    return {
        "twitter": get_meta(doc, 'twitter:creator') or find_url(doc, r".*twitter.com/(?:#!/)?([a-z0-9_]+).*", "https://twitter.com/", ["intent"]),
        "facebook": find_url(doc, r".*facebook.com/([a-z0-9_]+)/?$", "https://www.facebook.com/"),
        "instagram": find_url(doc, r".*instagram.com/([a-z0-9_]+)/?$", "https://www.instagram.com/"),
        "youtube": find_url(doc, r".*youtube.com/user/([a-z0-9_]+)/?$", "https://www.youtube.com/user/"),
        "rss": get_rss(doc)
    }
def get_meta(doc, name):
    """Return the content of the page's <meta name=*name*> tag, or None.

    A leading '@' handle is rewritten into a full twitter.com URL (the tag
    looked up in practice is twitter:creator).
    """
    meta = doc.html.find('meta[name="{}"]'.format(name), first=True)
    # .get() avoids a KeyError on a malformed tag with no content= attribute.
    if meta and meta.attrs.get('content'):
        # TODO: make not twitter specific
        return meta.attrs['content'].replace('@', 'https://twitter.com/')
    else:
        return None
def find_url(doc, pattern, prefix, ignore=()):
    """Return the most-linked account on the page as *prefix* + name.

    *pattern* is matched case-insensitively against every <a href> and must
    capture the account name in group 1.  Names listed in *ignore* are
    skipped (e.g. twitter.com/intent).  Returns None when nothing matches.

    The default for *ignore* is an immutable tuple — the original used a
    mutable default list, a classic Python pitfall.
    """
    # The account linked most often is assumed to be the site's own profile.
    accounts = collections.Counter()
    for a in doc.html.find("a[href]"):
        m = re.match(pattern, a.attrs["href"], re.IGNORECASE)
        if m and m.group(1) not in ignore:
            accounts[m.group(1)] += 1
    if accounts:
        # most_common(1) avoids sorting the entire counter just for the top hit.
        return prefix + accounts.most_common(1)[0][0]
    return None
def get_rss(doc):
    """Return the first alternate-feed URL advertised in the page <head>,
    or None when the page declares no usable feed.

    Feed links whose URL contains "comments" are skipped so a blog's
    comments feed is not mistaken for its main feed.
    """
    for candidate in doc.html.find('head link[rel="alternate"]'):
        if 'href' not in candidate.attrs:
            continue
        feed_url = candidate.attrs['href']
        if 'comments' in feed_url:
            continue
        return feed_url
    return None
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment