# emsesc/twitter_handles.py

## twitter_handles.py
import csv
import urllib.request
import re
import time
# Rows read from input.csv; element 0 of each row is used as the website
# address (it is prefixed with "http://" when fetched).
rows = []

def getHandle(pattern, html):
    """Return item 1 of the first ``re.findall`` result for ``pattern`` in ``html``.

    With the two-group Twitter pattern used by this script, item 1 is the
    inner capture group, i.e. the handle without the URL prefix.

    Args:
        pattern: Regular expression to search with.
        html: Text to search.

    Returns:
        Element 1 of the first match returned by ``re.findall``.

    Raises:
        IndexError: If ``pattern`` does not match anywhere in ``html``.
    """
    matches = re.compile(pattern).findall(html)
    first_match = matches[0]
    return first_match[1]

# Read the website URLs from input.csv into ``rows``.
# newline="" is what the csv module documents for files passed to csv.reader,
# so quoted fields that contain line breaks are parsed correctly.
with open("input.csv", "r", newline="") as file:
    # csv.reader yields [] for blank lines; skip them because every later
    # step indexes element 0 of each row.
    rows.extend(row for row in csv.reader(file) if row)

handles = []
# Matches a profile link such as https://twitter.com/<handle>; group 2 is the
# handle. The negative lookahead rejects deeper paths (twitter.com/<name>/...).
# Raw string so the backslashes reach the regex engine verbatim, and the dot
# is escaped so it only matches a literal ".".
handle_pattern = r'(https:\/\/twitter\.com\/(?![a-zA-Z0-9_]+\/)([a-zA-Z0-9_]+))'

# Try to scrape a Twitter handle from each website. One entry is appended to
# ``handles`` per row ("" when nothing was found) so the two lists stay aligned.
for url in rows:
    # Guard against an empty row so the lookup itself cannot raise.
    site = url[0] if url else ""
    print("Scraping " + site)
    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen("http://" + site, timeout=30) as response:
            # Replace undecodable bytes instead of rejecting the whole page:
            # the handle and its URL are plain ASCII.
            html = response.read().decode("utf-8", errors="replace")
        handle = getHandle(handle_pattern, html)
    except Exception:
        # Best effort: network errors and "no match" (IndexError from
        # getHandle) both count as not found. KeyboardInterrupt and
        # SystemExit are deliberately not caught so the run can be aborted.
        handles.append("")
        print("Not found: " + site)
    else:
        handles.append("@" + handle)
        print("Found: @" + handle + " for " + site)

# Write each website together with its handle ("" when none was found) to
# output.csv. newline="" keeps the csv writer's own "\r\n" row terminators
# from being translated again, which otherwise yields blank rows on Windows.
with open("output.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([row[0], handle] for row, handle in zip(rows, handles))
	import csv
	import urllib.request
	import re
	import time
	# Rows read from input.csv; element 0 of each row is used as the website
	# address (it is prefixed with "http://" when fetched).
	rows = []

	def getHandle(pattern, html):
	    """Return item 1 of the first ``re.findall`` result for ``pattern`` in ``html``.

	    With the two-group Twitter pattern used by this script, item 1 is the
	    inner capture group, i.e. the handle without the URL prefix.

	    Args:
	        pattern: Regular expression to search with.
	        html: Text to search.

	    Returns:
	        Element 1 of the first match returned by ``re.findall``.

	    Raises:
	        IndexError: If ``pattern`` does not match anywhere in ``html``.
	    """
	    result = re.findall(pattern, html)
	    return result[0][1]

	# Read the website URLs from input.csv into ``rows``.
	# newline="" is what the csv module documents for files passed to csv.reader,
	# so quoted fields that contain line breaks are parsed correctly.
	with open("input.csv", "r", newline="") as file:
	    # csv.reader yields [] for blank lines; skip them because every later
	    # step indexes element 0 of each row.
	    rows.extend(row for row in csv.reader(file) if row)

	handles = []
	# Matches a profile link such as https://twitter.com/<handle>; group 2 is the
	# handle. The negative lookahead rejects deeper paths (twitter.com/<name>/...).
	# Raw string so the backslashes reach the regex engine verbatim, and the dot
	# is escaped so it only matches a literal ".".
	handle_pattern = r'(https:\/\/twitter\.com\/(?![a-zA-Z0-9_]+\/)([a-zA-Z0-9_]+))'
	# Try to scrape a Twitter handle from each website. One entry is appended to
	# ``handles`` per row ("" when nothing was found) so the two lists stay aligned.
	for url in rows:
	    # Guard against an empty row so the lookup itself cannot raise.
	    site = url[0] if url else ""
	    print("Scraping " + site)
	    try:
	        # The context manager closes the connection even if reading fails.
	        with urllib.request.urlopen("http://" + site, timeout=30) as response:
	            # Replace undecodable bytes instead of rejecting the whole page:
	            # the handle and its URL are plain ASCII.
	            html = response.read().decode("utf-8", errors="replace")
	        handle = getHandle(handle_pattern, html)
	    except Exception:
	        # Best effort: network errors and "no match" (IndexError from
	        # getHandle) both count as not found. KeyboardInterrupt and
	        # SystemExit are deliberately not caught so the run can be aborted.
	        handles.append("")
	        print("Not found: " + site)
	    else:
	        handles.append("@" + handle)
	        print("Found: @" + handle + " for " + site)

	# Write each website together with its handle ("" when none was found) to
	# output.csv. newline="" keeps the csv writer's own "\r\n" row terminators
	# from being translated again, which otherwise yields blank rows on Windows.
	with open("output.csv", "w", newline="") as file:
	    writer = csv.writer(file)
	    writer.writerows([row[0], handle] for row, handle in zip(rows, handles))