Skip to content

Instantly share code, notes, and snippets.

@emsesc
Last active December 3, 2022 22:23
Show Gist options
  • Save emsesc/3e8e47e66838be2284e73dceb9905f74 to your computer and use it in GitHub Desktop.
Reads web sources from "input.csv" and outputs websites with corresponding Twitter handles.
import csv
import re
import time
import urllib.request

# Regex capturing a twitter.com profile link; the SECOND group is the bare
# handle.  The negative lookahead rejects deeper paths such as
# twitter.com/<user>/status/... so only plain profile URLs match.
TWITTER_PATTERN = r'(https://twitter\.com/(?![a-zA-Z0-9_]+/)([a-zA-Z0-9_]+))'


def getHandle(pattern, html):
    """Return the second capture group of the first match of *pattern* in *html*.

    Raises IndexError when the pattern does not match anywhere in *html*;
    the caller treats that as "no handle found".
    """
    result = re.findall(pattern, html)
    return result[0][1]


def main():
    """Read website URLs from input.csv, scrape each homepage for a Twitter
    handle, and write url/handle pairs to output.csv."""
    # Read website URLs (URL expected in the first column of each row).
    # newline='' is the documented way to open files for the csv module.
    rows = []
    with open("input.csv", 'r', newline='') as file:
        for row in csv.reader(file):
            rows.append(row)

    handles = []
    # Attempt to scrape a Twitter handle from each website's homepage.
    for url in rows:
        try:
            print("Scraping " + url[0])
            # Context manager closes the HTTP connection even on error.
            with urllib.request.urlopen("http://" + url[0], timeout=30) as response:
                html = response.read().decode("utf-8")
            handle = getHandle(TWITTER_PATTERN, html)
            handles.append("@" + handle)
            print("Found: @" + handle + " for " + url[0])
        except Exception:
            # Network failure, decode error, or no match (IndexError):
            # record an empty handle so rows and handles stay aligned.
            handles.append("")
            print("Not found: " + url[0])

    # Write url/handle pairs to output.csv.
    with open("output.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        for row, handle in zip(rows, handles):
            writer.writerow([row[0], handle])


if __name__ == "__main__":
    main()
@emsesc
Copy link
Author

emsesc commented Dec 3, 2022

YouTube Version

import csv
import re
import time
import urllib.request

# Regex capturing a www.youtube.com link; the SECOND group is the channel
# path (letters, digits, underscores and slashes, e.g. "c/SomeChannel").
YOUTUBE_PATTERN = r'(https://www\.youtube\.com/([a-zA-Z0-9_/]+))'


def getHandle(pattern, html):
    """Return the second capture group of the first match of *pattern* in *html*.

    Raises IndexError when the pattern does not match anywhere in *html*;
    the caller treats that as "no link found".
    """
    result = re.findall(pattern, html)
    return result[0][1]


def main():
    """Read website URLs from input.csv, scrape each homepage for a YouTube
    link, and write url/link pairs to output.csv."""
    # Read website URLs (URL expected in the first column of each row).
    # newline='' is the documented way to open files for the csv module.
    rows = []
    with open("input.csv", 'r', newline='') as file:
        for row in csv.reader(file):
            rows.append(row)

    handles = []
    # Attempt to scrape a YouTube link from each website's homepage.
    for url in rows:
        try:
            print("Scraping " + url[0])
            # Some sites reject requests without a browser-like User-Agent.
            req = urllib.request.Request("http://" + url[0],
                                         headers={'User-Agent': 'Mozilla/5.0'})
            # Context manager closes the HTTP connection even on error.
            with urllib.request.urlopen(req, timeout=30) as response:
                html = response.read().decode("utf-8")
            handle = getHandle(YOUTUBE_PATTERN, html)
            handles.append("https://www.youtube.com/" + handle)
            print("✅ Found: https://www.youtube.com/" + handle + " for " + url[0])
        except Exception as e:
            # Network failure, decode error, or no match (IndexError):
            # record an empty entry so rows and handles stay aligned.
            print("🚨: " + str(e))
            handles.append("")
            print("❌ Not found: " + url[0])

    # Write url/YouTube-link pairs to output.csv.
    with open("output.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        for row, handle in zip(rows, handles):
            writer.writerow([row[0], handle])


if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment