Skip to content

Instantly share code, notes, and snippets.

@clayote
Created May 18, 2021 17:03
Show Gist options
  • Save clayote/ff1aed82053863b9c204dccc4aad219d to your computer and use it in GitHub Desktop.
Save clayote/ff1aed82053863b9c204dccc4aad219d to your computer and use it in GitHub Desktop.
My effort to get useful data out of Twitter profiles
from requests_html import HTMLSession, HTML
from lxml.etree import ParserError
import json
import time
import random
# Load the exported list of followed accounts. Each entry is read below as
# entry['following']['accountId'].
with open('following.json', 'rt') as inf:
    useless = json.load(inf)

session = HTMLSession()

# headers are what chromium spits out
# I've tried passing in an active cookie as well and it doesn't seem to work
# They are identical for every request, so build them once outside the loop.
headers = {
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    # The value must not repeat the header name ("User-Agent: ...").
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    ),
}

# One entry per followed account, in the same order as `useless`: the text of
# every element matched by the profile-header selector (possibly empty).
useful = []
for i, you in enumerate(useless):
    userid = you['following']['accountId']
    page = session.get(f"https://twitter.com/i/user/{userid}", headers=headers)
    # Run the page's JavaScript so dynamically inserted markup can be queried.
    page.html.render()
    header = page.html.find('[data-testid="UserProfileHeader_Items"]')
    # I would do some more processing here but the find call never returns anything!
    # Store plain text rather than Element objects: json.dump cannot
    # serialize Elements and would raise TypeError when writing the output.
    useful.append([item.text for item in header])
    # Random 1-9 second pause between requests.
    time.sleep(random.randrange(1, 10))
    # Progress indicator every 100 accounts.
    if i % 100 == 0:
        print(i)

with open('following_detailed.json', 'w') as outf:
    json.dump(useful, outf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment