@yarko
Last active May 31, 2017 01:41
debugging scraping threads from twitter
import requests
# 2017-05-30 17:02:10.049243
url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
r = requests.get(url)
# Save the text, see if response is there or not manually:
# 2017-05-30 17:04:07.293014
with open('bug_reqraw.txt', 'w') as f:
    f.write(r.text)
# Or parse, and see if response is in parsed tree
# 2017-05-30 17:09:04.052462
from bs4 import BeautifulSoup
# 2017-05-30 17:10:33.255331
soup = BeautifulSoup(r.text, "html.parser")
# 2017-05-30 17:11:09.909756
import re
# 2017-05-30 17:11:43.421237
t = soup.find(string=re.compile("Reading"))
# 2017-05-30 17:11:48.822705
len(t)   # raises TypeError if the search string was not found (find() returns None)
# the simplest approach (and it also fails):
import urllib.request
url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
local_filename, headers = urllib.request.urlretrieve(url, 'bug-urllib.txt')
#!/usr/bin/env python
import vcr
import requests
from hashlib import sha1
from bs4 import BeautifulSoup
from textwrap import wrap
# by default, this is for grabbing twitter threads,
# which are otherwise impossible to print!
def grab(url, update=False, dev=None):
    '''
    grab a url and save it locally w/ vcr;
    update=True: use vcrpy record_mode "all"
        to update the cassette;
    dev - develop: save an uncompressed text
        copy of the results, with the url, to
        the text file named in the 'dev' argument.
    '''
    vcr_settings = {
        'cassette_library_dir': 'vcr_cassettes'
    }
    if update:
        vcr_settings.update({'record_mode': 'all'})
    my_vcr = vcr.VCR(**vcr_settings)
    # need bytestring for hashlib routines;
    burl = sha1(bytearray(url, 'utf8'))
    # name the vcr saved file w/ hexdigest of url
    hd = burl.hexdigest()  # for yaml filename
    with my_vcr.use_cassette(f'{hd}.yaml'):
        soup = requests.get(url).text
    # if you want to test a bit more:
    if dev:  # save this, so you can develop a script
        with open(dev, 'w') as f:
            f.write(url + '\n\n')
            for line in soup.splitlines():
                s = '\n'.join(wrap(line, width=70)) + '\n'
                f.write(s)
    return soup


if __name__ == "__main__":
    # DEV:
    url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
    soup = grab(url, dev="grab_net.txt", update=True)
    soup = grab(url, dev="grab_vcr.txt")
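# --- sketch, not part of the original gist ---
# Hypothetical helper comparing a fresh network fetch with a cassette replay
# for the reply text the comment below reports as missing; it reuses grab()
# above, and the search string comes from the interactive session at the top.
def check_net_vs_cassette(url, needle='Reading all the responses'):
    live = grab(url, update=True)    # record_mode "all": records from the net
    replay = grab(url)               # default mode: replays the saved cassette
    print('network fetch contains reply:  ', needle in live)
    print('cassette replay contains reply:', needle in replay)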
# this also fails:
curl -o bug-curl.txt https://twitter.com/MinaMarkham/status/865606994614296576
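# --- sketch, not part of the original gist ---
# Hypothetical helper: search each raw download saved above (requests, urllib,
# curl) for the missing reply text, to see whether any simple fetch returns it.
def check_raw_downloads(needle='Reading all the responses'):
    for name in ('bug_reqraw.txt', 'bug-urllib.txt', 'bug-curl.txt'):
        with open(name) as f:
            print(name, '->', 'found' if needle in f.read() else 'missing')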
yarko commented May 30, 2017

This is a work in progress. There is a reply in this thread which seems to be:

  • picked up by vcrpy when fetching from the net (but not when replaying from a cassette);
  • not picked up by requests (so it doesn't seem to be a BeautifulSoup issue).

The missing thread response I noticed is https://twitter.com/yarkot/status/866166065113571329,
with text beginning with "Reading all the responses, 2 things clear:"

Since this reply shows up in "view source" in a browser (e.g. Chrome), but is also missing from a simple curl fetch,
it raises the question: what is special about some thread responses (and about fetching them)?!?
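A related check (a sketch, not something from the gist above) would be to request the missing reply's own status URL directly and see whether its text is served there; that would narrow down whether the thread page or the individual tweet is the part behaving differently:

import requests

reply_url = 'https://twitter.com/yarkot/status/866166065113571329'  # the reply noted above
html = requests.get(reply_url).text
print('reply text served directly:', 'Reading all the responses' in html)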
