@yarko
Last active May 31, 2017 01:41
debugging scraping threads from twitter
import requests
# 2017-05-30 17:02:10.049243
url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
r = requests.get(url)
# Save the text, see if response is there or not manually:
# 2017-05-30 17:04:07.293014
with open('bug_reqraw.txt', 'w') as f:
    f.write(r.text)
# Or parse, and see if response is in parsed tree
# 2017-05-30 17:09:04.052462
from bs4 import BeautifulSoup
# 2017-05-30 17:10:33.255331
soup = BeautifulSoup(r.text, "html.parser")
# 2017-05-30 17:11:09.909756
import re
# 2017-05-30 17:11:43.421237
t = soup.find(string=re.compile("Reading"))
# 2017-05-30 17:11:48.822705
len(t)   # raises TypeError if the search string was not found (find() returns None)
# the simplest approach (and it also fails):
import urllib.request
url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
local_filename, headers = urllib.request.urlretrieve(url, 'bug-urllib.txt')
#!/usr/bin/env python
import vcr
import requests
from hashlib import sha1
from bs4 import BeautifulSoup
from textwrap import wrap
# by default, this is for grabbing twitter threads,
# which are otherwise impossible to print!
def grab(url, update=False, dev=None):
    '''
    grab a url and save it locally w/ vcr;
    update=True: use vcrpy record_mode "all"
        to update the cassette;
    dev - develop: save an uncompressed text
        copy of the results, with the url, to
        the text file named in the 'dev' argument.
    '''
    vcr_settings = {
        'cassette_library_dir': 'vcr_cassettes'
    }
    if update:
        vcr_settings.update({'record_mode': 'all'})
    my_vcr = vcr.VCR(**vcr_settings)
    # need bytestring for hashlib routines;
    burl = sha1(bytearray(url, 'utf8'))
    # name the vcr saved file w/ hexdigest of url
    hd = burl.hexdigest()  # for yaml filename
    with my_vcr.use_cassette(f'{hd}.yaml'):
        soup = requests.get(url).text
    # if you want to test a bit more:
    if dev:  # save this, so you can develop a script
        with open(dev, 'w') as f:
            f.write(url + '\n\n')
            for line in soup.splitlines():
                s = '\n'.join(wrap(line, width=70)) + '\n'
                f.write(s)
    return soup


if __name__ == "__main__":
    # DEV:
    url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
    soup = grab(url, dev="grab_net.txt", update=True)
    soup = grab(url, dev="grab_vcr.txt")
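# --- sketch, not part of the original gist ---
# Hypothetical helper comparing a fresh network fetch with a cassette replay
# for the reply text the comment below reports as missing; it reuses grab()
# above, and the search string comes from the interactive session at the top.
def check_net_vs_cassette(url, needle='Reading all the responses'):
    live = grab(url, update=True)    # record_mode "all": records from the net
    replay = grab(url)               # default mode: replays the saved cassette
    print('network fetch contains reply:  ', needle in live)
    print('cassette replay contains reply:', needle in replay)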
# this also fails:
curl -o bug-curl.txt https://twitter.com/MinaMarkham/status/865606994614296576
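# --- sketch, not part of the original gist ---
# Hypothetical helper: search each raw download saved above (requests, urllib,
# curl) for the missing reply text, to see whether any simple fetch returns it.
def check_raw_downloads(needle='Reading all the responses'):
    for name in ('bug_reqraw.txt', 'bug-urllib.txt', 'bug-curl.txt'):
        with open(name) as f:
            print(name, '->', 'found' if needle in f.read() else 'missing')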
yarko commented May 30, 2017

This is a work in progress. There is a reply in this thread which seems to be:

  • picked up by vcrpy when fetching from the net (but not when replaying from a cassette);
  • not picked up by requests (so it doesn't seem to be a BeautifulSoup issue).

The missing thread response I noticed is https://twitter.com/yarkot/status/866166065113571329,
with text beginning with "Reading all the responses, 2 things clear:"

Since this reply shows up in "view source" in a browser (e.g. Chrome), but is also missing from a simple curl fetch,
it raises the question: what is special about some thread responses (and about fetching them)?!?
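A related check (a sketch, not something from the gist above) would be to request the missing reply's own status URL directly and see whether its text is served there; that would narrow down whether the thread page or the individual tweet is the part behaving differently:

import requests

reply_url = 'https://twitter.com/yarkot/status/866166065113571329'  # the reply noted above
html = requests.get(reply_url).text
print('reply text served directly:', 'Reading all the responses' in html)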
