Matthew Phillips (phillipsm)
...
/**
 * A container object to house our incoming HTTP request
 *
 * @author Matt Phillips <mphillips@law.harvard.edu>
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser Public License
 */
class http_request {
phillipsm / gist:8601065
Created January 24, 2014 16:43
wget command
# Construct wget command
command = 'wget '
command = command + '--quiet ' # turn off wget's output
command = command + '--tries=' + str(settings.NUMBER_RETRIES) + ' ' # number of retries (assuming no 404 or the like)
command = command + '--wait=' + str(settings.WAIT_BETWEEN_TRIES) + ' ' # number of seconds between requests (lighten the load on a page that has a lot of assets)
command = command + '--quota=' + settings.ARCHIVE_QUOTA + ' ' # only store this amount
command = command + '--random-wait ' # random wait between .5 seconds and --wait=
command = command + '--limit-rate=' + settings.ARCHIVE_LIMIT_RATE + ' ' # we'll be performing multiple archives at once. let's not download too much in one stream
command = command + '--adjust-extension ' # if a page is served up at .asp, adjust to .html. (this is the new --html-extension flag)
command = command + '--span-hosts ' # sometimes things like images are hosted at a CDN. let's span-hosts to get those
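A minimal sketch of how the assembled command might be finished and run; the target URL and the subprocess call below are assumptions, not part of the original gist.

import shlex
import subprocess

target_url = 'http://example.com/'  # hypothetical page to archive
command = command + target_url      # assumed final step: append the URL
subprocess.check_call(shlex.split(command))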
function check_status() {
  // Check our status service to see if we have archiving jobs pending
  var request = $.ajax({
    url: status_url + newLinky.linky_id,
    type: "GET",
    dataType: "json",
    cache: false
  });
}
phillipsm / gist:0ed98b2585f0ada5a769
Last active November 25, 2022 14:02
Example of parsing a table using BeautifulSoup and requests in Python
import requests
from bs4 import BeautifulSoup
# We've now imported the two packages that will do the heavy lifting
# for us, requests and BeautifulSoup
# Let's put the URL of the page we want to scrape in a variable
# so that our code down below can be a little cleaner
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
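The preview cuts off here. A plausible continuation, sketched on the assumption that the inmate list sits in a table with class inmatesList, as the "Build list of inmates" gist below suggests:

r = requests.get(url_to_scrape)
soup = BeautifulSoup(r.text, 'html.parser')
# Walk each row of the inmate table; the selector is borrowed from the
# later gist and is an assumption here
for table_row in soup.select(".inmatesList tr"):
    table_cells = table_row.findAll('td')
    if table_cells:
        print(table_cells[0].text.strip())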
phillipsm / gist:c832c825c994735b31fe
Last active August 29, 2015 14:21
All material for dgmde15


Still dumping material in here.

phillipsm / gist:404780e419c49a5b62a8
Last active April 22, 2024 11:55
Inmate scraping script
import requests
from bs4 import BeautifulSoup
import time
# We've now imported the two packages that will do the heavy lifting
# for us, requests and BeautifulSoup
# This is the URL that lists the current inmates
# Should this URL go away, an archive is available at
# http://perma.cc/2HZR-N38X
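The time import suggests the script pauses between requests. A minimal sketch of that pattern; the one-second delay and the inmates_links list are assumptions borrowed from the step-by-step gists below:

for inmate_link in inmates_links:
    r = requests.get(inmate_link)
    time.sleep(1)  # assumed delay: lighten the load on the server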
phillipsm / gist:2bdb5f622cbabe107c5b
Created June 24, 2015 20:14
Import our packages
import requests
from bs4 import BeautifulSoup
phillipsm / gist:7199f931a2de6787c0b6
Created June 24, 2015 20:16
Build list of inmates
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
r = requests.get(url_to_scrape)
soup = BeautifulSoup(r.text, 'html.parser')

inmates_links = []
for table_row in soup.select(".inmatesList tr"):
    table_cells = table_row.findAll('td')

inmates = []
for inmate_link in inmates_links[:10]:
    r = requests.get(inmate_link)
    soup = BeautifulSoup(r.text, 'html.parser')

    inmate_details = {}
    inmate_profile_rows = soup.select("#inmateProfile tr")
    inmate_details['age'] = inmate_profile_rows[0].findAll('td')[0].text.strip()
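Both loops above are cut off by the preview: nothing is ever appended to inmates_links or inmates. A sketch of plausible completions; the cell position, link layout, and profile row index are assumptions (only the 'city' key is confirmed by the aggregation gist below):

# Sketch of the missing tail of the first loop: collect each inmate's
# detail-page link (cell position and link layout assumed)
for table_row in soup.select(".inmatesList tr"):
    table_cells = table_row.findAll('td')
    if table_cells:
        link = table_cells[0].find('a')
        if link is not None:
            inmates_links.append(link['href'])

# Sketch of the missing tail of the second loop, inside its body:
#     inmate_details['city'] = inmate_profile_rows[1].findAll('td')[0].text.strip()
#     inmates.append(inmate_details)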
phillipsm / gist:29d4cb4addb5c5a21ae7
Created June 24, 2015 20:22
Sum and print aggregations
inmate_cities = {}
for inmate in inmates:
    if inmate['city'] in inmate_cities:
        inmate_cities[inmate['city']] += 1
    else:
        inmate_cities[inmate['city']] = 1
print(inmate_cities)
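The same tally can be written more compactly with collections.Counter; a sketch, not part of the original gist:

from collections import Counter

inmate_cities = Counter(inmate['city'] for inmate in inmates)
print(inmate_cities)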