Create a gist now

Instantly share code, notes, and snippets.

@amandabee /README.md
Last active Aug 29, 2015

A handful of pretty random python scripts.

Just ... some scripts. Nothing special.

""" Generate PDF bingo cards."""
import os
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.lib.colors import Color
import random
""" TO DO: use better fonts.
from reportlab.pdfbase.ttfonts import TTFont
Then can register TTF with
pdfmetrics.registerFont(TTFont('Vera','Vera.ttf'))
"""
def set_canvas(filename):
    """Create and return a letter-sized reportlab canvas for *filename*."""
    pdf_canvas = canvas.Canvas(filename)
    pdf_canvas.setPageSize(letter)
    # Fonts are set in the draw routines, not here: easier per string.
    return pdf_canvas
def get_box_width(word="bingo", page_size=letter, page_margin=1 * inch):
    """For any word and page dimensions, return the box width that lets
    len(word) boxes accommodate the word and fill the printable width
    (page width minus a margin on each side).
    !!! FIX: If I was clever, I'd set the base font size here, too.
    """
    usable_width = page_size[0] - 2 * page_margin
    return usable_width / len(word)
def get_coords(word="bingo"):
    """Return (x_list, y_list, coords) for *word*.

    coords -- one (x, y) tuple per cell, column by column, where each
              string gets drawn; len(word) columns of len(word) + 1 cells
              (headers plus numbers).
    x_list -- x positions for the grid's vertical lines.
    y_list -- y positions for the grid's horizontal lines.
    """
    box = get_box_width(word)
    cols = len(word)
    # Centre the grid on the page: the first letter is drawn half a box
    # in from the gutter.
    first_x = 0.5 * box + (letter[0] - box * cols) / 2
    y_offset = 0.5 * box + (letter[1] - box * (cols + 1)) / 2
    first_y = 11 * inch - y_offset
    # One x per column; one extra y row for the column headers.
    x_values = [first_x + box * n for n in range(cols)]
    y_values = [first_y - box * n for n in range(cols + 1)]
    coords = [(x, y) for x in x_values for y in y_values]
    # Grid lines sit half a box outside the string anchor points.
    x_list = [int(x - box * 0.5) for x in x_values]
    x_list.append(int(x_values[-1] + box * 0.5))
    y_list = [int(y + box * 0.65) for y in y_values]
    # This math is cheating: empirically the bottom line was off by 13.
    y_list.append(int(y_values[-1] - box * 0.5 + 13))
    return x_list, y_list, coords
def set_ranges(word="bingo", i=15):
    """Map each letter of *word* to its [min, max] number range.

    Ranges are consecutive blocks of *i* integers: the first letter gets
    [1, i], the second [i + 1, 2 * i], and so on.  Defaults to "bingo"
    with an interval of 15.  The dict also carries the starting word
    under the key 'word'.
    """
    ranges = {'word': word}
    for position, ltr in enumerate(word):
        low = position * i + 1
        ranges[ltr] = [low, low + i - 1]
    return ranges
def set_strings(ranges):
"""Takes a dictionary produced by set_ranges() and generates a list of
random numbers for each letter in the range. How many random numbers depends
on how longthe base word is.
"""
# Get the word we're working with (probably "bingo")
word = ranges['word']
# How long is the word?
length = len(word)
strings = []
# For each letter in the word, pick length random numbers in the range.
for ltr in word:
strings.append(str.upper(ltr))
random_numbers = random.sample(
range(ranges[ltr][0], ranges[ltr][1] + 1), length)
for i in random_numbers:
strings.append(i)
# Put a free cell in the middle, but only if word length is odd
if len(word) % 2 == 0:
print "No free cell"
else:
mid = len(strings) / 2
strings[mid] = "FREE"
return strings
def draw_grid(this_canvas, coords):
    """Draw the card grid on *this_canvas* in 50%-transparent red.

    *coords* is the (x_list, y_list, cell_coords) triple returned by
    get_coords(); only the first two members are used here.
    """
    this_canvas.setLineWidth(2.0)
    # reportlab Color components are floats in 0-1; the original passed
    # 100 for red, which is out of range (viewers clamp it to full red),
    # so 1 preserves the rendered colour while emitting a valid value.
    red50transparent = Color(1, 0, 0, alpha=0.5)
    this_canvas.setStrokeColor(red50transparent)
    x_list = coords[0]
    y_list = coords[1]
    this_canvas.grid(x_list, y_list)
def draw_strings(this_canvas, coords, strings):
    """Draw each entry of *strings* at the matching (x, y) in *coords*.

    Assumes *strings* mixes str column headers and int cell values:
    headers get the big Courier face, numbers a bold Helvetica, and the
    "FREE" cell a smaller font so it fits its box.
    !!! FIX: If the boxes are too small, the font should get reduced.
    """
    for index, cell in enumerate(strings):
        if cell == "FREE":
            # The "FREE" cell needs a smaller font to fit.
            this_canvas.setFont('Helvetica', 28)
        elif isinstance(cell, str):
            this_canvas.setFont('Courier-Bold', 42)
        else:
            this_canvas.setFont('Helvetica-Bold', 36)
        x, y = coords[index]
        this_canvas.drawCentredString(x, y, str(cell))
def draw_cards(path, filename="bingo.pdf", i=45, word="bingo"):
    """Take a path, a filename, some integer (i), some word.  Draw i
    bingo cards, one per page, at path/filename.
    """
    # expanduser so the documented draw_cards("~", ...) call works:
    # os.chdir does not expand "~" itself and would raise OSError.
    os.chdir(os.path.expanduser(path))
    # The canvas, ranges and coordinates only need to be set once.
    this_canvas = set_canvas(filename)
    this_ranges = set_ranges(word)
    this_coordinates = get_coords(word)
    this_coord_tuples = this_coordinates[2]
    # Generate new random strings for each card.
    for _unused in range(0, i):
        card_strings = set_strings(this_ranges)
        draw_strings(this_canvas, this_coord_tuples, card_strings)
        draw_grid(this_canvas, this_coordinates)
        # Close out the page: without this every card (and the grid,
        # previously drawn only once) piles up on a single page.
        this_canvas.showPage()
    this_canvas.save()
"""
draw_cards("~", "bingo.pdf", 13, "bingo")
"""
"""
Feb 2014: I started trying to scrape this w/Beautiful Soup, but it turns out the data is all in JSON.
Extract all facility addresses from http://www.bop.gov/locations/list.jsp
"""
import urllib2
import json
import csv
# The BOP locations endpoint returns the whole facility list as JSON.
url = "http://www.bop.gov/PublicInfo/execute/locations?todo=query&output=json"
json_string = urllib2.urlopen(url).read()
## Load the string of JSON into a dict
jsondata = json.loads(json_string)
## Review the keys of the dict
## or just use http://jsbeautifier.org/ to see what it looks like
for item in jsondata:
    print item
## So I know there are three top level items
## Get the full list of items in "Locations"
# Print the field names of the first facility record to see the schema.
for item in jsondata['Locations'][0]:
    print item
### Open a CSV WRiter
# 'wb' because this is Python 2's csv module.
f=csv.writer(open('/tmp/locations.csv','wb'))
###and write to it.
# and write to it.
# One CSV row per facility; field order here fixes the column order.
# NOTE(review): no header row is written — the columns follow the
# key order below.
for item in jsondata['Locations']:
    f.writerow(
        [item['hasFsl'],
         item['code'],
         item['contactEmail'],
         item['special'],
         item['city'],
         item['privateFacl'],
         item['nameDisplay'],
         item['faclTypeDescription'],
         item['state'],
         item['phoneNumber'],
         item['latitude'],
         item['type'],
         item['locationtype'],
         item['zipCode'],
         item['hasCamp'],
         item['complexCode'],
         item['address'],
         item['securityLevel'],
         item['name'],
         item['gender'],
         item['region'],
         item['longitude'],
         item['hasFdc'],
         item['timeZone'],
         item['nameTitle']])
#!/usr/bin/env python
### Jan 2014
### This is rough, but works. I wanted a spreadsheet of MLB Salaries for a
### basic lesson on means and medians and how wildly extravagant salaries
### distort the mean. So I scraped the data from Newsday's salary database.
#import scraperwiki
import urllib2
from bs4 import BeautifulSoup
import csv
def get_soup(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree."""
    return BeautifulSoup(urllib2.urlopen(url))
def get_salaries(soup, linewriter):
    """Scrape one results page and write each player row via *linewriter*.

    *soup* is the parsed results page; *linewriter* is a csv.DictWriter
    whose field names match the keys built below.  Rows without the
    expected 8 <td> cells (e.g. the header row) raise IndexError, which
    is caught and printed rather than saved.
    """
    # The salary table on the page has id "sdb-results".
    table = soup.find("table", {"id":"sdb-results"})
    for row in table.findAll('tr'):
        cells = row.find_all("td")
        try:
            data = {
                'player' : cells[0].get_text().strip(),
                'team' : cells[1].get_text().strip(),
                'position' : cells[2].get_text().strip(),
                'state' : cells[3].get_text().strip(),
                'league' : cells[4].get_text().strip(),
                'division' : cells[5].get_text().strip(),
                # Strip the leading "$" and commas, then any whitespace.
                '2013_salary' : cells[6].get_text().strip('$,').strip(),
                'age' : cells[7].get_text().strip()
            }
            #scraperwiki.sqlite.save(unique_keys=['player'],data=data)
            linewriter.writerow(data)
            print "Saved " + data['player']
        except Exception,e:
            # Best-effort: log the bad row and keep scraping.
            print str(e)
# Newsday paginates 50 records per page via the currentRecord parameter.
base_url = "http://data.newsday.com/long-island/data/baseball/mlb-salaries-2013/?currentRecord="
print range(1, 854, 50)
# 'a+' appends, so re-runs add to the existing CSV rather than replace it.
with open('/home/amanda/Desktop/mlb_salaries_alt.csv', 'a+') as csvfile:
    # Column order must match the dict keys built in get_salaries().
    fieldorder = ['player' , 'team' , 'position' , 'state' ,
                  'league', 'division', '2013_salary', 'age']
    linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Walk the pagination: records 1, 51, 101, ... up to 853.
    for record in range(1, 854, 50):
        print "starting..."
        url = base_url + str(record);
        soup = get_soup(url)
        get_salaries(soup, linewriter)
        print url;
"""
Pull down a bunch of public mailman archives.
In this case, they're archived monthly and the URLs look something like this:
http://lists.example.net/pipermail/listname/2010-October.txt.gz
"""
import urllib2
import calendar
import gzip
import os
def get_all_gzs(base, years):
"""
for some base URL (unique to your list) and range of years,
download all the archives.
"""
archive_folder = 'list_archives'
if not os.path.exists(archive_folder):
os.makedirs(archive_folder)
for year in years:
print year
for month in calendar.month_name:
filename = str(year) + "-" + month + ".txt.gz"
print filename
url = base + filename
print url
try:
req = urllib2.urlopen(url)
output = open(filename, 'wb')
output.write(req.read())
output.close()
with gzip.open(filename, 'rb') as z:
file_content = z.read()
textfile = archive_folder + "/" + \
str(year) + "-" + month + ".txt"
f = open(textfile, 'w')
f.write(file_content)
f.close
except Exception as e:
print e
# Point URLBASE at your list's pipermail directory, then run: this
# fetches every monthly archive for the years below at import time.
URLBASE = "http://lists.example.net/pipermail/listname/"
YEARS = range(2008, 2016)
get_all_gzs(URLBASE, YEARS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment