Skip to content

Instantly share code, notes, and snippets.

View JBlumstein's full-sized avatar

Jonah Blumstein JBlumstein

View GitHub Profile
@JBlumstein
JBlumstein / socrata_nyc_parking_tickets_get.py
Last active November 10, 2016 18:58
Get NYC Parking Tickets Data from Socrata
#import libraries
import numpy as np
import pandas as pd
import datetime
import urllib
from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
@JBlumstein
JBlumstein / gist:2d8d69652ad34f9a9c6766ceb3fc4630
Created August 26, 2016 16:35
Last.FM Script for Pitchfork Album Reviews
#script to add genre, playcount, listener count, album duration, and number of songs information from lastfm
#to all album reviews scraped from pitchfork before the site's redesign
import requests
import json
import pandas as pd
import time
#load in table of album reviews metadata (album name, artist, review date, and rating)
album_table = pd.read_csv("pitchforkalbumreviews.csv")
@JBlumstein
JBlumstein / gist:d475f62e686e0d55ab56dae1a0503211
Created August 26, 2016 15:14
Pitchfork Album Review Scraper
import urllib.request as ur #url.request lib for handling the url
from bs4 import BeautifulSoup #bs for parsing the page
import requests
from lxml import html
import numpy as np
import pandas as pd
#get index pages for album reviews
#base url to add to
@JBlumstein
JBlumstein / Pitchfork Scraper
Last active August 29, 2015 14:17
Pitchfork Scraper
#This scraper finds album review titles, dates, and scores from Pitchfork. It scrapes the child pages of the object-grid css class on http://pitchfork.com/reviews/albums/1 (...n) and puts them into a list of lists, a list of dicts, and a Pandas dataframe.
#The second half of this script takes the list of lists that results from the first half, finds the average album review score for each month, and puts it into a list of dicts and a Pandas dataframe.
#Note that this scraper takes a long time to run because it uses BeautifulSoup to scrape each parent page (the child pages are scraped using xpath). If you want a demo, lower the number in the while loop to something like 3 or 4.
#As of March 2015, there are 818 pages of album reviews.
import urllib.request as ur #url.request lib for handling the url
from bs4 import BeautifulSoup #bs for parsing the page
import requests
from lxml import html
import numpy
@JBlumstein
JBlumstein / Epicurious scraper
Last active August 29, 2015 14:16
Scraper to find a recipe on epicurious using user-provided ingredients
#Homework assignment: ask the user what ingredients they want to use in a recipe, go to the epicurious results page, select the first result, and return the recipe name, description, ingredients, and instructions.
import urllib.request as ur #url.request lib for handling the url
from bs4 import BeautifulSoup #bs for parsing the page
def recipegetter(searchterms):
searchterms_nocommas = searchterms.replace(",","")
urlslug = searchterms_nocommas.replace(" ","+")
baseurl = "http://www.epicurious.com/tools/searchresults?search="
fullurl = baseurl+urlslug
@JBlumstein
JBlumstein / BuzzFeed Scraper
Created December 22, 2014 03:01
Scrapes BuzzFeed by author or category (OMG, LOL, etc.) feed
from lxml import html
import requests
import pandas
root_url = 'http://www.buzzfeed.com/'
page_ids = ['bensmith', 'wtf', 'omg', '1uapps', 'amazon', 'amazonfiretv', 'discovercard', 'verizonholiday', 'autotraderdotcom','amazonkindle', 'jackdaniels', 'ibm', 'microsoftbing', 'wakeupcall', 'nhtsagov', 'intelinnovationslab', 'volvo', 'yahoo', 'walgreens', 'directv', 'motorolamotox', 'android', 'fruitwater', 'directvcinema', 'curve', 'loft', 'ford', 'lol', 'cute', 'fail', 'quizzes', 'videos', 'entertainment', 'buzz', 'mbvd','hayesbrown','jimdalrympleii','alexcampbell','kirstenking','christianzamora','candacelowry','kristinchirico','polls','alannaokun','jessicaprobus','katenocera','evanmcsan','mathewzeitlin','mikegiglio','adolfoflores', 'mollyhensleyclancy','chrisgeidner','jasonwells','andrewkaczynski','emilyorley','claudiakoerner','ryanhatesthis','sapna','lindseyadler','tasneemnashrulla','dominicholden','rossalynwarren','maryanngeorgantopoulos','nicholasmedinamora','ashleyford','adamdavis','lyapalater','alexalvarez
@JBlumstein
JBlumstein / Reddit posts by domain
Created December 22, 2014 02:56
Script to search articles by publication on reddit
#note that this script takes a couple of minutes to run
import pandas
import time
import json
import requests
import datetime
import sys
#Enter your reddit username and password here