@alexwoolford
alexwoolford / dpsClosestSchools.py
Last active August 29, 2015 14:06
Denver Public Schools: find the closest schools
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib2
import re
from pygeocoder import Geocoder
import time
from pymongo import MongoClient
# Get the HTML for the school list and create BeautifulSoup object.
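The gist preview ends at the imports. What follows is a minimal sketch of how the scrape-geocode-load flow might continue; the listing URL, selectors, and database/collection names are assumptions rather than details taken from the gist (pymongo 2.x would use insert() instead of insert_one()):

schoolListUrl = "http://schools.dpsk12.org/"  # hypothetical listing page
soup = BeautifulSoup(urllib2.urlopen(schoolListUrl).read(), 'html.parser')
schools = MongoClient()['dps']['schools']  # assumed database/collection names
for link in soup.find_all('a', href=True):
    name = link.get_text().strip()
    if not name:
        continue
    try:
        # geocode the school name and store its coordinates
        latitude, longitude = Geocoder.geocode(name + ", Denver, CO").coordinates
        schools.insert_one({'name': name, 'url': link['href'],
                            'latitude': latitude, 'longitude': longitude})
    except Exception:
        pass  # skip links the geocoder cannot resolve
    time.sleep(0.2)  # stay well under the geocoding rate limit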
@alexwoolford
alexwoolford / mongousExcelGobbler.py
Last active August 29, 2015 14:06
Excel spreadsheets can be consolidated by loading them into MongoDB, working out the full set of columns across all the worksheets/workbooks in a folder, and forcing the output into a Hive table.
#!/usr/bin/python
"""
From time to time it's necessary to consolidate data from multiple spreadsheets. If the spreadsheets lack validation
rules, users may enter values that break business rules, add columns, change the column order, and so on; it can become
a messy problem.
One possible way to handle this is to load all the workbooks/worksheets into a NoSQL database (e.g. MongoDB), and explore
the dataset in its entirety in order to identify and address anomalies.
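The preview cuts off inside the docstring. Below is a minimal sketch of the load step it describes, assuming a recent pandas, a hypothetical folder of .xlsx workbooks, and illustrative database/collection names:

import glob
import os
import pandas as pd
from pymongo import MongoClient

collection = MongoClient()['spreadsheets']['rows']  # assumed names
for path in glob.glob('/path/to/workbooks/*.xlsx'):  # hypothetical folder
    # sheet_name=None returns a dict of {worksheet name: DataFrame}
    for sheetName, frame in pd.read_excel(path, sheet_name=None).items():
        records = frame.to_dict(orient='records')
        for record in records:
            record['workbook'] = os.path.basename(path)  # tag each row with its source
            record['worksheet'] = sheetName
        if records:
            collection.insert_many(records)

With every row tagged by workbook and worksheet, a simple aggregation over the collection shows which files deviate from the expected column set.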
@alexwoolford
alexwoolford / createNeo4jLinkGraph.py
Last active August 29, 2015 14:06
A graph database (e.g. Neo4j) is an interesting way to look at the links between pages on a site.
# A JSON file containing each page's URL and HTML is loaded into a Neo4j graph database.
import json
from bs4 import BeautifulSoup
from py2neo import neo4j
graph_db = neo4j.GraphDatabaseService()
urls = graph_db.get_or_create_index(neo4j.Node, "Urls")
connectedTo = graph_db.get_or_create_index(neo4j.Relationship, "ConnectedTo")
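The preview stops after the index setup. A sketch of how the link graph might then be built, assuming py2neo 1.6's legacy index API and rel() casting function, and a hypothetical JSON-lines file of {"url": ..., "html": ...} records:

from py2neo import rel

for line in open('dpsk12_org.json'):  # hypothetical path to the scraped pages
    page = json.loads(line)
    source = urls.get_or_create("url", page['url'], {"url": page['url']})
    soup = BeautifulSoup(page['html'], 'html.parser')
    for anchor in soup.find_all('a', href=True):
        target = urls.get_or_create("url", anchor['href'], {"url": anchor['href']})
        graph_db.create(rel(source, "CONNECTED_TO", target))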
@alexwoolford
alexwoolford / dpsCreateSolrIndex.py
Created September 22, 2014 05:50
The DPS pages can be indexed in Solr.
# Interesting features from the HTML can then be loaded into a Solr index.
import json
from bs4 import BeautifulSoup
import solr
s = solr.SolrConnection('http://localhost:8983/solr')
for recordNum, line in enumerate(open('/Users/awoolford/Documents/scrapeDPS/dpsk12_org/dpsk12_org.json', 'r').readlines()):
    try:
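        # The gist preview is truncated at this point; the rest of the loop body
        # below is a hedged sketch. The JSON structure and the Solr field names
        # (id, url, title, text) are assumptions, not taken from the gist.
        record = json.loads(line)
        soup = BeautifulSoup(record['html'], 'html.parser')
        title = soup.title.get_text() if soup.title else ''
        s.add(id=recordNum, url=record.get('url'), title=title, text=soup.get_text())
    except Exception:
        continue  # skip records that cannot be parsed or indexed
s.commit()
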
# The csv dataset was downloaded from http://data.denvergov.org/dataset/city-and-county-of-denver-crime
import pandas as pd
from elasticsearch import Elasticsearch
import json
data = pd.read_csv('/Users/awoolford/Downloads/crime.csv')
es = Elasticsearch()
# ignore the 400 caused by IndexAlreadyExistsException when creating an index
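The preview stops at the index-creation comment. A minimal sketch of how the load might continue with the elasticsearch-py client; the index name and doc type are assumptions, and newer client versions drop the doc_type argument:

es.indices.create(index='crime', ignore=400)
# NaN is not valid JSON, so replace missing values with None before indexing
data = data.where(pd.notnull(data), None)
for i, row in data.iterrows():
    es.index(index='crime', doc_type='record', id=i, body=row.to_dict())
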
# The scrapy framework has a helper command to generate a skeleton spider, e.g.
scrapy genspider somesiteSpider somesite.com
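For reference, the command generates a spider skeleton roughly like the one below (the exact template varies with the Scrapy version):

import scrapy

class SomesitespiderSpider(scrapy.Spider):
    name = "somesiteSpider"
    allowed_domains = ["somesite.com"]
    start_urls = ["http://somesite.com/"]

    def parse(self, response):
        pass
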
# Sometimes patterns in the URLs make data acquisition a breeze. A text file containing a list of URLs is created:
outfile = open('urls.txt', 'w')
for id in range(1, 17723):
    url = "https://somesite.com/getDetail.do?someId=" + str(id)
    outfile.write(url + "\n")
outfile.close()
$ ipython
In [1]: import pandas as pd
In [2]: mpg = pd.read_csv('mpg.csv')
# Move the mpg dataframe to R: the "-i mpg" flag passes the pandas dataframe in as an R input.
In [3]: %load_ext rpy2.ipython
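The transcript is cut off before the magic itself; the call described by the comment would look something like the line below (summary() is just an illustrative R call). The complementary -o flag pulls an R object back into the pandas session.

In [4]: %R -i mpg summary(mpg)
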
"""
This scrapes Amazon's distribution centers from the web and creates a file of markers for an interactive Leaflet.js map.
For now (10/03/2014) it can be viewed at http://54.172.30.118/amzn-locations.html
"""
import urllib2
from bs4 import BeautifulSoup
from pygeocoder import Geocoder
import time
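The preview stops at the imports. A minimal sketch of the geocode-and-write step, assuming the distribution-center addresses have already been scraped into a list (the marker file format is an assumption; Leaflet only needs latitude/longitude pairs):

import json

addresses = []  # street addresses scraped with BeautifulSoup (scraping step not shown)
markers = []
for address in addresses:
    try:
        latitude, longitude = Geocoder.geocode(address).coordinates
        markers.append({'address': address, 'lat': latitude, 'lng': longitude})
    except Exception:
        pass  # skip addresses the geocoder cannot resolve
    time.sleep(0.2)  # stay under the geocoding rate limit

# amzn-locations.html can load this file and place each marker with L.marker([lat, lng])
with open('amzn-locations.json', 'w') as outfile:
    json.dump(markers, outfile)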
@alexwoolford
alexwoolford / modelCreationExecutionTimingsGPUvsNonGPU.R
Last active August 29, 2015 14:07
Statistical models on the GPU: a performance comparison.
# Compare model-fitting times on the GPU (via gputools) with their CPU equivalents,
# using the EPA fuel-economy data from the fueleconomy package.
library(gputools)
library(e1071)
library(fueleconomy)
data(vehicles)
vehicles$id <- NULL                               # drop the identifier column
vehicles$hwy <- NULL                              # drop highway mpg
vehicles <- as.data.frame(unclass(vehicles))      # coerce to a plain data.frame; characters become factors
vehicles <- vehicles[complete.cases(vehicles), ]  # keep only complete cases
"""
# http://stackoverflow.com/questions/26248943/mysql-sseparate-value-from-one-column-to-several-rows-and-join-with-other-column
Set up the tables:
create table papers
(
paper_title varchar(10),
authors varchar(10),
year integer