Chase Davis cjdd3b

## fingerprint.py
# -*- coding: utf-8 -*-

import re, string
from unidecode import unidecode

PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

class Fingerprinter(object):
    '''
    Python implementation of Google Refine fingerprinting algorithm described here:

## csvjoin.py
import csv, os

# This chunk iterates through all of the csv files in a directory, turns them
# into 2-dimensional arrays (lists of lists), and puts all those arrays into
# a list called "tables"

tables = []

# Loop over all files in the current directory (which is what "." means)
for f in os.listdir('.'):

## dates-output.json
{
  "took": 14,
  "timed_out": false,
  "_shards": {
    "total": 6,
    "successful": 6,
    "failed": 0
  },
  "hits": {
    "total": 1419,

## company-detail.json
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 6,
    "successful": 6,
    "failed": 0
  },
  "hits": {
    "total": 1,

## cluster.py
'''
cluster.py

Uses the Hamming distance between perceptual hashes to surface near-duplicate
images.

To install and run:

1. pip install imagehash
2. Put some .dat files in a folder someplace (script assumes ./data/imgs/*.dat)

## scraping_solution.py
import csv, mechanize
from bs4 import BeautifulSoup

# Get the output file ready
# datafile = open('output.csv', 'w')
# writer = csv.writer(datafile)

br = mechanize.Browser()
br.open('http://enr.sos.mo.gov/EnrNet/CountyResults.aspx')

## virtualenv.txt
sudo pip install virtualenvwrapper
export WORKON_HOME=~/Envs
mkdir -p $WORKON_HOME
source /usr/local/bin/virtualenvwrapper.sh
echo 'export WORKON_HOME=$HOME/Envs; source /usr/local/bin/virtualenvwrapper.sh' >> ~/.bash_profile
mkvirtualenv dataj

pip install jupyter
pip install agate
pip install WHATEVER_ELSE

## s3count.md

      
1 file · 1 fork · 0 comments · 7 stars
cjdd3b / s3count.md — Last active June 18, 2020 18:31

How to count files in an S3 bucket
              
          
Counting files in S3 buckets and folders is harder than it should be. But here's a way to get it done using `s3cmd`:

1. Install s3cmd:
   - On Mac, run `brew install s3cmd`.
   - On Windows, go here (the link target was lost when this page was extracted; see the s3cmd project site).
2. From the command line, run `s3cmd --configure`.
3. Add your credentials when prompted.


## data-journalism-software.md

      
1 file · 0 forks · 0 comments · 0 stars
cjdd3b / data-journalism-software.md — Last active August 31, 2016 11:52

Software installation guide for Mizzou's Advanced Data Journalism course, Fall 2016.
              
          
### Advanced Data Journalism (J4432) software requirements

Below is a list of the key software you'll need for class, along with some resources offering tips about how to get it installed.

#### Text editor

A good programming text editor will help you organize your code, catch typos and generally make your life a lot easier. We recommend Sublime Text 2, which you can easily download and install from their website.

#### Terminal client


## strib-suicides.txt
Year	Suicides
1981-01-01	442
1982-01-01	470
1983-01-01	444
1984-01-01	443
1985-01-01	459
1986-01-01	541
1987-01-01	546
1988-01-01	488
1989-01-01	515
	# -*- coding: utf-8 -*-

	import re, string
	from unidecode import unidecode

	PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

	class Fingerprinter(object):
	'''
	Python implementation of Google Refine fingerprinting algorithm described here:
	import csv, os

	# This chunk iterates through all of the csv files in a directory, turns them
	# into 2-dimensional arrays (lists of lists), and puts all those arrays into
	# a list called "tables"

	tables = []

	# Loop over all files in the current directory (which is what "." means)
	for f in os.listdir('.'):
	{
	"took": 14,
	"timed_out": false,
	"_shards": {
	"total": 6,
	"successful": 6,
	"failed": 0
	},
	"hits": {
	"total": 1419,
	{
	"took": 2,
	"timed_out": false,
	"_shards": {
	"total": 6,
	"successful": 6,
	"failed": 0
	},
	"hits": {
	"total": 1,
	'''
	cluster.py

	Uses the Hamming distance between perceptual hashes to surface near-duplicate
	images.

	To install and run:

	1. pip install imagehash
	2. Put some .dat files in a folder someplace (script assumes ./data/imgs/*.dat)
	import csv, mechanize
	from bs4 import BeautifulSoup

	# Get the output file ready
	# datafile = open('output.csv', 'w')
	# writer = csv.writer(datafile)

	br = mechanize.Browser()
	br.open('http://enr.sos.mo.gov/EnrNet/CountyResults.aspx')
	sudo pip install virtualenvwrapper
	export WORKON_HOME=~/Envs
	mkdir -p $WORKON_HOME
	source /usr/local/bin/virtualenvwrapper.sh
	echo 'export WORKON_HOME=$HOME/Envs; source /usr/local/bin/virtualenvwrapper.sh' >> ~/.bash_profile
	mkvirtualenv dataj

	pip install jupyter
	pip install agate
	pip install WHATEVER_ELSE
	Year	Suicides
	1981-01-01	442
	1982-01-01	470
	1983-01-01	444
	1984-01-01	443
	1985-01-01	459
	1986-01-01	541
	1987-01-01	546
	1988-01-01	488
	1989-01-01	515