@dannguyen
Last active November 22, 2016 06:55
A shell script to clean up the NYPD stop and frisk data (2003 to 2013) and upload it to BigQuery, plus an R script to chart monthly stops by race.
mkdir -p stop-and-frisk
cd stop-and-frisk
# This zip contains the yearly data files, which I've already converted from
# the original SAS format into CSV files
curl http://stash.compciv.org.s3.amazonaws.com/nypd-stop-and-frisk/2003-2013.zip -o data.zip && unzip data.zip
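# (optional check, my addition): confirm that all 11 yearly CSVs
# (2003.csv through 2013.csv) unzipped successfully
ls -l 20??.csv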
# Get the common fields, i.e. the header fields that appear in all 11 files
for y in $(seq 2003 2013); do head -n 1 $y.csv; done | grep -oE '[^,]+' | sort | uniq -c | grep -E '\b11\b'
# Get the fields that are not common:
for y in $(seq 2003 2013); do head -n 1 $y.csv; done | grep -oE '[^,"]+' | sort | uniq -c | grep -v '\b11\b' | sort
# 1 adrnum
# 1 adrpct
# 1 detail1_
# 1 details_
# 1 dettyp_c
# 1 premtyp
# 1 prenam
# 1 rescod
# 1 strintr
# 1 strname
# 1 wepfound
# 3 forceuse
# 10 addrnum
# 10 addrpct
# 10 detailcm
# 10 dettypcm
# 10 linecm
# 10 premname
# 10 premtype
# 10 rescode
# 10 stinter
# 10 stname
#
# 2006.csv has all of the weird things:
#   - detail1_ and details_ are exact copies of each other
#   - it has no linecm field
#   - it is the only year with a wepfound field, so we drop that column
#   - several field names use nonstandard spellings (adrnum, adrpct, etc.),
#     which we rename below to match the other years
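# (optional check, my addition): compare raw column counts per year --
# 2006 is the one that stands out (this assumes no header names contain commas)
for y in $(seq 2003 2013); do
  echo -n "$y: "
  head -n 1 $y.csv | awk -F, '{print NF}'
done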
mv 2006.csv 2006.csv.old
head -n 1 2006.csv.old | csvfix exclude -f 80,83 | sed -e s/adrnum/addrnum/ -e s/adrpct/addrpct/ -e s/details_/detailcm/ \
-e s/dettyp_c/dettypcm/ -e s/premtyp/premtype/ -e s/prenam/premname/ -e s/rescod/rescode/ -e s/strintr/stinter/ -e s/strname/stname/ |\
csvfix put -v "linecm" | csvfix put -v 'forceuse' > 2006.csv
tail -n +2 2006.csv.old | csvfix exclude -f 80,83 | csvfix put -v "" | csvfix put -v "" >> 2006.csv
# pad the remaining pre-2011 tables (2006 was handled above) with an empty forceuse column
for y in 2003 2004 2005 2007 2008 2009 2010; do
  echo $y
  mv $y.csv $y.csv.old
  head -n 1 $y.csv.old | csvfix put -v 'forceuse' > $y.csv
  tail -n +2 $y.csv.old | csvfix put -v "" >> $y.csv
done
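# (optional check, my addition): each padded header should now contain forceuse
for y in 2003 2004 2005 2007 2008 2009 2010; do
  echo -n "$y forceuse columns: "
  head -n 1 $y.csv | grep -o 'forceuse' | wc -l
done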
# now merge all the files into one big file, using 2013's header as the common field order
common_headers=$(csvfix echo -smq 2013.csv | head -n 1)
echo $common_headers > 2003-2013.csv
for y in $(seq 2003 2013); do
  echo "Adding $y"
  # order each year's columns to match 2013's header, drop the header row,
  # trim whitespace, then rewrite datestop (MMDDYYYY or MDDYYYY -> YYYY-MM-DD)
  # and pad timestop to HH:MM (fields 4 and 5 in this column order)
  csvfix order -fn "$common_headers" $y.csv | sed 1d | csvfix trim -l -t |
    csvfix edit -f 4 -e 's/\([0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-\1-\2/' \
                     -e 's/\([0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-0\1-\2/' |
    csvfix edit -f 5 -e 's/\([0-9][0-9]\):?\([0-9][0-9]\)/\1:\2/' \
                     -e 's/^\([0-9]\):?\([0-9][0-9]\)/0\1:\2/' \
                     -e 's/^\([0-9][0-9]\)$/00:\1/' \
                     -e 's/^\([0-9]\)$/00:0\1/' >> 2003-2013.csv
done
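# (optional check, my addition): the merged file should have one header row
# plus the combined data rows of the 11 yearly files
wc -l 2003-2013.csv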
# now make a copy of the merged file with the datestop/timestop fields normalized
export LC_CTYPE=C
export LANG=C
# http://stackoverflow.com/questions/19242275/re-error-illegal-byte-sequence-on-mac-os-x
# fixing the datestop and timestop fields
csvfix trim -l -t 2003-2013.csv |
  csvfix edit -f 4 -e 's/\([0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-\1-\2/' \
                   -e 's/\([0-9]\)\([0-9][0-9]\)\([0-9][0-9][0-9][0-9]\)/\3-0\1-\2/' |
  csvfix edit -f 5 -e 's/\([0-9][0-9]\):?\([0-9][0-9]\)/\1:\2/' \
                   -e 's/^\([0-9]\):?\([0-9][0-9]\)/0\1:\2/' \
                   -e 's/^\([0-9][0-9]\)$/00:\1/' \
                   -e 's/^\([0-9]\)$/00:0\1/' > 2003-2013.timefixed.csv
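# (optional check, my addition): spot-check the normalized date and time fields
# (fields 4 and 5 are datestop and timestop in this column order)
csvfix order -f 4,5 2003-2013.timefixed.csv | head -n 5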
# Now upload to Google Cloud Storage and load into BigQuery
gsutil -m cp 2003-2013.csv gs://nypd-stop-and-frisk/2003-2013.csv
# the schema below lists 2013's field names, since 2013 has the most fields;
# with no types specified, every column is loaded as STRING
bq load --project_id='nypd-data' --skip_leading_rows=1 'nypd-data:nypd.stop_and_frisk' gs://nypd-stop-and-frisk/2003-2013.csv year,pct,ser_num,datestop,timestop,recstat,inout,trhsloc,perobs,crimsusp,perstop,typeofid,explnstp,othpers,arstmade,arstoffn,sumissue,sumoffen,compyear,comppct,offunif,officrid,frisked,searched,contrabn,adtlrept,pistol,riflshot,asltweap,knifcuti,machgun,othrweap,pf_hands,pf_wall,pf_grnd,pf_drwep,pf_ptwep,pf_baton,pf_hcuff,pf_pepsp,pf_other,radio,ac_rept,ac_inves,rf_vcrim,rf_othsw,ac_proxm,rf_attir,cs_objcs,cs_descr,cs_casng,cs_lkout,rf_vcact,cs_cloth,cs_drgtr,ac_evasv,ac_assoc,cs_furtv,rf_rfcmp,ac_cgdir,rf_verbl,cs_vcrim,cs_bulge,cs_other,ac_incid,ac_time,rf_knowl,ac_stsnd,ac_other,sb_hdobj,sb_outln,sb_admis,sb_other,repcmd,revcmd,rf_furt,rf_bulg,offverb,offshld,forceuse,sex,race,dob,age,ht_feet,ht_inch,weight,haircolr,eyecolor,build,othfeatr,addrtyp,rescode,premtype,premname,addrnum,stname,stinter,crossst,aptnum,city,state,zip,addrpct,sector,beat,post,xcoord,ycoord,dettypcm,linecm,detailcm
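# (optional check, my addition): run a quick aggregate query against the new
# table, using the same legacy-SQL table reference style as above
bq query --project_id='nypd-data' \
  'SELECT year, COUNT(*) AS stops FROM [nypd-data:nypd.stop_and_frisk] GROUP BY year ORDER BY year'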
# Analysis in R: query the BigQuery table and chart monthly stops by race
library('bigrquery')
library('dplyr')
library('ggplot2')
library('scales')   # provides the comma() label formatter used in the plot below
project_id = 'nypd-data'
sql = "select datestop, race, count(*) AS stops from [nypd-data:nypd.stop_and_frisk] group by datestop, race"
data <- query_exec(sql, project = project_id, max_pages = Inf)
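# (optional, my addition): quick look at what the query returned before reshaping
head(data)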
datax <- filter(data, datestop != "") %>%
  mutate(date = as.POSIXct(datestop, tz = 'UTC', origin = "1970-01-01")) %>%
  mutate(year = format(date, '%Y')) %>%
  mutate(month = format(date, '%m')) %>%
  mutate(general_race = ifelse(race == 'B', 'Black',
                        ifelse(race == 'Q' | race == 'P', 'Hispanic', 'Other'))) %>%
  group_by(year, month, general_race) %>%
  summarize(total_stops = sum(stops)) %>%
  mutate(yearmonthday = as.POSIXct(paste(year, month, '01', sep = '-'), tz = 'UTC', origin = '1970-01-01')) %>%
  filter(year >= 2003) # ugly hack, I know
ggplot(datax, aes(x = yearmonthday, y = total_stops, fill = factor(general_race))) +
  geom_bar(stat = "identity", colour = "gray") + scale_fill_brewer() + geom_smooth() +
  facet_grid(general_race ~ .) +
  theme_minimal() +
  theme(legend.position = "none", text = element_text(family = "Gill Sans"),
        plot.title = element_text(size = 20)) +
  ggtitle(expression(atop("NYPD Stop and Frisk data",
                          atop(italic("By Dan Nguyen / Stanford Journalism"), "http://bit.ly/nypdstopnfrisk")))) +
  ylab("Stops per month") + xlab("") + scale_y_continuous(labels = comma)