Skip to content

Instantly share code, notes, and snippets.

@davidfauth
davidfauth / jsonSingleLine.java
Created October 24, 2013 22:54
convert json to single line
package jsonFormatter;
import java.io.*;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
@davidfauth
davidfauth / mortarPitElasticsearch.pig
Last active December 26, 2015 11:49
Mortar Pigscript outputting to Elasticsearch
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list
-- Default value for the OUTPUT_PATH parameter.
-- NOTE(review): this and the REGISTER paths below are absolute paths under one user's home directory; confirm before running elsewhere.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
-- Register the Python UDF file as streaming_python; its functions are referenced through the nltk_udfs namespace.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
-- Register the elasticsearch-hadoop jar (presumably the provider of the ESStorage class defined next; verify).
REGISTER '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
-- Define ESStorage as a short alias for the storage function, configured with es.resource=govtrack/bills.
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=govtrack/bills');
bills = LOAD '/Users/davidfauth/MortarData/'
from pig_util import outputSchema
import nltk
@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top_5_bigrams(tweets):
tokenized_tweets = [ nltk.tokenize.WhitespaceTokenizer().tokenize(t[0]) for t in tweets ]
bgm = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
top_5 = finder.nbest(bgm.likelihood_ratio, 5)
@davidfauth
davidfauth / mortarNeo4JExample
Created December 2, 2013 17:44
Mortar to Neo4J
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list
-- Default values for the script's path parameters: two local output/input directories,
-- two S3 (s3n://) locations, and a local bulk-input directory.
-- NOTE(review): the local paths are absolute paths under one user's home directory; confirm before running elsewhere.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
%default S3_OUTPUT_PATH 's3n://df-bills-project'
%default S3_INPUT_PATH 's3n://df-bills-data'
%default INPUT_PATH '/Users/davidfauth/MortarNeoTestData'
%default BULK_INPUT_PATH '/Users/davidfauth/MortarTestDataBulk'
-- Register two Python UDF files as streaming_python, exposed under the nltk_udfs and utility_udfs namespaces.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/utilities.py' USING streaming_python AS utility_udfs;
@davidfauth
davidfauth / fbo_analysis_gist
Last active January 2, 2016 08:28
fbo contracts analysis
/**
* FBO_Data
*
* Default values for the script's path parameters: a CSV of active FBO data,
* a tab-delimited archive file (name suggests 2012-13; confirm), the directory
* holding the Pig input data, and an output directory.
* NOTE(review): all are absolute paths under one user's home directory.
*/
%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
/**
@davidfauth
davidfauth / singleLineJson.java
Created January 9, 2014 17:33
convert multi-line json to single line json
package jsonFormatter;
import java.io.*;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
@davidfauth
davidfauth / FBOPigLiveAnimalNLTK.pig
Created January 21, 2014 15:46
Pig code for live animals award description
/*
 * Keep only award notices for live animals.
 * The same predicate is applied to each input relation (active data plus the
 * fy00_04 .. fy12_13 relations): noticeType is 'Award Notice', contractAwardAmount
 * is not null, and classCode is '88 -- Live animals'.
 * NOTE(review): the original comment said "filter awards by NIH", but no agency
 * condition appears in these filters.
 */
activeHasAward = FILTER active_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL AND classCode == '88 -- Live animals');
fy0004HasAward = FILTER fy00_04_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL AND classCode == '88 -- Live animals');
fy0507HasAward = FILTER fy05_07_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL AND classCode == '88 -- Live animals');
fy0809HasAward = FILTER fy08_09_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL AND classCode == '88 -- Live animals');
fy1011HasAward = FILTER fy10_11_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL AND classCode == '88 -- Live animals');
fy1213HasAward = FILTER fy12_13_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL AND classCode == '88 -- Live animals');
/*group Data */
@davidfauth
davidfauth / pythonBiGram.py
Created January 21, 2014 15:51
Python utility to Tokenize data and write out the top-5 bigrams
@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top5_bigrams(textDescription):
    """Return up to five bigrams from a text, ranked by likelihood ratio.

    The text is split into sentences, each sentence into word tokens, and
    the resulting token lists are fed to an NLTK bigram collocation finder.

    Args:
        textDescription: The text to analyse. ``None`` or an empty string
            produces an empty result instead of raising inside the tokenizer.

    Returns:
        A list of one-element tuples, each holding a bigram formatted as
        ``"first second"``, matching the bag-of-tuples output schema declared
        in the decorator.
    """
    # A missing value cannot be tokenized; return an empty bag, which is
    # also what an empty string yields. (Presumably null fields arrive as
    # None from the caller -- confirm.)
    if not textDescription:
        return []

    sentences = nltk.tokenize.sent_tokenize(textDescription)
    tokens = [nltk.tokenize.word_tokenize(sentence) for sentence in sentences]

    measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokens)
    top_5 = finder.nbest(measures.likelihood_ratio, 5)

    # Each result must be a tuple so it maps onto the schema's tuple type.
    return [("%s %s" % (first, second),) for first, second in top_5]
-- Register the elasticsearch-hadoop jar (presumably the provider of the ESStorage class defined next; verify).
-- NOTE(review): absolute path under one user's home directory; confirm before running elsewhere.
register '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
-- Define ESStorage as a short alias for the storage function, configured with es.resource=fbo/awards.
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=fbo/awards');
-- More code here
B = FOREACH joinedActiveDetails GENERATE
CONCAT(SUBSTRING(postedDate,0,10),'T12:30:00-05:00') as searchPostedDate,
classCode as searchClassCode,
naicsCode as searchNaicsCode,
agencyName as searchAgencyName,
@davidfauth
davidfauth / mortarDocGraphEnhanced
Created April 1, 2014 21:32
Pig job to create the DocGraph structure
-- Project the NPI fields needed downstream, renaming each one.
-- Every field is passed through REPLACE with the pattern '\\"' and an empty replacement
-- (presumably stripping embedded double-quote characters; confirm against the raw data).
-- docName is built as first name, a single space, then last name.
filteredNPIData = FOREACH npiData GENERATE
REPLACE(NPI, '\\"','') as npiRX,
REPLACE(Provider_Business_Mailing_Address_State_Name, '\\"','') as NPIState,
REPLACE(Healthcare_Provider_Taxonomy_Code_1, '\\"','') as NPITaxonomy,
REPLACE(Provider_Organization_Name_Legal_Business_Name, '\\"','') as NPIOrgName,
CONCAT(CONCAT(REPLACE(Provider_First_Name,'\\"',''),' '), REPLACE(Provider_Last_Name_Legal_Name,'\\"','')) as docName;
-- Join the referral data to the filtered NPI data, matching referringDoc against npiRX,
-- to attach the NPI information to each referral.
joinReferred = JOIN docGraphRXData BY referringDoc, filteredNPIData by npiRX;