David Fauth davidfauth

## jsonSingleLine.java
package jsonFormatter;

import java.io.*;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;

## mortarPitElasticsearch.pig
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list

-- Default value for the OUTPUT_PATH parameter, used when none is supplied at launch.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
-- Register the Python UDF file to run through streaming_python; its functions are called as nltk_udfs.<name>.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
-- Register the elasticsearch-hadoop jar so the ESStorage class referenced below can be resolved.
REGISTER '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
-- Short alias for the Elasticsearch storage function, configured with es.resource=govtrack/bills.
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=govtrack/bills');


bills =  LOAD '/Users/davidfauth/MortarData/'

## nltk.py
from pig_util import outputSchema
import nltk

@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top_5_bigrams(tweets):
    """Score the bigrams in ``tweets`` and select the five best by likelihood ratio.

    ``tweets`` is iterated as a sequence of tuples; only the first field of
    each tuple (presumably the tweet text -- confirm against the caller) is
    tokenized.  The declared Pig output schema is a bag of single-field
    ``bigram`` tuples.
    """
    # Split the first field of every tuple on whitespace only.
    tokenized_tweets = [ nltk.tokenize.WhitespaceTokenizer().tokenize(t[0]) for t in tweets ]

    bgm    = nltk.collocations.BigramAssocMeasures()
    # Each tokenized tweet is handed to the finder as its own document.
    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
    # Keep the five highest-scoring bigrams under the likelihood-ratio measure.
    top_5  = finder.nbest(bgm.likelihood_ratio, 5)
    # NOTE(review): no return statement is visible in this excerpt.

## mortarNeo4JExample
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list

-- Default parameter values; each applies only when the parameter is not supplied at launch.
-- Local output directory.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
-- S3 locations (s3n:// scheme) for output and input.
%default S3_OUTPUT_PATH 's3n://df-bills-project'
%default S3_INPUT_PATH 's3n://df-bills-data'
-- Local input directories: a test data set and a bulk test data set.
%default INPUT_PATH '/Users/davidfauth/MortarNeoTestData'
%default BULK_INPUT_PATH '/Users/davidfauth/MortarTestDataBulk'
-- Register two Python UDF files to run through streaming_python, exposed as nltk_udfs.<name> and utility_udfs.<name>.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/utilities.py' USING streaming_python AS utility_udfs;

## fbo_analysis_gist
/**
 * FBO_Data
 *
 * Default parameter values for this script: a CSV file, a tab-named text
 * archive and a data directory under fbo_data/ as inputs, plus one output
 * directory.  Each default applies only when the parameter is not supplied
 * at launch.
 */

%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'

/**

## singleLineJson.java
package jsonFormatter;

import java.io.*;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;

## FBOPigLiveAnimalNLTK.pig
/*
 * Keep only award notices in class '88 -- Live animals' that carry a contract
 * award amount.  The same predicate is applied to the active data set and to
 * each fiscal-year relation.
 * NOTE(review): the earlier comment said "filter awards by NIH", but no agency
 * condition appears in these predicates - confirm the inputs are NIH-only.
 */
activeHasAward = FILTER active_data BY (
    noticeType == 'Award Notice'
    AND contractAwardAmount IS NOT NULL
    AND classCode == '88 -- Live animals'
);
fy0004HasAward = FILTER fy00_04_data BY (
    noticeType == 'Award Notice'
    AND contractAwardAmount IS NOT NULL
    AND classCode == '88 -- Live animals'
);
fy0507HasAward = FILTER fy05_07_data BY (
    noticeType == 'Award Notice'
    AND contractAwardAmount IS NOT NULL
    AND classCode == '88 -- Live animals'
);
fy0809HasAward = FILTER fy08_09_data BY (
    noticeType == 'Award Notice'
    AND contractAwardAmount IS NOT NULL
    AND classCode == '88 -- Live animals'
);
fy1011HasAward = FILTER fy10_11_data BY (
    noticeType == 'Award Notice'
    AND contractAwardAmount IS NOT NULL
    AND classCode == '88 -- Live animals'
);
fy1213HasAward = FILTER fy12_13_data BY (
    noticeType == 'Award Notice'
    AND contractAwardAmount IS NOT NULL
    AND classCode == '88 -- Live animals'
);


/*group Data */

## pythonBiGram.py
@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top5_bigrams(textDescription):
    """Return the five strongest bigrams found in a free-text description.

    The text is split into sentences, each sentence into word tokens, and the
    resulting bigrams are ranked by likelihood ratio.

    Args:
        textDescription: the text to analyse; may be None or empty.

    Returns:
        A list of one-element tuples, each holding a bigram as a single
        "word1 word2" string, matching the declared bag schema.  The list is
        empty when there is no text to analyse.
    """
    # A null field reaches a Pig Python UDF as None, which sent_tokenize
    # cannot handle; return an empty bag instead of failing the whole job.
    if not textDescription:
        return []

    sentences = nltk.tokenize.sent_tokenize(textDescription)
    # One token list per sentence, so each sentence is its own document.
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]

    bgm    = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokens)
    top_5  = finder.nbest(bgm.likelihood_ratio, 5)

    # Wrap every bigram in a one-field tuple so Pig sees bag{(chararray)}.
    return [("%s %s" % (first, second),) for first, second in top_5]

## MortarToElk
-- Register the elasticsearch-hadoop jar so the ESStorage class referenced below can be resolved.
register '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
-- Short alias for the Elasticsearch storage function, configured with es.resource=fbo/awards.
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=fbo/awards');

-- More code here

-- Project search-oriented fields from joinedActiveDetails.
-- searchPostedDate keeps the first 10 characters of postedDate and appends the
-- fixed time-and-offset suffix 'T12:30:00-05:00'.
-- NOTE(review): the field list continues beyond this excerpt (the last line ends with a comma).
B = FOREACH joinedActiveDetails GENERATE
CONCAT(SUBSTRING(postedDate,0,10),'T12:30:00-05:00') as searchPostedDate,
classCode as searchClassCode,
naicsCode as searchNaicsCode,
agencyName as searchAgencyName,

## mortarDocGraphEnhanced
-- Project the NPI fields used downstream, removing embedded double-quote
-- characters from every value.  docName is "<first name> <last name>".
filteredNPIData = FOREACH npiData GENERATE
    REPLACE(NPI, '\\"', '') AS npiRX,
    REPLACE(Provider_Business_Mailing_Address_State_Name, '\\"', '') AS NPIState,
    REPLACE(Healthcare_Provider_Taxonomy_Code_1, '\\"', '') AS NPITaxonomy,
    REPLACE(Provider_Organization_Name_Legal_Business_Name, '\\"', '') AS NPIOrgName,
    CONCAT(
        CONCAT(REPLACE(Provider_First_Name, '\\"', ''), ' '),
        REPLACE(Provider_Last_Name_Legal_Name, '\\"', '')
    ) AS docName;


-- Attach the NPI details to each row by matching its referringDoc to npiRX.
joinReferred = JOIN docGraphRXData BY referringDoc, filteredNPIData BY npiRX;
	package jsonFormatter;

	import java.io.*;
	import java.nio.file.FileVisitResult;
	import java.nio.file.FileVisitor;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.nio.file.Paths;
	import java.nio.file.SimpleFileVisitor;
	import java.nio.file.attribute.BasicFileAttributes;
	-- 'Document' is the delimiter
	-- 'event, gathering' is the tag list

	-- Default value for the OUTPUT_PATH parameter, used when none is supplied at launch.
	%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
	-- Register the Python UDF file to run through streaming_python; its functions are called as nltk_udfs.<name>.
	REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
	-- Register the elasticsearch-hadoop jar so the ESStorage class referenced below can be resolved.
	REGISTER '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
	-- Short alias for the Elasticsearch storage function, configured with es.resource=govtrack/bills.
	define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=govtrack/bills');


	bills = LOAD '/Users/davidfauth/MortarData/'
	from pig_util import outputSchema
	import nltk

	@outputSchema("top_five:bag{t:(bigram:chararray)}")
	def top_5_bigrams(tweets):
	    """Score the bigrams in ``tweets`` and select the five best by likelihood ratio.

	    ``tweets`` is iterated as a sequence of tuples; only the first field of
	    each tuple (presumably the tweet text -- confirm against the caller) is
	    tokenized.
	    """
	    # Split the first field of every tuple on whitespace only.
	    tokenized_tweets = [ nltk.tokenize.WhitespaceTokenizer().tokenize(t[0]) for t in tweets ]

	    bgm = nltk.collocations.BigramAssocMeasures()
	    # Each tokenized tweet is handed to the finder as its own document.
	    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
	    # Keep the five highest-scoring bigrams under the likelihood-ratio measure.
	    top_5 = finder.nbest(bgm.likelihood_ratio, 5)
	    # NOTE(review): no return statement is visible in this excerpt.
	/**
	* FBO_Data
	*
	* Default parameter values for this script: a CSV file, a tab-named text
	* archive and a data directory under fbo_data/ as inputs, plus one output
	* directory.  Each default applies only when the parameter is not supplied
	* at launch.
	*/

	%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
	%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
	%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
	%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'

	/**
	/*
	 * Keep only award notices in class '88 -- Live animals' that carry a contract
	 * award amount.  The same predicate is applied to the active data set and to
	 * each fiscal-year relation.
	 * NOTE(review): the earlier comment said "filter awards by NIH", but no agency
	 * condition appears in these predicates - confirm the inputs are NIH-only.
	 */
	activeHasAward = FILTER active_data BY (
	    noticeType == 'Award Notice'
	    AND contractAwardAmount IS NOT NULL
	    AND classCode == '88 -- Live animals'
	);
	fy0004HasAward = FILTER fy00_04_data BY (
	    noticeType == 'Award Notice'
	    AND contractAwardAmount IS NOT NULL
	    AND classCode == '88 -- Live animals'
	);
	fy0507HasAward = FILTER fy05_07_data BY (
	    noticeType == 'Award Notice'
	    AND contractAwardAmount IS NOT NULL
	    AND classCode == '88 -- Live animals'
	);
	fy0809HasAward = FILTER fy08_09_data BY (
	    noticeType == 'Award Notice'
	    AND contractAwardAmount IS NOT NULL
	    AND classCode == '88 -- Live animals'
	);
	fy1011HasAward = FILTER fy10_11_data BY (
	    noticeType == 'Award Notice'
	    AND contractAwardAmount IS NOT NULL
	    AND classCode == '88 -- Live animals'
	);
	fy1213HasAward = FILTER fy12_13_data BY (
	    noticeType == 'Award Notice'
	    AND contractAwardAmount IS NOT NULL
	    AND classCode == '88 -- Live animals'
	);


	/*group Data */
	@outputSchema("top_five:bag{t:(bigram:chararray)}")
	def top5_bigrams(textDescription):
	    """Return the five strongest bigrams found in a free-text description.

	    The text is split into sentences, each sentence into word tokens, and the
	    resulting bigrams are ranked by likelihood ratio.

	    Args:
	        textDescription: the text to analyse; may be None or empty.

	    Returns:
	        A list of one-element tuples, each holding a bigram as a single
	        "word1 word2" string, matching the declared bag schema.  The list is
	        empty when there is no text to analyse.
	    """
	    # A null field reaches a Pig Python UDF as None, which sent_tokenize
	    # cannot handle; return an empty bag instead of failing the whole job.
	    if not textDescription:
	        return []

	    sentences = nltk.tokenize.sent_tokenize(textDescription)
	    # One token list per sentence, so each sentence is its own document.
	    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]

	    bgm = nltk.collocations.BigramAssocMeasures()
	    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokens)
	    top_5 = finder.nbest(bgm.likelihood_ratio, 5)

	    # Wrap every bigram in a one-field tuple so Pig sees bag{(chararray)}.
	    return [("%s %s" % (first, second),) for first, second in top_5]
	-- Register the elasticsearch-hadoop jar so the ESStorage class referenced below can be resolved.
	register '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
	-- Short alias for the Elasticsearch storage function, configured with es.resource=fbo/awards.
	define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=fbo/awards');

	-- More code here

	-- Project search-oriented fields from joinedActiveDetails.
	-- searchPostedDate keeps the first 10 characters of postedDate and appends the
	-- fixed time-and-offset suffix 'T12:30:00-05:00'.
	-- NOTE(review): the field list continues beyond this excerpt (the last line ends with a comma).
	B = FOREACH joinedActiveDetails GENERATE
	CONCAT(SUBSTRING(postedDate,0,10),'T12:30:00-05:00') as searchPostedDate,
	classCode as searchClassCode,
	naicsCode as searchNaicsCode,
	agencyName as searchAgencyName,
	-- Project the NPI fields used downstream, removing embedded double-quote
	-- characters from every value.  docName is "<first name> <last name>".
	filteredNPIData = FOREACH npiData GENERATE
	    REPLACE(NPI, '\\"', '') AS npiRX,
	    REPLACE(Provider_Business_Mailing_Address_State_Name, '\\"', '') AS NPIState,
	    REPLACE(Healthcare_Provider_Taxonomy_Code_1, '\\"', '') AS NPITaxonomy,
	    REPLACE(Provider_Organization_Name_Legal_Business_Name, '\\"', '') AS NPIOrgName,
	    CONCAT(
	        CONCAT(REPLACE(Provider_First_Name, '\\"', ''), ' '),
	        REPLACE(Provider_Last_Name_Legal_Name, '\\"', '')
	    ) AS docName;


	-- Attach the NPI details to each row by matching its referringDoc to npiRX.
	joinReferred = JOIN docGraphRXData BY referringDoc, filteredNPIData BY npiRX;