Skip to content

Instantly share code, notes, and snippets.

register '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=fbo/awards');
-- More code here
B = FOREACH joinedActiveDetails GENERATE
CONCAT(SUBSTRING(postedDate,0,10),'T12:30:00-05:00') as searchPostedDate,
classCode as searchClassCode,
naicsCode as searchNaicsCode,
agencyName as searchAgencyName,
@davidfauth
davidfauth / mortarDocGraphEnhanced
Created April 1, 2014 21:32
Pig job to create the DocGraph structure
-- Project the provider fields needed from npiData, renaming each to a shorter alias.
-- Every field is passed through REPLACE with the regex '\\"' and an empty replacement,
-- which presumably strips literal double-quote characters left in the raw values
-- (TODO confirm against the input file's quoting).
-- docName is the cleaned first name and cleaned last name joined by a single space.
filteredNPIData = FOREACH npiData GENERATE
REPLACE(NPI, '\\"','') as npiRX,
REPLACE(Provider_Business_Mailing_Address_State_Name, '\\"','') as NPIState,
REPLACE(Healthcare_Provider_Taxonomy_Code_1, '\\"','') as NPITaxonomy,
REPLACE(Provider_Organization_Name_Legal_Business_Name, '\\"','') as NPIOrgName,
CONCAT(CONCAT(REPLACE(Provider_First_Name,'\\"',''),' '), REPLACE(Provider_Last_Name_Legal_Name,'\\"','')) as docName;
-- Join the DocGraph records to the filtered NPI data, matching referringDoc to npiRX,
-- so each record carries the NPI details of its referring provider.
-- JOIN without a modifier is an inner join: records whose referringDoc has no NPI match are dropped.
joinReferred = JOIN docGraphRXData BY referringDoc, filteredNPIData by npiRX;
@davidfauth
davidfauth / sampleBitcoinTransaction.json
Created February 2, 2015 14:43
Bitcoin Transactions for 1G541ENwQBqG3WZgvYtVCojVgdHFpJ8RXs
{
"hash160":"a54e0ee6071328dc58c8c37a4e974c4816364f24",
"address":"1G541ENwQBqG3WZgvYtVCojVgdHFpJ8RXs",
"n_tx":25,
"total_received":3393980000,
"total_sent":3393980000,
"final_balance":0,
"txs":[{
"ver":1,
"inputs":[
@davidfauth
davidfauth / bitcoinAddressDetails
Created February 2, 2015 14:51
Bitcoin Transactions for an address
{
"hash160":"a54e0ee6071328dc58c8c37a4e974c4816364f24",
"address":"1G541ENwQBqG3WZgvYtVCojVgdHFpJ8RXs",
"n_tx":25,
"total_received":3393980000,
"total_sent":3393980000,
"final_balance":0,
"txs":[{
"ver":1,
"inputs":[
@davidfauth
davidfauth / Neo4jHive.java
Created November 16, 2015 15:21
Neo4j Hive Example
package com.neo4j.hadoop.example;
import org.codehaus.jackson.map.ObjectMapper;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.schema.Schema;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.tooling.GlobalGraphOperations;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
@davidfauth
davidfauth / mortarPitElasticsearch.pig
Last active December 26, 2015 11:49
Mortar Pigscript outputting to Elasticsearch
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
REGISTER '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=govtrack/bills');
bills = LOAD '/Users/davidfauth/MortarData/'
@davidfauth
davidfauth / jsonSingleLine.java
Created October 24, 2013 22:54
Convert JSON to a single line
package jsonFormatter;
import java.io.*;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
from pig_util import outputSchema
import nltk
@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top_5_bigrams(tweets):
tokenized_tweets = [ nltk.tokenize.WhitespaceTokenizer().tokenize(t[0]) for t in tweets ]
bgm = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
top_5 = finder.nbest(bgm.likelihood_ratio, 5)
@davidfauth
davidfauth / mortarNeo4JExample
Created December 2, 2013 17:44
Mortar to Neo4J
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list
-- Default values for the script's parameters; %default supplies a value only when
-- the parameter is not provided at invocation time (e.g. via -param).
-- Local output directory.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
-- S3 bucket locations for output and input.
%default S3_OUTPUT_PATH 's3n://df-bills-project'
%default S3_INPUT_PATH 's3n://df-bills-data'
-- Local input directories (the names suggest a small test set and a bulk test set; verify contents).
%default INPUT_PATH '/Users/davidfauth/MortarNeoTestData'
%default BULK_INPUT_PATH '/Users/davidfauth/MortarTestDataBulk'
-- Register the Python UDF scripts; their functions are referenced in the script
-- through the nltk_udfs and utility_udfs namespaces respectively.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/utilities.py' USING streaming_python AS utility_udfs;
@davidfauth
davidfauth / fbo_analysis_gist
Last active January 2, 2016 08:28
FBO contracts analysis
/**
* FBO_Data
*/
-- Default values for the script's parameters; each %default applies only when the
-- parameter is not supplied at invocation time.
-- CSV file of FBO data (the file name indicates active records).
%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
-- Text file of FBO archive data (the file name suggests tab-separated 2012-2013 data; verify).
%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
-- Directory holding the Pig-ready FBO data files.
%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
-- NOTE(review): the output directory is named MortarBillsData, not an FBO-specific path;
-- confirm this is intentional rather than carried over from another script.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
/**