Skip to content

Instantly share code, notes, and snippets.

View MortarToElk
register '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=fbo/awards');
-- More code here
B = FOREACH joinedActiveDetails GENERATE
CONCAT(SUBSTRING(postedDate,0,10),'T12:30:00-05:00') as searchPostedDate,
classCode as searchClassCode,
naicsCode as searchNaicsCode,
agencyName as searchAgencyName,
@davidfauth
davidfauth / mortarDocGraphEnhanced
Created Apr 1, 2014
Pig job to create the DocGraph structure
View mortarDocGraphEnhanced
-- Project npiData down to five output fields. Every source column is passed
-- through REPLACE with the backslash-quote pattern and an empty replacement
-- (presumably to strip literal double-quote characters from the raw values --
-- TODO confirm the regex escaping against a sample row).
--   npiRX       <- NPI
--   NPIState    <- Provider_Business_Mailing_Address_State_Name
--   NPITaxonomy <- Healthcare_Provider_Taxonomy_Code_1
--   NPIOrgName  <- Provider_Organization_Name_Legal_Business_Name
--   docName     <- cleaned Provider_First_Name, a single space, then cleaned
--                  Provider_Last_Name_Legal_Name (built with nested CONCAT)
filteredNPIData = FOREACH npiData GENERATE
REPLACE(NPI, '\\"','') as npiRX,
REPLACE(Provider_Business_Mailing_Address_State_Name, '\\"','') as NPIState,
REPLACE(Healthcare_Provider_Taxonomy_Code_1, '\\"','') as NPITaxonomy,
REPLACE(Provider_Organization_Name_Legal_Business_Name, '\\"','') as NPIOrgName,
CONCAT(CONCAT(REPLACE(Provider_First_Name,'\\"',''),' '), REPLACE(Provider_Last_Name_Legal_Name,'\\"','')) as docName;
-- Join docGraphRXData to the cleaned NPI projection, matching
-- docGraphRXData.referringDoc against filteredNPIData.npiRX. No join modifier
-- is given, so this is the default (inner) join: rows without a match on
-- either side are dropped.
joinReferred = JOIN docGraphRXData BY referringDoc, filteredNPIData by npiRX;
@davidfauth
davidfauth / sampleBitcoinTransaction.json
Created Feb 2, 2015
Bitcoin Transactions for 1G541ENwQBqG3WZgvYtVCojVgdHFpJ8RXs
View sampleBitcoinTransaction.json
{
"hash160":"a54e0ee6071328dc58c8c37a4e974c4816364f24",
"address":"1G541ENwQBqG3WZgvYtVCojVgdHFpJ8RXs",
"n_tx":25,
"total_received":3393980000,
"total_sent":3393980000,
"final_balance":0,
"txs":[{
"ver":1,
"inputs":[
@davidfauth
davidfauth / bitcoinAddressDetails
Created Feb 2, 2015
Bitcoin Transactions for an address
View bitcoinAddressDetails
{
"hash160":"a54e0ee6071328dc58c8c37a4e974c4816364f24",
"address":"1G541ENwQBqG3WZgvYtVCojVgdHFpJ8RXs",
"n_tx":25,
"total_received":3393980000,
"total_sent":3393980000,
"final_balance":0,
"txs":[{
"ver":1,
"inputs":[
View Neo4jHive.java
package com.neo4j.hadoop.example;
import org.codehaus.jackson.map.ObjectMapper;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.schema.Schema;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.tooling.GlobalGraphOperations;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
@davidfauth
davidfauth / mortarPitElasticsearch.pig
Last active Dec 26, 2015
Mortar Pigscript outputting to Elasticsearch
View mortarPitElasticsearch.pig
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
REGISTER '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';
define ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=govtrack/bills');
bills = LOAD '/Users/davidfauth/MortarData/'
@davidfauth
davidfauth / jsonSingleLine.java
Created Oct 24, 2013
Convert JSON to a single line
View jsonSingleLine.java
package jsonFormatter;
import java.io.*;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
View nltk.py
from pig_util import outputSchema
import nltk
@outputSchema("top_five:bag{t:(bigram:chararray)}")
def top_5_bigrams(tweets):
tokenized_tweets = [ nltk.tokenize.WhitespaceTokenizer().tokenize(t[0]) for t in tweets ]
bgm = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
top_5 = finder.nbest(bgm.likelihood_ratio, 5)
View mortarNeo4JExample
-- 'Document' is the delimiter
-- 'event, gathering' is the tag list

-- Default values for the script parameters. Each %default applies only when
-- the parameter is not supplied at launch time.
-- Two are S3 (s3n://) locations and the rest are local paths under /Users/davidfauth.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
%default S3_OUTPUT_PATH 's3n://df-bills-project'
%default S3_INPUT_PATH 's3n://df-bills-data'
%default INPUT_PATH '/Users/davidfauth/MortarNeoTestData'
%default BULK_INPUT_PATH '/Users/davidfauth/MortarTestDataBulk'
-- Register the two Python UDF files through streaming_python. Their functions
-- are then referenced under the nltk_udfs and utility_udfs namespaces.
-- NOTE(review): both paths are absolute and machine-specific -- confirm they
-- resolve wherever this script is run.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/utilities.py' USING streaming_python AS utility_udfs;
@davidfauth
davidfauth / fbo_analysis_gist
Last active Jan 2, 2016
fbo contracts analysis
View fbo_analysis_gist
/**
* FBO_Data
*/
%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
/**