Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@soobrosa
Created November 20, 2015 20:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save soobrosa/4adf89ce197eb6299eb9 to your computer and use it in GitHub Desktop.
Save soobrosa/4adf89ce197eb6299eb9 to your computer and use it in GitHub Desktop.
Is Yelp international?
# Download the Yelp academic dataset archive into source/.
# source.downloaded is a stamp file: it is touched only after curl succeeds,
# so later runs skip the download instead of fetching the archive again.
source.downloaded:
	mkdir -p source
	# -f: fail on HTTP errors rather than saving an error page as the zip;
	# -L: follow redirects; -o: write straight to the path (no cd needed).
	curl -fL -o source/yelp_dataset_challenge_academic_dataset.zip "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/yelp_dataset_challenge_academic_dataset.zip"
	touch $@
# Unpack the downloaded archive so the JSON files sit directly in source/,
# which is where every later rule reads them from.
# source.decompressed is a stamp file touched after a successful unzip.
source.decompressed: source.downloaded
	# -d source: extract into source/ instead of the current directory.
	# -j: drop any directory structure stored in the archive (its internal
	#     layout is not visible from this Makefile -- TODO confirm) so the
	#     files land at source/yelp_academic_dataset_*.json.
	# -o: overwrite without prompting, so re-runs are non-interactive.
	unzip -o -j source/yelp_dataset_challenge_academic_dataset.zip -d source
	touch $@
#
# one record pretty printed from each file composed of lines of JSONs
#
# Takes the first line (one JSON record) of each dataset file and writes it,
# pretty-printed by Python's json.tool, to example/<name>.json.
# "examples" names a task, not a file, hence .PHONY.
.PHONY: examples
examples: source.decompressed
	mkdir -p example
	head -n 1 source/yelp_academic_dataset_business.json | python -m json.tool > example/business.json
	head -n 1 source/yelp_academic_dataset_review.json | python -m json.tool > example/review.json
	head -n 1 source/yelp_academic_dataset_user.json | python -m json.tool > example/user.json
	head -n 1 source/yelp_academic_dataset_checkin.json | python -m json.tool > example/checkin.json
	head -n 1 source/yelp_academic_dataset_tip.json | python -m json.tool > example/tip.json
#
# $ brew install coreutils
#
# a random 10k sample from all files for quickies
#
# gshuf comes from the coreutils install shown above. Each line below draws
# up to 10000 random lines (one JSON record per line) from a dataset file
# into sample/<name>.json. The draw is random, so every run of this target
# produces a different sample.
# "samples" names a task, not a file, hence .PHONY.
.PHONY: samples
samples: source.decompressed
	mkdir -p sample
	gshuf -n 10000 source/yelp_academic_dataset_business.json > sample/business.json
	gshuf -n 10000 source/yelp_academic_dataset_review.json > sample/review.json
	gshuf -n 10000 source/yelp_academic_dataset_user.json > sample/user.json
	gshuf -n 10000 source/yelp_academic_dataset_checkin.json > sample/checkin.json
	gshuf -n 10000 source/yelp_academic_dataset_tip.json > sample/tip.json
#
# https://github.com/jehiah/json2csv
#
# http://stedolan.github.io/jq/
#
# flatten all files to tables lazily
#
# Turns the sampled JSON-lines files in sample/ into CSV tables under
# sample_flattened/. Flat fields go straight through json2csv; nested
# objects and arrays (votes, friends, compliments, hours, checkin_info) are
# first exploded by jq into one JSON object per output row.
#
# Notes:
#  * Inputs are passed as "cmd < file" or "jq ... file". The form
#    "< file | cmd" only acts like cat in zsh; under make's default /bin/sh
#    it gives cmd an empty stdin.
#  * Depends on "samples" because that rule creates the sample/*.json inputs.
#    samples re-draws randomly on each run, so the CSVs always match the
#    freshly drawn sample.
#  * The target shares its name with the output directory, so it is declared
#    .PHONY to keep it running once that directory exists.
.PHONY: sample_flattened
sample_flattened: samples
	mkdir -p sample_flattened
	json2csv -p=true -k business_id,date,likes,text,user_id < sample/tip.json > sample_flattened/tip.csv
	json2csv -p=true -k business_id,date,likes,user_id < sample/tip.json > sample_flattened/tip_no_text.csv
	json2csv -p=true -k business_id,date,review_id,stars,text,user_id < sample/review.json > sample_flattened/review_compact.csv
	json2csv -p=true -k business_id,date,review_id,stars,user_id < sample/review.json > sample_flattened/review_compact_no_text.csv
	jq -c '{review_id, a: .votes | to_entries[]}' sample/review.json | jq -c '{review_id: .review_id, key: .a.key, value: .a.value}' | json2csv -p=true -k review_id,key,value > sample_flattened/review_votes.csv
	jq -c '{average_stars, fans, friends: .friends | length, name, review_count, user_id, yelping_since}' sample/user.json | json2csv -p=true -k average_stars,fans,friends,name,review_count,user_id,yelping_since > sample_flattened/user_compact.csv
	jq -c '{user_id, friend: .friends[]}' sample/user.json | json2csv -p=true -k user_id,friend > sample_flattened/user_friends.csv
	jq -c '{user_id, a: .compliments | to_entries[]}' sample/user.json | jq -c '{user_id: .user_id, key: .a.key, value: .a.value}' | json2csv -p=true -k user_id,key,value > sample_flattened/user_compliments.csv
	jq -c '{business_id, category_main: .categories[0], category_sub: .categories[1], city, latitude, longitude, name, neighborhood: .neighborhoods[0], open, review_count, stars, state}' sample/business.json | json2csv -p=true -k business_id,category_main,category_sub,city,latitude,longitude,name,neighborhood,open,review_count,stars,state > sample_flattened/business.csv
	jq -c '{business_id, a: .hours | to_entries[]}' sample/business.json | jq -c '{business_id, day: .a.key, b: .a.value | to_entries[]}' | jq -c '{business_id: .business_id, day: .day, key: .b.key, value: .b.value}' | json2csv -p=true -k business_id,day,key,value > sample_flattened/business_hours.csv
	jq -c '{business_id, a: .checkin_info | to_entries[]}' sample/checkin.json | jq -c '{business_id: .business_id, key: .a.key | split ("-"), value: .a.value}' | jq -c '{business_id: .business_id, key1: .key[0], key2: .key[1], value: .value}' | json2csv -p=true -k business_id,key1,key2,value > sample_flattened/checkin.csv
# Turns the full dataset files in source/ into CSV tables under
# source_flattened/, producing the same set of tables as the sample rule:
# flat fields go straight through json2csv, nested objects and arrays are
# first exploded by jq into one JSON object per output row.
#
# Notes:
#  * Inputs are passed as "cmd < file" or "jq ... file". The form
#    "< file | cmd" only acts like cat in zsh; under make's default /bin/sh
#    it gives cmd an empty stdin.
#  * Every output goes to source_flattened/; none is written into
#    sample_flattened/, so the sample tables are never overwritten.
#  * The target shares its name with the output directory, so it is declared
#    .PHONY to keep it running once that directory exists.
.PHONY: source_flattened
source_flattened: source.decompressed
	mkdir -p source_flattened
	json2csv -p=true -k business_id,date,likes,text,user_id < source/yelp_academic_dataset_tip.json > source_flattened/tip.csv
	json2csv -p=true -k business_id,date,likes,user_id < source/yelp_academic_dataset_tip.json > source_flattened/tip_no_text.csv
	json2csv -p=true -k business_id,date,review_id,stars,text,user_id < source/yelp_academic_dataset_review.json > source_flattened/review_compact.csv
	json2csv -p=true -k business_id,date,review_id,stars,user_id < source/yelp_academic_dataset_review.json > source_flattened/review_compact_no_text.csv
	jq -c '{review_id, a: .votes | to_entries[]}' source/yelp_academic_dataset_review.json | jq -c '{review_id: .review_id, key: .a.key, value: .a.value}' | json2csv -p=true -k review_id,key,value > source_flattened/review_votes.csv
	jq -c '{average_stars, fans, friends: .friends | length, name, review_count, user_id, yelping_since}' source/yelp_academic_dataset_user.json | json2csv -p=true -k average_stars,fans,friends,name,review_count,user_id,yelping_since > source_flattened/user_compact.csv
	jq -c '{user_id, friend: .friends[]}' source/yelp_academic_dataset_user.json | json2csv -p=true -k user_id,friend > source_flattened/user_friends.csv
	jq -c '{user_id, a: .compliments | to_entries[]}' source/yelp_academic_dataset_user.json | jq -c '{user_id: .user_id, key: .a.key, value: .a.value}' | json2csv -p=true -k user_id,key,value > source_flattened/user_compliments.csv
	jq -c '{business_id, category_main: .categories[0], category_sub: .categories[1], city, latitude, longitude, name, neighborhood: .neighborhoods[0], open, review_count, stars, state}' source/yelp_academic_dataset_business.json | json2csv -p=true -k business_id,category_main,category_sub,city,latitude,longitude,name,neighborhood,open,review_count,stars,state > source_flattened/business.csv
	jq -c '{business_id, a: .hours | to_entries[]}' source/yelp_academic_dataset_business.json | jq -c '{business_id, day: .a.key, b: .a.value | to_entries[]}' | jq -c '{business_id: .business_id, day: .day, key: .b.key, value: .b.value}' | json2csv -p=true -k business_id,day,key,value > source_flattened/business_hours.csv
	jq -c '{business_id, a: .checkin_info | to_entries[]}' source/yelp_academic_dataset_checkin.json | jq -c '{business_id: .business_id, key: .a.key | split ("-"), value: .a.value}' | jq -c '{business_id: .business_id, key1: .key[0], key2: .key[1], value: .value}' | json2csv -p=true -k business_id,key1,key2,value > source_flattened/checkin.csv
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment