Skip to content

Instantly share code, notes, and snippets.

@mcritchlow
Last active May 14, 2018 16:08
Show Gist options
  • Save mcritchlow/f133e8bacd32a7d9daaa4b74af4cbcab to your computer and use it in GitHub Desktop.
Save mcritchlow/f133e8bacd32a7d9daaa4b74af4cbcab to your computer and use it in GitHub Desktop.
CIL Data Processing R&D
# Ruby script to test out parsing the CIL json data and dump to csv
#
# For now, we're concatenating the property hierarchy with dots
# So { "CIL": { "Image Files": { "Mime_type": "application/zip" } } } becomes the column CIL.Image Files.Mime_type with value "application/zip"
require 'open-uri'
require 'json'
require 'byebug'
require 'csv'
# Hackety hacks, don't talk back
# Reads every CIL JSON record under DATA_PATH, flattens each one into a
# single-level hash, and writes all of them to cil.csv (pipe-delimited), one
# row per source file.
class CilCSV
  DATA_PATH = 'CIL_Public_Data_JSON/Version8_6/DATA/CIL_PUBLIC_DATA'.freeze

  # @return [Hash] flattened record hashes keyed by the source file name
  attr_reader :cil_data

  def initialize
    @cil_data = {}
  end

  # Loads all records, then writes cil.csv in the current working directory.
  # The first column ("Identifier") holds the source file name; the remaining
  # columns are the union of all flattened keys.
  def start
    load_data
    CSV.open('cil.csv', 'wb', headers: true, write_headers: true, col_sep: '|') do |csv|
      csv << ['Identifier'] + cil_header_row
      cil_value_rows.each { |row| csv << row }
    end
  end

  # Parses every file returned by #json_files into #cil_data.
  def load_data
    json_files.each { |file| cil_data[file] = parse(file) }
  end

  # Union of the keys of every loaded record, in first-seen order.
  #
  # The result is memoized, so call this only after #load_data has run.
  # flat_map (rather than reduce(&:+)) returns [] instead of raising when no
  # records are loaded, and avoids building a new array per record.
  #
  # @return [Array] column keys
  def cil_header_row
    @cil_header_row ||= cil_data.values.flat_map(&:keys).uniq
  end

  # One row per loaded record: the file name followed by the record's value
  # for each header column ('' when the record lacks that key).
  #
  # @return [Array<Array>]
  def cil_value_rows
    cil_data.map do |identifier, record|
      [identifier] + cil_header_row.map { |header| record.fetch(header, '') }
    end
  end

  # Names of all entries in DATA_PATH.
  #
  # @return [Array<String>]
  def json_files
    # To work on a sample only: Dir.children(DATA_PATH).take(500)
    Dir.children(DATA_PATH)
  end

  # Reads and flattens a single record.
  #
  # @param cil_file [String] file name relative to DATA_PATH
  # @return [Hash] flattened record
  def parse(cil_file)
    flatten_hash(JSON.parse(File.read(File.join(DATA_PATH, cil_file))))
  end

  # Collapses nested hashes into one level, joining the key path with '.'.
  # Joined keys are Symbols; keys that were never nested keep their original
  # type. Every non-Hash value (arrays included) is converted with String().
  #
  # @param hash [Hash] possibly nested hash
  # @return [Hash] single-level hash
  def flatten_hash(hash)
    hash.each_with_object({}) do |(key, value), flat|
      if value.is_a?(Hash)
        flatten_hash(value).each do |nested_key, nested_value|
          flat["#{key}.#{nested_key}".to_sym] = nested_value
        end
      else
        flat[key] = String(value)
      end
    end
  end
end
# Script entry point: load the data and write cil.csv when this file is run.
CilCSV.new.start
# Ruby script to test out parsing the CIL json data
#
require 'open-uri'
require 'json'
# Downloads the media files for a sample of CIL records with wget.
# Each record's identifier is the part of its JSON file name before the first '.'.
data_path = 'CIL_Public_Data_JSON/Version8_6/DATA/CIL_PUBLIC_DATA'
default_file_types = %w[jpg zip]
# Both base URLs already end with '/', so paths are appended without another one.
video_base_url = 'https://cildata.crbs.ucsd.edu/media/videos/'
image_base_url = 'https://cildata.crbs.ucsd.edu/media/images/'
# For now just grab a couple samples
results = Dir.children(data_path).take(100)
results.each do |json_record|
  identifier = json_record.split('.')[0]
  metadata = JSON.parse(File.read(File.join(data_path, json_record)))
  if metadata.dig('CIL_CCDB', 'Data_type', 'Video')
    # according to the README we can get jpg, flv, zip
    base_url = video_base_url
    extra_file_type = 'flv'
  else
    # according to the README we can get jpg, tif, zip
    # (the tif lives under the image URL, not the video one)
    base_url = image_base_url
    extra_file_type = 'tif'
  end
  (default_file_types + [extra_file_type]).each do |file_type|
    # Argument-list form: the URL is passed straight to wget, never through a
    # shell, so odd characters in a file name cannot be interpreted as commands.
    system('wget', "#{base_url}#{identifier}/#{identifier}.#{file_type}")
  end
end
#!/bin/bash
#
# Example usage: ./git_changes.sh $HOME/CIL_Public_Data_JSON
# Require the project directory as the first argument.
if [ -z "$1" ]; then echo "No project directory provided"; exit 1; fi
cd "$1" || exit 1
# Remember where HEAD was before pulling so the two states can be compared.
current_head=$(git rev-parse HEAD) || exit 1
current_branch=$(git rev-parse --abbrev-ref HEAD)
# A failed pull leaves HEAD unchanged; stop here rather than report it as
# "no new commits".
if ! git pull origin "$current_branch"; then
  echo "git pull failed" >&2
  exit 1
fi
new_head=$(git rev-parse HEAD)
if [[ "$current_head" == "$new_head" ]]; then
  echo "No new commits have been added"
  exit 0
fi
# List only the files that were added (diff-filter=A) between the two heads.
new_files=$(git diff --name-only --diff-filter=A "$current_head".."$new_head")
echo "$new_files"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment