Last active
May 14, 2018 16:08
-
-
Save mcritchlow/f133e8bacd32a7d9daaa4b74af4cbcab to your computer and use it in GitHub Desktop.
CIL Data Processing R&D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ruby script to test out parsing the CIL json data and dump to csv | |
# | |
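# For now, we're concatenating the property hierarchy of nested objects with dots,
# so { "CIL_CCDB": { "Data_type": { "Video": true } } } becomes the column CIL_CCDB.Data_type.Video with the value "true".
# Arrays (e.g. "Image Files") are not descended into; the whole array is written as one stringified value.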
require 'open-uri' | |
require 'json' | |
require 'byebug' | |
require 'csv' | |
# Flattens a directory of CIL JSON metadata records into one pipe-delimited
# CSV file, one row per JSON file.
#
# Nested JSON objects are collapsed into dotted column names, e.g.
# { "CIL_CCDB" => { "Data_type" => { "Video" => true } } } yields the column
# "CIL_CCDB.Data_type.Video". Every other value (arrays included) is written
# as its string representation; arrays are not descended into.
class CilCSV
  DATA_PATH = 'CIL_Public_Data_JSON/Version8_6/DATA/CIL_PUBLIC_DATA'.freeze

  # @return [Hash{String => Hash{String => String}}] flattened records keyed by JSON file name
  attr_reader :cil_data

  # @param data_path [String] directory holding the JSON records
  def initialize(data_path: DATA_PATH)
    @data_path = data_path
    @cil_data = {}
  end

  # Loads every record and writes the CSV file.
  #
  # @param output_path [String] where to write the CSV
  def start(output_path: 'cil.csv')
    load_data
    CSV.open(output_path, 'wb', headers: true, write_headers: true, col_sep: '|') do |csv|
      csv << ['Identifier'] + cil_header_row
      cil_value_rows.each { |row| csv << row }
    end
  end

  # Parses every JSON file in the data directory into #cil_data.
  def load_data
    json_files.each { |file| cil_data[file] = parse(file) }
  end

  # Union of all column names seen across the loaded records, in first-seen order.
  # Memoized, so call it only after the data has been loaded.
  #
  # @return [Array<String>]
  def cil_header_row
    # flat_map yields [] for no records (reduce(&:+) yielded nil) and avoids
    # re-copying the accumulated array for every record.
    @cil_header_row ||= cil_data.values.flat_map(&:keys).uniq
  end

  # One row per record: the file name followed by the record's value for each
  # header column, or '' when the record lacks that column.
  #
  # @return [Array<Array<String>>]
  def cil_value_rows
    cil_data.map do |file, record|
      [file] + cil_header_row.map { |column| record.fetch(column, '') }
    end
  end

  # Names of the regular *.json files in the data directory, sorted so the
  # row order does not depend on filesystem listing order.
  #
  # @return [Array<String>]
  def json_files
    Dir.children(@data_path)
       .select { |name| File.extname(name).casecmp?('.json') && File.file?(File.join(@data_path, name)) }
       .sort
  end

  # Reads one JSON file from the data directory and flattens it.
  #
  # @param cil_file [String] file name relative to the data directory
  # @return [Hash{String => String}]
  def parse(cil_file)
    flatten_hash(JSON.parse(File.read(File.join(@data_path, cil_file))))
  end

  # Recursively collapses nested hashes into a single-level hash whose keys are
  # the dot-joined path to each leaf and whose values are strings.
  #
  # Keys are always Strings: symbolizing only the nested keys (as before) made
  # a nested "a.b" and a literal top-level "a.b" two distinct columns with the
  # same printed header.
  #
  # @param hash [Hash]
  # @return [Hash{String => String}]
  def flatten_hash(hash)
    hash.each_with_object({}) do |(key, value), flat|
      if value.is_a?(Hash)
        flatten_hash(value).each do |child_key, child_value|
          flat["#{key}.#{child_key}"] = child_value
        end
      else
        flat[key.to_s] = String(value)
      end
    end
  end
end
# Run the export only when this file is executed directly, so that requiring
# it (e.g. from a console) does not start reading the data directory.
CilCSV.new.start if __FILE__ == $PROGRAM_NAME
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ruby script to test out parsing the CIL json data | |
# | |
require 'open-uri' | |
require 'json' | |
# Downloads the media files for a sample of CIL records with wget.
# Each record is a JSON file in data_path; the part of its file name before
# the first dot is used as the record identifier in the download URLs.
data_path = 'CIL_Public_Data_JSON/Version8_6/DATA/CIL_PUBLIC_DATA'
# File types fetched for every record, image or video.
default_file_types = %w[jpg zip]
# No trailing slash here: the separator is added when each URL is built, so
# the final URL does not contain "//" after the media type.
video_base_url = 'https://cildata.crbs.ucsd.edu/media/videos'
image_base_url = 'https://cildata.crbs.ucsd.edu/media/images'

# For now just grab a couple samples
results = Dir.children(data_path).take(100)
results.each do |json_record|
  identifier = json_record.split('.').first
  metadata = JSON.parse(File.read(File.join(data_path, json_record)))

  # dig yields nil (-> treated as an image) when a record has no
  # CIL_CCDB / Data_type entry, instead of raising NoMethodError on nil.
  if metadata.dig('CIL_CCDB', 'Data_type', 'Video')
    # according to the README we can get jpg, flv, zip
    base_url = video_base_url
    extra_file_type = 'flv'
  else
    # according to the README we can get jpg, tif, zip
    # (the tif lives under the images URL, like the jpg and zip)
    base_url = image_base_url
    extra_file_type = 'tif'
  end

  (default_file_types + [extra_file_type]).each do |file_type|
    url = "#{base_url}/#{identifier}/#{identifier}.#{file_type}"
    # Argument-list form of system: the URL is passed to wget directly and is
    # never interpreted by a shell, whatever characters the file name contains.
    warn "Failed to download #{url}" unless system('wget', url)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# Example usage: ./git_changes.sh $HOME/CIL_Public_Data_JSON | |
# Require the project directory argument and move into it.
if [ -z "$1" ]; then echo "No project directory provided"; exit 1; fi
cd "$1" || exit 1

# Remember which commit and branch we are on before pulling.
# rev-parse prints the sha directly, so there is no need to scrape `git log`.
current_head=$(git rev-parse HEAD) || exit 1
current_branch=$(git rev-parse --abbrev-ref HEAD) || exit 1

# Stop on a failed pull; otherwise the unchanged HEAD would be reported
# below as "No new commits have been added".
if ! git pull origin "$current_branch"; then
  echo "git pull origin $current_branch failed" >&2
  exit 1
fi
new_head=$(git rev-parse HEAD) || exit 1

if [[ "$current_head" == "$new_head" ]]; then
  echo "No new commits have been added"
  exit 0
fi

# List only the files that were added (--diff-filter=A) between the old and new HEAD.
new_files=$(git diff --name-only --diff-filter=A "$current_head".."$new_head")
echo "$new_files"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment