Skip to content

Instantly share code, notes, and snippets.

@rafapolo
Created November 27, 2014 16:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rafapolo/8123944da8101ea36c80 to your computer and use it in GitHub Desktop.
Save rafapolo/8123944da8101ea36c80 to your computer and use it in GitHub Desktop.
Convert Jeff's .XLS into a .DOT Graph file
#!/usr/bin/env ruby
#encoding: utf-8
#comment: Convert Jeff's .XLS into a .DOT Graph file
# configurations
file = "kenya.xls"
# columns to export as graph nodes
columns = ["Project", "Platform", "Developer", "Organization", "Funder", "Category", "Type"]
# how columns connect to each other
relations = {
["Organization", "Funder", "Platform", "Category"] => "Project",
["Developer", "Type"] => "Platform"
}
# generate graph node ID from string
require 'digest/md5'
def to_ID(type, data)
data = data.first if data.class == Array
md5 = Digest::MD5.hexdigest(data.to_s)
"#{type.downcase}_#{md5}"
end
# remove exceeding spaces and marks
def clean(data)
data.to_s.strip.gsub('"', '\'')
end
# print fancy info bar
start = Time.now
require 'colored'
bar = ("="*55).yellow
puts bar
puts "\t Parsing #{file} spreadsheet...".yellow
puts bar
# read spreadsheet file
require 'pry'
require 'simple-spreadsheet'
xls = SimpleSpreadsheet::Workbook.read(file)
xls.selected_sheet = xls.sheets.first
# map all header's columns positions
header = {}
0.upto xls.last_column do |column|
type = xls.cell(1, column)
if type
data = "#{column} => #{clean(type)}"
puts columns.index(type) ? data.green : data.red
header[column] = type
end
end
puts bar
# iterate over all data lines and rows
entities = []
2.upto(xls.last_row) do |line|
relation = {}
1.upto xls.last_column do |column|
data = xls.cell(line, column)
if data
type = nil
unless type = header[column]
# if this column doesn't have an associated header
# than it's header type is the previous existing one
current = column
while !type
type = header[current-=1]
end
end
if columns.index(type)
# instanciate a hash of array types as a relation structure, if empty
relation[type] = [] unless relation[type]
# and associate it
relation[type] << clean(data)
# ex: "Developer"=>["Social Impact Lab", "CartONG (France)", "International Media Support (IMS)"]
end
end
printf("\r\tInterpreting messy structure: %d%", 100*(line)/(xls.last_row-1))
end
entities << relation
end
puts
# now we have structured data to play with! let's generate the .DOT file
puts bar
output = file.split('.')[0] + Time.now.strftime("_%Y%m%d") + ".dot"
puts "\t\tGenerating graph file...".yellow
puts bar
ids = []
File.open(output, 'w') do |dot_file|
dot_file.write("digraph G {\n")
entities.each do |rows|
rows.keys.each do |type|
rows[type].each do |data|
unless data.class == Array
data = [data]
end
data.each do |value|
id = to_ID(type, value) # generate ID from string
# only create node entity if not previously done, avoiding repetition.
unless ids.index(id)
ids << id
append = "\"#{id}\" [label=\"#{value}\" type=\"#{type}\"];\n"
dot_file.write(append)
end
# connect nodes!
to_node = nil
relations.keys.each{|key| to_node = relations[key] if key.index(type)}
if to_node && rows[to_node] # how about nodes with no related nodes?
to_id = to_ID(to_node, rows[to_node])
connection = "\"#{id}\" -> \"#{to_id}\";\n"
dot_file.write(connection)
end
end
end
end
end
dot_file.write("}")
puts ("\t-> #{output} done in %3.2f secs" % [Time.now - start]).green
end
puts bar
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment