Skip to content

Instantly share code, notes, and snippets.

@bkimble
Created November 14, 2014 00:52
Show Gist options
  • Save bkimble/3cb5ccdf664aadfd30e9 to your computer and use it in GitHub Desktop.
Save bkimble/3cb5ccdf664aadfd30e9 to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby
# Splits a CSV file into batches and loads each batch into Neo4j through
# neo4j-shell.
#
# Usage: <script> FILENAME LINES
#   FILENAME  CSV file; its first line is treated as the header row
#   LINES     batch size used when splitting the file (positive integer)
require 'tempfile'

# Only the first two arguments are used; anything after them is ignored.
filename, lines = ARGV
abort("missing file or lines") if filename.nil? || lines.nil?

# String#to_i returns 0 for non-numeric input, so validate the converted value
# here and fail with a clear message: a batch size below 1 cannot be used to
# split the file.
lines = lines.to_i
abort("lines must be a positive integer") unless lines > 0
# Cypher script template that is run through neo4j-shell for each CSV batch.
# The %%FILENAME%% placeholder is swapped for the path of the batch CSV before
# the script is written to disk. The heredoc is single-quoted so the Cypher
# text is taken literally, with no Ruby interpolation or escape processing.
neo_query = <<-'CYPHER'
// load csv
LOAD CSV WITH HEADERS FROM "file:%%FILENAME%%" AS csvLine
// Find or create a user node with an id value of the userId field from the CSV
MERGE (user:User { id: toInt(csvLine.userId) })
// Find or create an alias node with a display name value (need to ensure this is not merging people with same names without taking email and such in to consideration)
MERGE (alias:Alias { name: csvLine.displayName })
// Create the KNOWS relationship between the user we found or created, and an empty contact niode that we are creating
// We set a merge false field so a subsequent process will know that it needs to be worked on
// Then link the empty contact node to the alias node we found or created
CREATE (user)-[:KNOWS]->(contact:Contact { merge: false })-[:ALIAS]->(alias)
// Loop over each Email address
FOREACH (address IN split(replace(replace(replace(replace(csvLine.emails,'[',''),']',''),'"',''),' ',''),',') |
// Find or create an email address node
MERGE (email:Email { address: address })
// If we find an email address (not create) set merge = true so we know how to deal with it later
ON MATCH SET contact.merge = true
// finally create the relationship between the contact we created in the above block, and the email we found or created here.
CREATE (contact)-[:EMAIL]->(email)
)
// Loop over each phone number
FOREACH (number IN split(replace(replace(replace(replace(csvLine.phoneNumbers,'[',''),']',''),'"',''),' ',''),',') |
// find or create a phone number node
MERGE (phone:Phone { number: number })
// If we find a phopne number (not create), set merge = true so we know how to deal with it late
ON MATCH SET contact.merge = true
// finally create the relationship between the contact we created in the above block, and the phone we found or created here.
CREATE (contact)-[:PHONE]->(phone)
);
// Phase 2
// Delete null Email addresses and Phone numbers ( a result in us using FOREACH on an empty array)
MATCH (email:Email { address: '' })<-[e:EMAIL]-()
DELETE e, email;
MATCH (phone:Phone { number: '' })<-[p:PHONE]-()
DELETE p, phone;
// End phase 2
CYPHER
# Scratch CSV that every batch is written to before being loaded; this path is
# substituted for the %%FILENAME%% placeholder in the Cypher template.
CHUNK_CSV_PATH = "/home/ubuntu/csvchunk.csv"

# Cypher script handed to neo4j-shell; created relative to the working directory.
QUERY_FILE_PATH = 'queryfile'

# Reads up to +limit+ lines from +io+, stopping early at end of file.
# Returns the lines read (each keeps its line terminator); the array is empty
# once the input is exhausted.
def read_batch(io, limit)
  rows = []
  while rows.size < limit && (row = io.gets)
    rows << row
  end
  rows
end

# Writes +headers+ followed by +rows+ to +path+, replacing any previous chunk.
# The block form of File.open closes the handle even if a write raises.
def write_chunk(path, headers, rows)
  File.open(path, 'w') do |chunk|
    chunk.puts headers
    rows.each { |row| chunk.puts row }
  end
end

# Loads +filename+ into Neo4j in batches of exactly +batch_size+ data rows
# (the final batch may be smaller). The first line of the file is repeated as
# the header of every chunk. Reading simply continues until end of file, so no
# external line count is needed and no header-only batch is ever submitted.
#
# The input is split on physical lines, so a quoted CSV field containing a
# newline would be cut across rows.
#
# filename       - path of the CSV file to import
# batch_size     - number of data rows per batch; must be positive
# query_template - Cypher script text containing the %%FILENAME%% placeholder
#
# Returns the number of data rows submitted to neo4j-shell.
# Raises ArgumentError when batch_size is not positive.
def import_in_batches(filename, batch_size, query_template)
  raise ArgumentError, "lines must be a positive integer" unless batch_size > 0

  # The chunk path never changes, so the script text is built once. The block
  # form of gsub inserts the path verbatim (no backreference processing).
  query = query_template.gsub('%%FILENAME%%') { CHUNK_CSV_PATH }
  total_imported = 0

  File.open(filename, 'r') do |f|
    headers = f.gets

    loop do
      rows = read_batch(f, batch_size)
      break if rows.empty?

      puts "making file."
      write_chunk(CHUNK_CSV_PATH, headers, rows)

      puts "making query file now."
      File.open(QUERY_FILE_PATH, 'w') { |query_file| query_file.puts query }

      start = Time.now
      # Backticks capture the shell's stdout, which is intentionally discarded.
      # NOTE(review): "/dev/null" is passed as a plain argument, not as a
      # redirect; kept exactly as before - confirm the intent.
      `/usr/bin/neo4j-shell -file #{QUERY_FILE_PATH} /dev/null`
      status = $?
      total_time = Time.now - start
      # Surface a failed batch instead of silently reporting it as imported.
      warn "neo4j-shell failed for this batch (#{status})" unless status.success?

      # Count the rows actually written, not the nominal batch size.
      total_imported += rows.size
      puts "#{total_imported} total, last batch: #{total_time}"
    end
  end

  total_imported
end

import_in_batches(filename, lines.to_i, neo_query)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment