Last active
August 29, 2015 14:05
-
-
Save calciphus/f7d214b9db1ce13a03fd to your computer and use it in GitHub Desktop.
Topic and Category Extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Topic and entity co-occurrence extraction.
#
# Reads newline-delimited JSON files, looks at each record's
# salience.content.topics and salience.content.entities lists, and counts how
# often every entity name appears together with every topic name. The result
# is printed as a pipe-delimited table (topic, entity, occurrences).

# Text shown when the script is started without any arguments.
USAGE = <<~TEXT
  Usage: $ ruby script.rb [-o][-t] files_to_parse.json(can be an array) [> outputfile.csv]

  Options:
  -o\tSkip diagnostic and performance messages so output can be directly stored
  -t\tSquelch results and only show time and statistics (diagnostic run)
TEXT

# Command-line switches; every other argument is treated as an input file.
FLAGS = %w[-o -t].freeze

# A topic/entity pair is reported only when it occurs MORE than this many times.
MIN_OCCURRENCES = 5

# Horizontal rule used around the diagnostic messages.
SEPARATOR = "-----------------------------------"

# Collects the "name" of every item in record["salience"]["content"][kind].
#
# @param record [Object] one parsed input line
# @param kind [String] "topics" or "entities"
# @return [Array, nil] the names, or nil when the record is not a Hash or the
#   nested list is absent
def extract_names(record, kind)
  return nil unless record.is_a?(Hash)

  items = record.dig("salience", "content", kind)
  items&.map { |item| item["name"] }
end

# Adds one record's topic/entity pairs to the running totals.
# Note: a record with several topics is knowingly counted once per topic.
#
# @param keywords [Hash] topic name => { entity name => occurrences }; mutated
# @param record [Object] one parsed input line
# @return [void]
def tally_record(keywords, record)
  topics = extract_names(record, "topics")
  entities = extract_names(record, "entities")
  # Records lacking either list are skipped; they are not treated as errors.
  return unless topics && entities

  topics.each do |topic|
    # A topic seen for the first time starts with an empty counter table.
    counts = (keywords[topic] ||= Hash.new(0))
    entities.each { |entity| counts[entity] += 1 }
  end
end

# Parses every line of one file with Oj and tallies it into +keywords+.
# A line that raises while being parsed or tallied is counted as bad instead
# of aborting the run. The file is closed by File.foreach even on errors.
#
# @param path [String] file to read
# @param keywords [Hash] running totals; mutated
# @return [Array(Integer, Integer)] lines read and lines that could not be processed
def scan_file(path, keywords)
  lines = 0
  bad = 0
  File.foreach(path) do |line|
    lines += 1
    begin
      tally_record(keywords, Oj.load(line))
    rescue StandardError
      bad += 1
    end
  end
  [lines, bad]
end

# Formats the pairs that occur more than +min_occurrences+ times.
#
# @param keywords [Hash] topic name => { entity name => occurrences }
# @param min_occurrences [Integer] exclusive lower bound for reporting
# @return [Array<String>] "topic|entity|occurrences" rows in insertion order
def report_rows(keywords, min_occurrences = MIN_OCCURRENCES)
  keywords.flat_map do |topic, counts|
    counts.select { |_entity, occurrences| occurrences > min_occurrences }
          .map { |entity, occurrences| "#{topic}|#{entity}|#{occurrences}" }
  end
end

# Entry point: parses the flags, scans the files and prints results and stats.
#
# @param args [Array<String>] command-line arguments
# @return [void]
def run(args)
  # Monotonic clock: the duration stays correct if the wall clock is adjusted.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  if args.empty?
    puts USAGE
    return
  end

  for_export = args.include?("-o")
  time_only = args.include?("-t")
  files = args.reject { |arg| FLAGS.include?(arg) }

  unless for_export
    puts "Squelching output, running for time only" if time_only
    puts "Importing #{files.size} files"
    puts SEPARATOR
  end

  # Loaded only when there is work to do, so the usage text is available even
  # without the oj gem installed. yaml and json are kept from the original.
  require 'yaml'
  require 'oj'
  require 'json'

  keywords = {}
  line_count = 0
  bad_lines = 0
  files.each do |filename|
    lines, bad = scan_file(filename, keywords)
    line_count += lines
    bad_lines += bad
  end

  # Output pipe-delimited document of topic, entity, and occurrences.
  # The header spelling is left untouched because consumers may rely on it.
  unless time_only
    puts "Topic|Entity|Occurances"
    report_rows(keywords).each { |row| puts row }
  end

  # Only export stats if asked
  finished = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  return if for_export

  puts SEPARATOR
  puts "Total files: #{files.size}"
  puts "Processed: #{line_count} lines"
  puts "Couldn't read: #{bad_lines} lines"
  puts "Duration: #{finished - started} seconds"
  puts SEPARATOR
end

run(ARGV)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is the script referenced in the DataSift blog post:
Behind the Scenes: How We Identified WordPress’ Top 10 Content Categories