require 'rubygems'
require 'fastercsv'
# Plain record for one input row plus the tags derived for it.
#
# All six attributes are readable and writable:
#   name, org, twitter_name, site - values taken from the row
#   tags     - the tag value read from the row
#   new_tags - derived tags; tagify replaces this with an Array
class ParseObject
  attr_accessor :name, :org, :twitter_name, :site, :tags, :new_tags

  # Every argument is optional and defaults to an empty string.
  def initialize(name="", org="", twitter_name="", site="", tags="", new_tags="")
    @name = name
    @org = org
    @twitter_name = twitter_name
    @site = site
    @tags = tags
    @new_tags = new_tags
  end

  # Returns the record as one newline-terminated CSV line:
  # name, org, twitter_name, site, then the joined new_tags.
  # The original tags column is not written. Every field is wrapped in
  # double quotes and embedded double quotes are doubled so the line stays
  # parseable.
  def to_csv
    fields = [@name, @org, @twitter_name, @site, display_array(@new_tags)]
    fields.map { |field| csv_quote(field) }.join(",") + "\n"
  end

  # Dumps every attribute to stdout between two separator lines.
  # Uses Kernel#p, so each line is shown in its inspected (quoted) form.
  def print
    p "-----------------------"
    p "name: #{@name}"
    p "org: #{@org}"
    p "twitter_name: #{@twitter_name}"
    p "site: #{@site}"
    p "tags: #{display_array(@tags)}"
    p "new_tags: #{display_array(@new_tags)}"
    p "-----------------------"
  end

  # Joins a tag collection into a single ", "-separated string.
  #
  # arr may be nil, a single String, or a (possibly nested) Array.
  # Nested arrays are flattened; nil and empty entries and duplicates are
  # dropped. The argument itself is never modified, so displaying a record
  # does not alter its tags.
  def display_array(arr)
    # Array() wraps a lone String and turns nil into [], unlike String#to_a
    # which no longer exists on Ruby 1.9+.
    Array(arr).flatten.compact.reject { |item| item.to_s.empty? }.uniq.join(", ")
  end

  private

  # Wraps one value in double quotes, doubling any embedded double quote.
  def csv_quote(value)
    "\"#{value.to_s.gsub('"', '""')}\""
  end
end
#--kickin it off
# Reads the CSV file at path_to_csv and returns an Array holding one tagged
# ParseObject per row. The first five columns of each row become name, org,
# twitter_name, site and tags; columns a row does not have are passed as nil.
def parse_csv(path_to_csv)
  FasterCSV.read(path_to_csv).map do |row|
    tagify(ParseObject.new(*row.values_at(0, 1, 2, 3, 4)))
  end
end
# Fills obj.new_tags with the tags derived from obj.org and obj.tags, then
# returns obj.
#
# Any previous new_tags value is discarded. Tags are appended in this order:
#   1. Hill-specific tags, only when the tag text contains "The Hill"
#   2. keyword tags found in org (ie, "Democratic", "Republican", etc)
#   3. state names found in org, skipped when the tag text contains
#      "International"
#   4. country names found at the start of org
#   5. the original tag converted to its web-facing form (ie, The Hill, etc)
def tagify(obj)
  # tags is nil when the row had no fifth column; treat that as "no tag"
  # instead of failing on nil.include?.
  tags = obj.tags || ""

  obj.new_tags = []
  obj.new_tags += the_hill_tags(obj) if tags.include?("The Hill")
  obj.new_tags += match_key_words(obj.org)
  obj.new_tags += match_states(obj.org, "") unless tags.include?("International")
  obj.new_tags += match_countries(obj.org)
  obj.new_tags << convert_tag_to_web_format(tags)
  obj
end
# Maps a source tag onto the label used on the web side.
#
# Tags starting with "Agencies", "Reporters", "Industry" or "Judicial" are
# replaced by "Agencies", "Media", "Industry" and "Judiciary" respectively.
# Anything else (including nil) is returned unchanged.
def convert_tag_to_web_format(tag)
  # `then` rather than the old `when ... :` form, which Ruby 1.9+ rejects.
  case tag
  when /^Agencies/  then "Agencies"
  when /^Reporters/ then "Media"
  when /^Industry/  then "Industry"
  when /^Judicial/  then "Judiciary"
  else tag
  end
end
#--the hill
# Builds the tags for a record tagged "The Hill" from the shape of obj.org.
#
# Recognised shapes:
#   "H-<state>" / "S-<state>"  -> chamber tags plus the matching state name(s)
#   "H-Committee on Something" -> chamber tags plus the text after "H-"
#   "Committee on something"   -> the org text as-is (no chamber prefix)
#
# Returns a new Array of tags.
def the_hill_tags(obj)
  obj.org =~ /^(\D)-\s*(\w*)\s*(\w*)\s*/
  # Copy the captures right away so later matches cannot disturb them.
  chamber, first_word, second_word = $1, $2, $3

  tags = case chamber
         when "H" then ["House", "Representative"]
         when "S" then ["Senate", "Senator"]
         else []
         end

  # Two words are passed so that two-word names such as "New Mexico" match.
  states = match_states(first_word, second_word)
  if !states.empty?
    tags += states
  elsif obj.org =~ /^(\D)-\w+/ # No state, ie. H-Committee on Something
    tags << obj.org.slice(2, obj.org.size)
  else # No preceding letter with dash, ie, "Committee on something"
    tags << obj.org
  end
  tags
end
#-- helpers
# [full name, postal abbreviation] for the 50 states plus DC. Results of
# match_states are reported in this order.
US_STATES = [
  ["Alaska", "AK"], ["Alabama", "AL"], ["Arkansas", "AR"], ["Arizona", "AZ"],
  ["California", "CA"], ["Colorado", "CO"], ["Connecticut", "CT"], ["District of Columbia", "DC"],
  ["Delaware", "DE"], ["Florida", "FL"], ["Georgia", "GA"], ["Hawaii", "HI"], ["Iowa", "IA"],
  ["Idaho", "ID"], ["Illinois", "IL"], ["Indiana", "IN"], ["Kansas", "KS"], ["Kentucky", "KY"],
  ["Louisiana", "LA"], ["Massachusetts", "MA"], ["Maryland", "MD"], ["Maine", "ME"], ["Michigan", "MI"],
  ["Minnesota", "MN"], ["Missouri", "MO"], ["Mississippi", "MS"], ["Montana", "MT"], ["North Carolina", "NC"],
  ["North Dakota", "ND"], ["Nebraska", "NE"], ["New Hampshire", "NH"], ["New Jersey", "NJ"],
  ["New Mexico", "NM"], ["Nevada", "NV"], ["New York", "NY"], ["Ohio", "OH"], ["Oklahoma", "OK"],
  ["Oregon", "OR"], ["Pennsylvania", "PA"], ["Rhode Island", "RI"], ["South Carolina", "SC"], ["South Dakota", "SD"],
  ["Tennessee", "TN"], ["Texas", "TX"], ["Utah", "UT"], ["Virginia", "VA"], ["Vermont", "VT"],
  ["Washington", "WA"], ["Wisconsin", "WI"], ["West Virginia", "WV"], ["Wyoming", "WY"]
].freeze

# State names ordered longest first, so a name contained in a longer one
# ("Virginia" in "West Virginia") is only tried after the longer name.
US_STATE_NAMES_LONGEST_FIRST =
  US_STATES.map { |name, _abbr| name }.sort_by { |name| -name.size }.freeze

# Returns the full names of every state mentioned in "str1 str2".
#
# A state matches when its full name appears anywhere in the joined text
# (case-sensitive) or when its postal abbreviation starts a line of the text
# and is followed by "-", whitespace or the end of the line. Two arguments
# are accepted so a name split across two words ("New", "Mexico") is still
# found. nil arguments are treated as empty strings.
#
# Text already accounted for by a longer state name is not counted again
# for a shorter one, so "West Virginia" yields only "West Virginia".
def match_states(str1, str2="")
  # Anything str1 alone would match also matches the joined text, so a
  # single check against the joined text is sufficient.
  text = "#{str1} #{str2}"

  unclaimed = text.dup
  named = {}
  US_STATE_NAMES_LONGEST_FIRST.each do |name|
    next unless unclaimed.include?(name)
    named[name] = true
    unclaimed = unclaimed.gsub(name, " ") # blank it out for shorter names
  end

  hits = US_STATES.select do |name, abbr|
    named[name] || text =~ /^#{abbr}(-|\s+|$)/
  end
  hits.map { |name, _abbr| name }
end
# Returns the known country names that str starts with (a name at the start
# of any line of str counts). The result is an Array in the order the names
# are listed below; it is empty when nothing matches.
def match_countries(str)
  known_countries = ["UK", "New Zealand", "Israel", "Canada", "EU", "Australia"]
  known_countries.select { |country| str =~ /^#{country}/ }
end
# Returns the keyword tags whose patterns occur in str.
#
# Each rule pairs a tag with one or more patterns; the tag is emitted once
# if any of its patterns matches. Tags come back in rule order. Patterns
# without the /i flag (mostly acronyms) are case-sensitive.
def match_key_words(str)
  rules = [
    ["Democratic",                         [/democrat/i]],
    ["Republican",                         [/republican/i]],
    ["Representative",                     [/representative/i]],
    ["Virginia",                           [/Virgina/i]], # correct a misspelling
    ["fire department",                    [/fire/i]],
    ["police department",                  [/police/i]],
    ["NASA",                               [/NASA/i]],
    ["Coast Guard",                        [/Coast Guard/i, /USCG/]],
    ["Center for Disease Control",         [/CDC/, /Disease Control/i]],
    ["Air Force",                          [/USAF/, /Air Force/i]],
    ["Department of Defense",              [/Department of Defense/i, /DOD/]],
    ["Department of Energy",               [/Department of Energy/i]],
    ["Department of Homeland Security",    [/Department of Homeland Security/i]],
    ["General Services Administration",    [/General Services Administration/i]],
    ["Health & Human Services",            [/Health & Human Service/i]],
    ["Navy",                               [/Navy/i]],
    ["Army",                               [/Army/i]],
    ["Securities and Exchange Commission", [/SEC/, /Securities and Exchange/i]],
    ["State Department",                   [/State Department/]]
  ]

  matched = rules.select do |_tag, patterns|
    patterns.any? { |pattern| str =~ pattern }
  end
  matched.map { |tag, _patterns| tag }
end
# run it!
# Tag every row of govtwit-1.csv and dump each resulting record to stdout.
parse_csv("govtwit-1.csv").each do |record|
  record.print
  # To append the tagged rows to a CSV file instead, enable:
  # File.open("govtwit_tags.csv", "a") { |f| f << record.to_csv }
end