require 'rubygems'

# FasterCSV was merged into the standard library as CSV in Ruby 1.9, so the
# gem is only loaded on older interpreters; newer ones use the bundled CSV.
if RUBY_VERSION < '1.9'
  require 'fastercsv'
  CSV_LIBRARY = FasterCSV
else
  require 'csv'
  CSV_LIBRARY = CSV
end

# [full name, postal abbreviation] pairs consulted by match_states.
US_STATES = [
  ["Alaska", "AK"], ["Alabama", "AL"], ["Arkansas", "AR"], ["Arizona", "AZ"],
  ["California", "CA"], ["Colorado", "CO"], ["Connecticut", "CT"],
  ["District of Columbia", "DC"], ["Delaware", "DE"], ["Florida", "FL"],
  ["Georgia", "GA"], ["Hawaii", "HI"], ["Iowa", "IA"], ["Idaho", "ID"],
  ["Illinois", "IL"], ["Indiana", "IN"], ["Kansas", "KS"], ["Kentucky", "KY"],
  ["Louisiana", "LA"], ["Massachusetts", "MA"], ["Maryland", "MD"],
  ["Maine", "ME"], ["Michigan", "MI"], ["Minnesota", "MN"], ["Missouri", "MO"],
  ["Mississippi", "MS"], ["Montana", "MT"], ["North Carolina", "NC"],
  ["North Dakota", "ND"], ["Nebraska", "NE"], ["New Hampshire", "NH"],
  ["New Jersey", "NJ"], ["New Mexico", "NM"], ["Nevada", "NV"],
  ["New York", "NY"], ["Ohio", "OH"], ["Oklahoma", "OK"], ["Oregon", "OR"],
  ["Pennsylvania", "PA"], ["Rhode Island", "RI"], ["South Carolina", "SC"],
  ["South Dakota", "SD"], ["Tennessee", "TN"], ["Texas", "TX"], ["Utah", "UT"],
  ["Virginia", "VA"], ["Vermont", "VT"], ["Washington", "WA"],
  ["Wisconsin", "WI"], ["West Virginia", "WV"], ["Wyoming", "WY"]
].freeze

# Ordered [tag, patterns] pairs consulted by match_key_words. A tag is emitted
# when any of its patterns matches; acronym patterns are case-sensitive.
KEYWORD_TAGS = [
  ["Democratic", [/democrat/i]],
  ["Republican", [/republican/i]],
  ["Representative", [/representative/i]],
  ["Virginia", [/Virgina/i]], # correct a misspelling
  ["fire department", [/fire/i]],
  ["police department", [/police/i]],
  ["NASA", [/NASA/i]],
  ["Coast Guard", [/Coast Guard/i, /USCG/]],
  ["Center for Disease Control", [/CDC/, /Disease Control/i]],
  ["Air Force", [/USAF/, /Air Force/i]],
  ["Department of Defense", [/Department of Defense/i, /DOD/]],
  ["Department of Energy", [/Department of Energy/i]],
  ["Department of Homeland Security", [/Department of Homeland Security/i]],
  ["General Services Administration", [/General Services Administration/i]],
  ["Health & Human Services", [/Health & Human Service/i]],
  ["Navy", [/Navy/i]],
  ["Army", [/Army/i]],
  ["Securities and Exchange Commission", [/SEC/, /Securities and Exchange/i]],
  ["State Department", [/State Department/]]
].freeze

# One row of the input CSV plus the tags derived from it by tagify.
class ParseObject
  attr_accessor :name, :org, :twitter_name, :site, :tags, :new_tags

  def initialize(name="", org="", twitter_name="", site="", tags="", new_tags="")
    @name = name
    @org = org
    @twitter_name = twitter_name
    @site = site
    @tags = tags
    @new_tags = new_tags
  end

  # Returns one newline-terminated CSV line: name, org, twitter_name, site and
  # the derived tags. Every field is quoted; embedded double quotes are doubled
  # so the line stays parseable.
  def to_csv
    fields = [@name, @org, @twitter_name, @site, display_array(@new_tags)]
    fields.map { |field| csv_quote(field) }.join(",") + "\n"
  end

  # Dumps the record to stdout for debugging.
  # NOTE: this shadows Kernel#print on instances of this class.
  def print
    p "-----------------------"
    p "name: #{@name}"
    p "org: #{@org}"
    p "twitter_name: #{@twitter_name}"
    p "site: #{@site}"
    p "tags: #{display_array(@tags)}"
    p "new_tags: #{display_array(@new_tags)}"
    p "-----------------------"
  end

  # Joins a tag collection into "a, b, c", dropping nils, blanks and
  # duplicates. Accepts an array (possibly nested), a single string, or nil,
  # and never modifies the argument.
  def display_array(arr)
    Array(arr).flatten.compact.map { |x| x.to_s }.reject { |x| x.empty? }.uniq.join(", ")
  end

  private

  # Wraps a value in double quotes, doubling any quotes inside it.
  def csv_quote(value)
    "\"#{value.to_s.gsub('"', '""')}\""
  end
end

#--kickin it off

# Reads the CSV at path_to_csv and returns one tagged ParseObject per row.
# Columns are read positionally: name, org, twitter_name, site, tags.
def parse_csv(path_to_csv)
  CSV_LIBRARY.read(path_to_csv).map do |row|
    tagify(ParseObject.new(row[0], row[1], row[2], row[3], row[4]))
  end
end

# Fills obj.new_tags from obj.org and obj.tags and returns obj.
# Empty CSV cells arrive as nil, so both fields are treated as empty then.
def tagify(obj)
  org = obj.org.to_s
  tags = obj.tags || ""

  obj.new_tags = []
  obj.new_tags += the_hill_tags(obj) if tags.include?("The Hill")
  obj.new_tags += match_key_words(org) #ie, "Democratic", "Republican", etc
  obj.new_tags += match_states(org, "") unless tags.include?("International")
  obj.new_tags += match_countries(org)
  obj.new_tags << convert_tag_to_web_format(tags) unless tags.empty? #ie, The Hill, etc
  obj
end

# Maps a source tag to its web-facing name by prefix; unknown tags pass through.
def convert_tag_to_web_format(tag)
  case tag
  when /^Agencies/ then "Agencies"
  when /^Reporters/ then "Media"
  when /^Industry/ then "Industry"
  when /^Judicial/ then "Judiciary"
  else tag
  end
end

#--the hill

# Derives tags for a "The Hill" record from obj.org, which is expected to look
# like "H-<state> ...", "S-<state> ...", "H-Committee on Something" or a bare
# "Committee on Something". Returns an array of tag strings.
def the_hill_tags(obj)
  org = obj.org.to_s
  prefix = org.match(/^(\D)-\s*(\w*)\s*(\w*)\s*/)
  chamber, first_word, second_word = prefix ? prefix.captures : []

  hill_tags = case chamber
              when "H" then ["House", "Representative"]
              when "S" then ["Senate", "Senator"]
              else []
              end

  # Two words are passed so that two-word states such as "New Mexico" match.
  states = match_states(first_word, second_word)
  if !states.empty?
    hill_tags += states
  elsif org =~ /^(\D)-\w+/ # No state, ie. H-Committee on Something
    hill_tags << org.slice(2, org.size)
  elsif !org.empty? # No preceding letter with dash, ie, "Committee on something"
    hill_tags << org
  end
  hill_tags
end

#-- helpers

# Returns the full names of the states mentioned in "str1 str2": either the
# name appears anywhere, or the postal abbreviation starts a line and is
# followed by "-", whitespace or end of line. nil arguments count as empty.
def match_states(str1, str2="")
  haystack = "#{str1} #{str2}"
  matched = US_STATES.select do |name, abbr|
    haystack.include?(name) || haystack =~ /^#{abbr}(-|\s+|$)/
  end
  names = matched.map { |name, _abbr| name }

  # "West Virginia" contains "Virginia"; keep only the longer, specific name.
  names.reject do |name|
    names.any? { |other| other != name && other.include?(name) }
  end
end

# Returns the known countries/unions whose name starts a line of str.
def match_countries(str)
  text = str.to_s
  ["UK", "New Zealand", "Israel", "Canada", "EU", "Australia"].select do |country|
    text =~ /^#{country}/
  end
end

# Returns, in table order, every KEYWORD_TAGS tag with a pattern found in str.
def match_key_words(str)
  text = str.to_s
  hits = KEYWORD_TAGS.select do |_tag, patterns|
    patterns.any? { |pattern| text =~ pattern }
  end
  hits.map { |tag, _patterns| tag }
end

# run it!
input_path = "govtwit-1.csv"
if File.exist?(input_path)
  parse_csv(input_path).each do |d|
    d.print
    #File.open("govtwit_tags.csv", "a") {|f| f << d.to_csv}
  end
else
  # Report a missing input file plainly rather than dying with a backtrace.
  warn "#{input_path} not found; nothing to tag"
end