cglee (owner)

Revisions

  • da9d48 Chris Lee Tue Jun 09 14:24:46 -0700 2009
  • 31ed7d Chris Lee Tue Jun 09 14:12:04 -0700 2009
  • f47d50 cglee Tue Jun 09 13:53:08 -0700 2009
gist: 126778 Download_button fork
public
Public Clone URL: git://gist.github.com/126778.git
Embed All Files: show embed
parse_govtwit.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
require 'rubygems'
require 'fastercsv'
 
# One parsed record plus the tags derived for it.
#
# All six attributes are plain read/write accessors. `tags` holds the
# original tag value given to the constructor; `new_tags` holds the derived
# tags and is what `to_csv` writes out.
class ParseObject
  attr_accessor :name, :org, :twitter_name, :site, :tags, :new_tags

  # Every argument is optional and defaults to an empty string.
  def initialize(name="", org="", twitter_name="", site="", tags="", new_tags="")
    @name = name
    @org = org
    @twitter_name = twitter_name
    @site = site
    @tags = tags
    @new_tags = new_tags
  end

  # Returns the record as a single CSV line terminated by "\n":
  # name, org, twitter_name, site, and the joined new_tags.
  # Every field is double-quoted; embedded double quotes are doubled so a
  # value containing `"` cannot break the row apart.
  def to_csv
    fields = [@name, @org, @twitter_name, @site, display_array(@new_tags)]
    fields.map { |field| quote_csv_field(field) }.join(",") + "\n"
  end

  # Dumps every attribute to stdout for debugging.
  # Uses `p`, so each line is printed in inspected (quoted) form.
  # Note: this shadows Kernel#print for ParseObject instances.
  def print
    p "-----------------------"
    p "name: #{@name}"
    p "org: #{@org}"
    p "twitter_name: #{@twitter_name}"
    p "site: #{@site}"
    p "tags: #{display_array(@tags)}"
    p "new_tags: #{display_array(@new_tags)}"
    p "-----------------------"
  end

  # Joins a tag collection into a ", "-separated string.
  #
  # arr may be nil, a single value (e.g. a String), or a possibly nested
  # Array. Nested arrays are flattened, nil and empty entries are dropped,
  # and duplicates are removed (first occurrence wins).
  # The argument is never modified.
  def display_array(arr)
    # Array() instead of #to_a: String#to_a no longer exists on Ruby >= 1.9.
    items = Array(arr).flatten
    items = items.reject { |item| item.nil? || item.to_s.empty? }
    items.uniq.join(", ")
  end

  private

  # Wraps a value in double quotes, doubling any embedded double quote.
  def quote_csv_field(value)
    "\"#{value.to_s.gsub('"', '""')}\""
  end
end
 
#--kickin it off
# Reads the CSV file at path_to_csv and returns an Array with one tagged
# ParseObject per row. Columns 0-4 of each row are passed to
# ParseObject.new in order (name, org, twitter_name, site, tags) and the
# resulting object is run through tagify. Every row is processed; no header
# row is skipped.
def parse_csv(path_to_csv)
  FasterCSV.read(path_to_csv).map do |row|
    name, org, twitter_name, site, tags = row.values_at(0, 1, 2, 3, 4)
    tagify(ParseObject.new(name, org, twitter_name, site, tags))
  end
end
 
# Fills obj.new_tags with tags derived from obj.org and obj.tags, then
# returns obj itself.
#
# Tags are appended in this order:
#   1. the_hill_tags(obj), only when obj.tags contains "The Hill"
#   2. keyword tags found in obj.org (ie, "Democratic", "Republican", etc)
#   3. state names found in obj.org, skipped when obj.tags contains
#      "International"
#   4. country names found in obj.org
#   5. obj.tags converted to its web-format name (ie, The Hill, etc)
def tagify(obj)
  obj.new_tags = []
  derived = obj.new_tags

  derived.concat(the_hill_tags(obj)) if obj.tags.include?("The Hill")
  derived.concat(match_key_words(obj.org))
  derived.concat(match_states(obj.org, "")) unless obj.tags.include?("International")
  derived.concat(match_countries(obj.org))
  derived.push(convert_tag_to_web_format(obj.tags))

  obj
end
 
# Maps a source tag to the name used on the web side.
#
# A tag starting with "Agencies", "Reporters", "Industry" or "Judicial" is
# replaced by "Agencies", "Media", "Industry" or "Judiciary" respectively.
# Any other value (including nil) is returned unchanged.
def convert_tag_to_web_format(tag)
  # `when ... then` rather than the `when ... :` form, which Ruby 1.9+
  # rejects as a syntax error.
  case tag
  when /^Agencies/  then "Agencies"
  when /^Reporters/ then "Media"
  when /^Industry/  then "Industry"
  when /^Judicial/  then "Judiciary"
  else tag
  end
end
 
#--the hill
# Builds the tag list for a "The Hill" entry from obj.org.
#
# obj.org is matched against "<letter>-<word> <word>":
#   * a leading "H" adds "House" and "Representative"; a leading "S" adds
#     "Senate" and "Senator"; anything else adds no chamber tags.
#   * the two words after the dash are passed to match_states (two words
#     are needed for say, "New Mexico"); any states found are added.
#   * when no state is found and org has the "<letter>-<text>" shape
#     (ie. "H-Committee on Something"), the text after the first two
#     characters is added as a tag.
#   * otherwise (no preceding letter with dash, ie. "Committee on
#     something") obj.org itself is added.
#
# Returns a new Array of tags; obj is not modified.
def the_hill_tags(obj)
  obj.org =~ /^(\D)-\s*(\w*)\s*(\w*)\s*/
  # Copy the captures into locals right away: the second =~ further down
  # overwrites $1..$3. All three are nil when the pattern did not match.
  chamber, first_word, second_word = $1, $2, $3

  arr = case chamber
        when "H" then ["House", "Representative"]
        when "S" then ["Senate", "Senator"]
        else []
        end

  states = match_states(first_word, second_word)
  if !states.empty?
    arr += states
  elsif obj.org =~ /^(\D)-\w+/ # No state, ie. H-Committee on Something
    arr << obj.org.slice(2, obj.org.size)
  else # No preceding letter with dash, ie, "Committee on something"
    arr << obj.org
  end
  arr
end
 
#-- helpers
# [full name, postal abbreviation] pairs, ordered by abbreviation.
# match_states returns names in this order.
US_STATES = [
  ["Alaska", "AK"], ["Alabama", "AL"], ["Arkansas", "AR"], ["Arizona", "AZ"],
  ["California", "CA"], ["Colorado", "CO"], ["Connecticut", "CT"], ["District of Columbia", "DC"],
  ["Delaware", "DE"], ["Florida", "FL"], ["Georgia", "GA"], ["Hawaii", "HI"], ["Iowa", "IA"],
  ["Idaho", "ID"], ["Illinois", "IL"], ["Indiana", "IN"], ["Kansas", "KS"], ["Kentucky", "KY"],
  ["Louisiana", "LA"], ["Massachusetts", "MA"], ["Maryland", "MD"], ["Maine", "ME"], ["Michigan", "MI"],
  ["Minnesota", "MN"], ["Missouri", "MO"], ["Mississippi", "MS"], ["Montana", "MT"], ["North Carolina", "NC"],
  ["North Dakota", "ND"], ["Nebraska", "NE"], ["New Hampshire", "NH"], ["New Jersey", "NJ"],
  ["New Mexico", "NM"], ["Nevada", "NV"], ["New York", "NY"], ["Ohio", "OH"], ["Oklahoma", "OK"],
  ["Oregon", "OR"], ["Pennsylvania", "PA"], ["Rhode Island", "RI"], ["South Carolina", "SC"], ["South Dakota", "SD"],
  ["Tennessee", "TN"], ["Texas", "TX"], ["Utah", "UT"], ["Virginia", "VA"], ["Vermont", "VT"],
  ["Washington", "WA"], ["Wisconsin", "WI"], ["West Virginia", "WV"], ["Wyoming", "WY"]
].freeze

# Returns the full names of every state mentioned in str1 followed by str2.
#
# str1, str2 -- text to search; nil is treated as "". The two are joined
#               with a single space so a name split across them
#               ("New" + "Mexico") is still found.
#
# A state matches when either
#   * its full name occurs anywhere in the joined text (case-sensitive), or
#   * its postal abbreviation starts a line of the joined text and is
#     followed by "-", whitespace, or end of line (ie. "TX-22").
#
# A name that only occurs inside a longer state name does not count, so
# "West Virginia" yields ["West Virginia"], not also "Virginia".
#
# Returns an Array of names in US_STATES order (empty when nothing matches).
def match_states(str1, str2="")
  str1 ||= ""
  str2 ||= ""
  # Searching the joined text alone is enough: anything that matches in
  # str1 also matches in str1 + " " + str2.
  text = str1 + " " + str2

  matched = US_STATES.select do |name, abbr|
    text =~ /^#{abbr}(-|\s+|$)/ || state_name_standalone?(text, name)
  end
  matched.map { |name, _abbr| name }
end

# True when name occurs in text somewhere other than inside a longer state
# name that contains it.
def state_name_standalone?(text, name)
  return false unless text.include?(name)

  longer_names = US_STATES.map { |full, _abbr| full }.select do |full|
    full != name && full.include?(name)
  end
  # Blank out the longer names, then see whether the short one is still there.
  remainder = longer_names.inject(text) { |rest, full| rest.gsub(full, " ") }
  remainder.include?(name)
end
 
# Returns the entries of a fixed list of countries/regions ("UK",
# "New Zealand", "Israel", "Canada", "EU", "Australia") that str starts a
# line with, in list order. The comparison is case-sensitive.
def match_countries(str)
  candidates = ["UK", "New Zealand", "Israel", "Canada", "EU", "Australia"]
  candidates.select { |candidate| str =~ /^#{candidate}/ }
end
 
# Returns the keyword tags whose pattern(s) occur in str, in the order the
# rules are listed below. A rule fires when any one of its patterns matches;
# patterns without /i are case-sensitive.
def match_key_words(str)
  rules = [
    ["Democratic",                         [/democrat/i]],
    ["Republican",                         [/republican/i]],
    ["Representative",                     [/representative/i]],
    ["Virginia",                           [/Virgina/i]], # correct a misspelling
    ["fire department",                    [/fire/i]],
    ["police department",                  [/police/i]],
    ["NASA",                               [/NASA/i]],
    ["Coast Guard",                        [/Coast Guard/i, /USCG/]],
    ["Center for Disease Control",         [/CDC/, /Disease Control/i]],
    ["Air Force",                          [/USAF/, /Air Force/i]],
    ["Department of Defense",              [/Department of Defense/i, /DOD/]],
    ["Department of Energy",               [/Department of Energy/i]],
    ["Department of Homeland Security",    [/Department of Homeland Security/i]],
    ["General Services Administration",    [/General Services Administration/i]],
    ["Health & Human Services",            [/Health & Human Service/i]],
    ["Navy",                               [/Navy/i]],
    ["Army",                               [/Army/i]],
    ["Securities and Exchange Commission", [/SEC/, /Securities and Exchange/i]],
    ["State Department",                   [/State Department/]]
  ]

  fired = rules.select { |_tag, patterns| patterns.any? { |pattern| str =~ pattern } }
  fired.map { |tag, _patterns| tag }
end
 
 
# run it!
# Parses govtwit-1.csv from the current directory and dumps every tagged
# record to stdout. The commented-out line would instead append each record
# to govtwit_tags.csv.
parse_csv("govtwit-1.csv").each do |record|
  record.print
  #File.open("govtwit_tags.csv", "a") {|f| f << record.to_csv}
end