Skip to content

Instantly share code, notes, and snippets.

@adamlutz
Last active June 2, 2016 18:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamlutz/e2a7f4ded0e5b94aecb70a6f90e7fd2b to your computer and use it in GitHub Desktop.
Save adamlutz/e2a7f4ded0e5b94aecb70a6f90e7fd2b to your computer and use it in GitHub Desktop.
mechanized cancer.gov search
require 'rubygems'
require 'csv'
require 'uri'
require 'mechanize'
require 'pry'
require 'ruby-progressbar'
# HTTP client used for every request in this script; it is configured with
# Mechanize's 'Mac Safari' user-agent alias.
agent = Mechanize.new do |client|
  client.user_agent_alias = 'Mac Safari'
end
# Answer lists scraped from the "risk" form, keyed by field name. Every answer
# is stored as a [value, label] pair.
query_options = {}
biopsy_options = {}

agent.get('http://www.cancer.gov/bcrisktool/Default.aspx') do |landing_page|
  risk_form = landing_page.form_with(name: 'risk')

  risk_form.fields.each do |form_field|
    choices = form_field.options.flatten.map { |option| [option.value, option.text] }
    # Drop blank values and the "Select" / "< 35" entries.
    kept = choices.reject { |value, label| value == '' || label == 'Select' || label == '< 35' }
    query_options[form_field.name] = kept
  end

  # Trim the option universe so the full run takes less time.
  %w[history genetics].each { |dropped_field| query_options.delete(dropped_field) }

  # Sub-race answers are appended to the race list, so they are requested
  # under the "race" parameter.
  query_options['race'] += query_options['subrace']
  query_options.delete('subrace')

  # Biopsy handling: this run only queries the "Yes" answer for
  # ever_had_biopsy. Alternatives that were tried and left disabled:
  # biopsy_options['previous_biopsies'] = query_options['previous_biopsies']
  # biopsy_options['biopsy_with_hyperplasia'] = query_options['biopsy_with_hyperplasia']
  # query_options['ever_had_biopsy'] = [ ["0", "No"] ]
  # query_options['previous_biopsies'] = [ ["0", "No"] ]
  # query_options['biopsy_with_hyperplasia'] = [ ["0", "No"] ]
  biopsy_options['ever_had_biopsy'] = [%w[1 Yes]]
  query_options['ever_had_biopsy'] = biopsy_options['ever_had_biopsy']
end
# Expands a hash of option lists into every possible combination.
#
# @param hsh [Hash{Object => Array}] key => list of candidate values
# @return [Array<Hash>] one hash per combination, each mapping every key of
#   +hsh+ to a single candidate from its list; an empty +hsh+ yields the
#   single empty combination (+[{}]+)
def permutation_hash(hsh)
  # Without this guard the first value list would be nil and calling
  # #product on it would raise NoMethodError.
  return [{}] if hsh.empty?

  keys = hsh.keys
  first_choices, *other_choices = hsh.values
  first_choices.product(*other_choices).map { |combo| keys.zip(combo).to_h }
end
# Builds the RiskAssessment.aspx URL for one combination of form answers.
#
# @param query_options [Hash{String => Array}] field name => answer; only the
#   first element of each answer (the option value) goes into the URL
# @return [String] the URL, always starting its query with +genetics=0+
def build_url(query_options)
  # Form-encode names and values so characters such as spaces, "&" or "="
  # cannot corrupt the query string.
  pairs = query_options.map { |name, choice| [name, choice.first.to_s] }
  "http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&#{URI.encode_www_form(pairs)}"
end
# Extracts the numbers that are immediately followed by "%" from the text of
# the page's <ul class="gray-bg"> element(s).
#
# @param doc [#xpath] parsed document; the xpath result must respond to #text
# @return [Array<String>] the numeric parts (without "%"), in document order
def parsed_percentages(doc)
  raw_summary = doc.xpath("//ul[@class='gray-bg']").text
  # Flatten line breaks and collapse whitespace runs into single spaces.
  one_line = raw_summary.strip.tr("\r\n", '  ').gsub(/\s+/, ' ')
  spaced = one_line.gsub('%Average', '% Average')
  spaced.scan(/([0-9]*\.[0-9]+|[0-9]+)%/).flatten
end
# Every combination of the scraped answers; one request is made per entry.
sample = permutation_hash(query_options)
# sample = sample.slice(0,20)
p "about to mechanize #{sample.count} requests to cancer.gov!"

# Progress bar sized to the number of requests.
progressbar = ProgressBar.create(
  format: '%a %bᗧ%i %p%% %t',
  progress_mark: ' ',
  remainder_mark: '・',
  total: sample.count,
  starting_at: 0
)
# Request parameters written to the CSV, in column order.
answer_columns = %w[
  current_age
  age_at_menarche
  age_at_first_live_birth
  related_with_breast_cancer
  ever_had_biopsy
  previous_biopsies
  biopsy_with_hyperplasia
  race
]
# Header names for the values that follow the answers in each row.
result_columns = ['risk %', 'average risk %', 'to age 90 risk %', 'average to age 90 risk %', 'url']

# Fetch one result page per combination and write one CSV row for it.
CSV.open('with_previous_biopsy_results.csv', 'w') do |csv|
  csv << answer_columns + result_columns

  sample.each do |unique_params|
    url = build_url(unique_params)
    doc = agent.get(url).parser

    # Each parameter is a [value, label] pair; the CSV records the label.
    answers = answer_columns.map { |column| unique_params[column].last }
    # Disabled extra columns: summary_text, informational_text
    csv << answers + parsed_percentages(doc) + [url]

    progressbar.increment
  end
end
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=35&age_at_menarche=99&age_at_first_live_birth=99&ever_had_biopsy=0&previous_biopsies=0&biopsy_with_hyperplasia=0&related_with_breast_cancer=99&race=2
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=36&age_at_menarche=13&age_at_first_live_birth=0&ever_had_biopsy=0&previous_biopsies=0&biopsy_with_hyperplasia=0&related_with_breast_cancer=0&race=8
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=35&age_at_menarche=99&age_at_first_live_birth=99&related_with_breast_cancer=99&ever_had_biopsy=0&previous_biopsies=1&biopsy_with_hyperplasia=0&race=7
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=36&age_at_menarche=13&age_at_first_live_birth=0&ever_had_biopsy=0&previous_biopsies=1&biopsy_with_hyperplasia=0&related_with_breast_cancer=0&race=8
# doc = agent.get(url).parser
# info_text_raw = doc.xpath("//p").text
# end_of_info_str = " Home | Contact Us | Policies | Accessibility U.S. Department of Health and Human Services  |  National Institutes of Health  |  National Cancer Institute  |  USA.gov NIH…Turning Discovery Into Health®"
# informational_text = info_text_raw.slice(info_text_raw.index("Based")..-1).strip.gsub(/\r/," ").gsub(/\n/," ").gsub(/\s+/, ' ').chomp(end_of_info_str)
# with sub-race
# http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=36&age_at_menarche=13&age_at_first_live_birth=0&ever_had_biopsy=0&previous_biopsies=1&biopsy_with_hyperplasia=0&related_with_breast_cancer=0&race=8&asian=It%20has%20been%20observed%20that%20recent%20immigrants%20from%20rural%20Asia%20may%20have%20a%20lower%20risk%20of%20breast%20cancer%20than%20calculated.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment