Skip to content

Instantly share code, notes, and snippets.

@dsager

dsager/wb_themes.rb

Created Oct 25, 2016
Embed
What would you like to do?
world-bank theme classifier
#
# Script to load project data from the World Bank and generate a training set
# for a theme classifier.
#
# First fetch the list of projects from the World Bank API, keeping only
# projects that have both an abstract and at least a theme list:
#
# curl -s 'http://search.worldbank.org/api/v2/projects?format=json&fl=project_name,project_abstract,theme_namecode&source=IBRD&rows=50000'
# | jq '[.projects[] | select(.project_abstract? and .theme_namecode?)] | map({"text": [.project_name, .project_abstract.cdata] | join(" - "), "themes": .theme_namecode | map(.code)})'
# > wb-projects.json
#
require 'yaml'
require 'json'
require 'csv'
# Build a lookup of sub-themes keyed by integer sub-theme code.
#
# The YAML file is read as a mapping of top-level theme code => theme, where
# each theme has a 'name' and a 'children' mapping of sub-theme code =>
# sub-theme (which in turn has a 'name'). Every sub-theme gets an entry with:
#   :name      - "[sub_code] sub-theme name" (used for the console report)
#   :full_name - "[code] theme name/[sub_code] sub-theme name" (the CSV label)
#   :count     - number of samples collected so far (starts at 0)
#   :samples   - the collected sample texts (starts empty)
themes = YAML.load_file('config/data/wb_themes.yml').each_with_object({}) do |(code, theme), lookup|
  parent_label = "[#{code}] #{theme['name']}"

  theme['children'].each do |sub_code, sub_theme|
    sub_label = "[#{sub_code}] #{sub_theme['name']}"

    lookup[sub_code.to_i] = {
      name: sub_label,
      full_name: "#{parent_label}/#{sub_label}",
      count: 0,
      samples: []
    }
  end
end
# Load the pre-filtered project list: an array of objects that each carry a
# 'text' string and a 'themes' array of theme codes.
#
# File.read opens and closes the file in one call (the previous
# JSON.load(File.open(...)) never closed the handle), and JSON.parse is the
# strict parser intended for data that originates from an external source.
#
# NOTE(review): the header comment writes the jq output to wb-projects.json,
# while this reads tmp/wb-projects-2.json - confirm which file is intended.
wb_projects = JSON.parse(File.read('tmp/wb-projects-2.json'))
puts "available samples: #{wb_projects.count}"

# Upper bound on the number of samples collected for any single theme.
max_samples_per_theme = 300

# Attach each project's text to every known theme it is tagged with, until
# that theme has reached the cap. Codes that are not in `themes` are skipped.
wb_projects.each do |project|
  project['themes'].each do |theme_code|
    theme = themes[theme_code.to_i]
    next unless theme && theme[:count] < max_samples_per_theme

    theme[:count] += 1
    theme[:samples] << project['text']
  end
end
# Print one row per theme, ordered by ascending sample count:
#   <name padded to 80>| <count padded to 5>| <one dot per 10 samples, padded to 32>
puts 'samples per theme:'
themes.sort_by { |_code, theme| theme[:count] }.each do |_code, theme|
  sample_count = theme[:count]
  bar = '.' * (sample_count / 10).to_i

  columns = [
    theme[:name].ljust(80),
    sample_count.to_s.ljust(5),
    bar.ljust(32)
  ]
  puts columns.join('| ')
end
# Write the training set: one CSV row per collected sample, with the sample
# text in the first column and the theme's full label in the second.
puts 'writing training set to CSV file'
CSV.open('tmp/wb_themes_training_set.csv', 'wb') do |csv|
  themes.each_value do |theme|
    label = theme[:full_name]
    theme[:samples].each { |sample| csv << [sample, label] }
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment