Skip to content

Instantly share code, notes, and snippets.

@tim-peterson
Last active July 30, 2016 01:14
Show Gist options
  • Save tim-peterson/36271751df0c4858b3df985bf194d431 to your computer and use it in GitHub Desktop.
Save tim-peterson/36271751df0c4858b3df985bf194d431 to your computer and use it in GitHub Desktop.
// Seed the crawl: load a fixed id range of schools and start one Bing search
// per school for its student-organizations page.
connection.query('SELECT * from schools where id > 4479 && id < 4490;', function (queryError, schools, fields) {
  if (queryError) throw queryError;
  var lastIndex = schools.length - 1;
  schools.forEach(function (school, index) {
    var searchUrl = "http://www.bing.com/search?q=" + encodeURIComponent(school.name + " student organizations");
    // The school's row id is passed along as the parent organisation of every club found.
    bingIt(searchUrl, school.name, school.id);
    // Raise the flag once the last search has been dispatched.
    if (index == lastIndex) close_connection = true;
  });
});
/**
 * Returns the href of the first node in a cheerio selection, or '' when the
 * selection is missing/empty or the node carries no string href attribute.
 */
function club_link_href(link) {
  var node = link ? link[0] : undefined;
  var href = (node && node.attribs) ? node.attribs.href : '';
  return typeof href == 'string' ? href : '';
}

/**
 * Fetches one club page and stores what can be scraped from it.
 *
 * First pass (second_pass falsy): scrapes phone/address/email from the page and
 * INSERTs a row into organization_leads_clubs with an empty description. When no
 * email was found, the first link on the page whose href contains "contact" is
 * crawled as a second pass against the freshly inserted row.
 *
 * Second pass (second_pass is the URL of the first-pass page): additionally takes
 * the longest text block on the page as the description and UPDATEs the row
 * identified by row_to_update.
 *
 * @param {Array} links cheerio selections (each exposing [0].attribs.href and .text()).
 * @param {number} l index into links of the selection to process.
 * @param {*} parent_org id stored in the parent_org column.
 * @param {string|boolean} second_pass false on the first pass, otherwise the base URL
 *   used to prefix non-http hrefs.
 * @param {*} row_to_update id of the row to update on the second pass.
 */
function get_each_club_page(links, l, parent_org, second_pass, row_to_update){
  var link = links[l];
  var indiv_club_link = club_link_href(link);
  if (indiv_club_link == '') return;
  if (second_pass && indiv_club_link.substring(0, 4) !== "http") {
    // NOTE(review): plain concatenation, not real URL resolution — hrefs such as
    // "/contact" or "mailto:..." will produce unusable URLs; confirm whether that matters.
    indiv_club_link = second_pass + '/' + indiv_club_link;
  }
  var name = link.text().replace(/\s+/g, ' ').trim();
  if (name == '') return;
  var contact_link = indiv_club_link;

  request(indiv_club_link, function (error, response, body) {
    // Best-effort crawl: pages that fail to load are skipped.
    if (error || response.statusCode != 200) return;

    var $ = cheerio.load(body);

    // Only the second pass derives a description: the longest whitespace-normalised
    // text found in any paragraph/div/list element.
    var description = '';
    if (second_pass) {
      $('p, p *, div, div *, li, li *').each(function () {
        var text = $(this).text().replace(/\s+/g, ' ').trim();
        if (text.length > description.length) description = text;
      });
    }

    var innerText = second_pass ? $('*').text() : $('body').text();

    // address_regex is shared across calls; if it is declared with the g/y flag a
    // stale lastIndex would make exec() start mid-string, so reset it first.
    address_regex.lastIndex = 0;
    var address_arr = address_regex.exec(innerText);
    var address = (address_arr != null && address_arr.length > 0) ? address_arr[0] : '';

    var numbers = new PhoneNumberParser();
    numbers.parse(innerText);
    // Keep only candidates of at least 9 characters.
    var phone = (numbers.items || []).filter(function (item) {
      return item.length >= 9;
    }).join();

    var emails = extractEmails(innerText);
    var has_email = emails != null;
    // Without an email a random placeholder is stored instead
    // (presumably to satisfy a unique constraint on the column — TODO confirm).
    var email_value = has_email ? emails.join() : 'null__randomizer__' + Math.random();

    if (!second_pass) {
      var insert_values = [name, phone, address, email_value, parent_org, description, contact_link];
      connection.query('INSERT organization_leads_clubs SET name = ?,phone = ?,address = ?,email = ?,parent_org = ?,description = ?,website = ?', insert_values, function (err, rows, fields) {
        if (err) {
          console.error('INSERT organization_leads_clubs failed for ' + contact_link + ': ' + err);
          return;
        }
        if (!has_email) {
          // Follow the first "contact" link once; returning false stops the iteration.
          $('a').each(function () {
            var $anchor = $(this);
            if (club_link_href($anchor).toLowerCase().indexOf('contact') > -1) {
              get_each_club_page([$anchor], 0, parent_org, contact_link, rows.insertId);
              return false;
            }
          });
        }
        console.log('INSERT organization_leads_clubs first pass');
      });
    } else {
      // NOTE(review): this writes a contact_link column while the INSERT writes
      // website — verify both columns exist in the table.
      var update_values = [phone, address, email_value, description, contact_link, row_to_update];
      connection.query('UPDATE organization_leads_clubs SET phone = ?,address = ?,email = ?,description = ?,contact_link = ? where id = ?', update_values, function (err, rows, fields) {
        if (err) {
          console.error('UPDATE organization_leads_clubs failed for row ' + row_to_update + ': ' + err);
          return;
        }
        console.log('UPDATE organization_leads_clubs' + second_pass);
      });
    }
  });
}
/**
 * Chooses which Bing result to crawl from the result-title anchors.
 * Among the first nine results, the first href containing "organizations" or
 * "clubs" (case-insensitive) wins; otherwise the first result's href is used.
 * Returns '' when no usable href exists.
 */
function pick_club_directory_url(all_links) {
  function href_of(node) {
    return (node && node.attribs && typeof node.attribs.href == 'string') ? node.attribs.href : '';
  }
  for (var a = 0; a < 9; a++) {
    var href = href_of(all_links[a]);
    var lowered = href.toLowerCase();
    if (lowered.indexOf('organizations') > -1 || lowered.indexOf('clubs') > -1) {
      return href;
    }
  }
  return href_of(all_links[0]);
}

/**
 * Runs a (throttled) Bing search, picks the most promising result page, fetches it
 * and hands every club link found on it to get_each_club_page as a first pass.
 *
 * @param {string} url Bing search URL to request.
 * @param {string} school_name school being searched for (used in error logging only).
 * @param {*} parent_org id forwarded to get_each_club_page as the clubs' parent organisation.
 */
function bingIt(url, school_name, parent_org){
  throttledRequest({url : url, headers : headers}, function (error, response, body) {
    if (error) {
      console.error('Bing search failed for ' + school_name + ': ' + error);
      return;
    }
    var $ = cheerio.load(body);
    var website = pick_club_directory_url($(".b_algo > h2 > a"));
    if (website == '') {
      // No usable search result: nothing to crawl for this school.
      return;
    }
    request(website, function (error, response, body) {
      // Best-effort crawl: result pages that fail to load are skipped.
      if (error || response.statusCode != 200) return;
      var $ = cheerio.load(body);
      var links = find_club_links($);
      for (var l = 0; l < links.length; l++) {
        get_each_club_page(links, l, parent_org, false, false);
      }
    });
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment