Skip to content

Instantly share code, notes, and snippets.

@tkuldeep
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tkuldeep/787cec75cba98b0195a5 to your computer and use it in GitHub Desktop.
Save tkuldeep/787cec75cba98b0195a5 to your computer and use it in GitHub Desktop.
Parse HTML Content
include_once('simple_html_dom.php');
// Scrapes one page of the DrupalCon Amsterdam 2014 attendee listing and
// collects, per attendee, the user name, job title, organization and Twitter
// handle into $details (one entry per listing item, keyed by $index).
// TODO there are 38 pages, run this script in 38 loops, change the page value of page.
$html = file_get_html('https://amsterdam2014.drupal.org/attendees?field_badge_first_name_value=&field_badge_last_name_value=&field_badge_org_value=&uid=&field_badge_country_value=All&page=8');
$details = array();
$index = 0;
// file_get_html() returns FALSE when the page cannot be loaded; treat that as
// an empty listing instead of calling find() on a non-object.
$attendee_items = $html ? $html->find('div.view-id-attendees ul li') : array();
foreach ($attendee_items as $li) {
    // Per-attendee state. Resetting it here keeps the previous attendee's
    // Twitter handle and profile document from leaking into this one when the
    // current item has no profile link.
    $twitter = 'twitter_not';
    $amsterdam_prof_html = FALSE;

    // User name.
    foreach ($li->find('div.views-field-name') as $field) {
        $details[$index]['username'] = $field->plaintext;
    }
    // Job title.
    foreach ($li->find('div.views-field-field-badge-job-title') as $field) {
        $details[$index]['job'] = $field->plaintext;
    }
    // Organization.
    foreach ($li->find('div.views-field-field-badge-org') as $field) {
        $details[$index]['organization'] = $field->plaintext;
    }

    // Twitter handle, taken from the attendee's profile on the conference site.
    foreach ($li->find('a.username') as $profile_link) {
        $text = $profile_link->href;
        if (!trim($text)) {
            continue;
        }
        // Release the previously loaded profile before replacing it.
        if ($amsterdam_prof_html) {
            $amsterdam_prof_html->clear();
        }
        $amsterdam_prof_html = file_get_html('https://amsterdam2014.drupal.org' . $text);
        if (!$amsterdam_prof_html) {
            // Profile page could not be fetched; skip it rather than fail.
            continue;
        }
        //TODO fetch the data from the amsterdam page, if twitter link not found then from the do profile.
        foreach ($amsterdam_prof_html->find('div.field--name-field-twitter-handle .field__items') as $handle) {
            $twitter = $handle->plaintext;
            $details[$index]['twitter'] = $twitter;
        }
    }

    // If the Twitter handle is not present on the Amsterdam page, look for a
    // twitter.com link on the pages linked from the attendee's profile items
    // (the drupal.org profile).
    if ($twitter === 'twitter_not' && $amsterdam_prof_html) {
        foreach ($amsterdam_prof_html->find('div.user-profile-item div.user-profile-item__items a') as $do_link) {
            $do_url = $do_link->href;
            if (!trim($do_url)) {
                continue;
            }
            $do_html = file_get_html($do_url);
            if (!$do_html) {
                continue;
            }
            foreach ($do_html->find('div#user_user_full_group_profile_main a') as $twitter_do) {
                // Capture group 3 is the handle. Requiring at least one
                // character and stopping at "/", "?" or "#" avoids storing a
                // bare "@" or a handle polluted with a query string.
                if (preg_match("/https?:\/\/(www\.)?twitter\.com\/(#!\/)?@?([^\/?#]+)/", $twitter_do->href, $matches)) {
                    $twitter = '@' . $matches[3];
                    $details[$index]['twitter'] = $twitter;
                }
            }
            // Free the parsed document before the next fetch.
            $do_html->clear();
        }
    }

    if ($amsterdam_prof_html) {
        $amsterdam_prof_html->clear();
    }
    $index++;
}
// Keep only the attendees for which every field (user name, job title,
// organization and Twitter handle) was scraped, then dump them with dpm().
$users = array();
// Opened in append mode; this creates the file if it does not exist yet.
// fopen() returns FALSE on failure, so the handle is checked before closing.
$fp = fopen('drupal_users_21.csv', "a");
foreach ($details as $user) {
    if (isset($user['username'], $user['job'], $user['organization'], $user['twitter'])) {
        // CSV export is currently disabled. When re-enabling it, use fputcsv()
        // so that commas and quotes inside the values are escaped correctly:
        // if ($fp !== FALSE) {
        //     fputcsv($fp, array(trim($user['username']), trim($user['job']), trim($user['organization']), trim($user['twitter'])));
        // }
        $users[] = $user;
    }
}
if ($fp !== FALSE) {
    fclose($fp);
}
dpm($users);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment