Skip to content

Instantly share code, notes, and snippets.

@aramk
Created June 30, 2011 12:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save aramk/1056130 to your computer and use it in GitHub Desktop.
Save aramk/1056130 to your computer and use it in GitHub Desktop.
Scrape emails from a given URL in PHP. Using it to invite people to Google+, for now :)
<?php
// Page whose text is scanned for email addresses.
$url = 'http://computerandu.wordpress.com/2011/06/29/how-to-get-google-invite/';
$emails = scrape_email($url);
// scrape_email() may hand back a non-array when the fetch fails; print
// nothing in that case. implode() takes the separator first: the legacy
// array-first order is rejected by PHP 8.
echo is_array($emails) ? implode(' ', $emails) : '';
/**
 * Fetch a page and collect the email addresses found in its text.
 *
 * The page is lowercased, simple "name at host dot tld" obfuscations
 * (each token optionally wrapped in (), [] or <>) are rewritten to
 * "@host.tld", and every address-shaped match is returned as "user@host.tld".
 * Only one host label followed by a three-letter suffix is recognised, so an
 * address such as me@university.nsw.edu.au is not captured in full.
 *
 * @param string $url Address of the page to scan.
 * @return array Lowercase addresses in page order (duplicates are kept);
 *               an empty array when $url is not a string, the fetch fails,
 *               or nothing matches.
 */
function scrape_email($url) {
    if (!is_string($url)) {
        return array();
    }

    //$result = @file_get_contents($url);
    // "@" keeps warnings raised while fetching out of the page output;
    // failure is detected through the non-string return value instead.
    $result = @curl_get_contents($url);
    if (!is_string($result)) {
        return array();
    }

    // Lowercase once so every pattern below can be written in lowercase.
    $result = strtolower($result);

    // Rewrite "at host dot tld" to "@host.tld". The tokens are matched in
    // lowercase (the text was just lowercased) and as whole words, so the
    // "at" inside e.g. "that" is left alone. The real suffix is kept rather
    // than being forced to "com".
    $result = preg_replace(
        '#[(\[<]?\bat\b[)\]>]?\s*([\w\-]+)\s*[(\[<]?\bdot\b[)\]>]?\s*([a-z]{3})\b#',
        '@$1.$2',
        $result
    );
    if (!is_string($result)) {
        // preg_replace() yields null when the regex engine reports an error.
        return array();
    }

    // Groups: 1 = local part (must be non-empty), 2 = host label, 3 = suffix.
    // Whitespace and parentheses around "@" and whitespace around "." are
    // tolerated so lightly spaced-out addresses are still picked up.
    $pattern = '#\b([\w.]+)[\s(]*@[\s)]*([\w\-]{3,})\s*\.\s*([a-z]{3})\b#';
    if (!preg_match_all($pattern, $result, $matches)) {
        // Covers both "no matches" (0) and a regex failure (false).
        return array();
    }

    $emails = array();
    foreach ($matches[1] as $i => $username) {
        $emails[] = $username . '@' . $matches[2][$i] . '.' . $matches[3][$i];
    }
    return $emails;
}
/**
 * Normalise a string by lowercasing it and stripping surrounding whitespace.
 *
 * @param mixed $str Value to normalise.
 * @return string The lowercased, trimmed text, or '' when $str is not a string.
 */
function clean($str) {
    // Anything that is not a string collapses to the empty string.
    if (is_string($str) === false) {
        return '';
    }

    $lowered = strtolower($str);
    return trim($lowered);
}
/**
 * Download the body of a URL with cURL.
 *
 * Follows up to 5 redirects, waits at most 20 seconds for the connection to
 * be established, and sends the caller's own User-Agent header when the
 * current request carries one.
 *
 * @param string $url Address to download.
 * @return string|false The response body, or false when the handle cannot be
 *                      created or the transfer fails.
 */
function curl_get_contents($url) {
    $ch = curl_init($url);
    if ($ch === false) {
        // Without a handle every curl_setopt() call below would fail.
        return false;
    }

    // HTTP_USER_AGENT is absent on the command line and whenever the client
    // sends no such header, so fall back to a fixed identifier.
    $agent = isset($_SERVER['HTTP_USER_AGENT'])
        ? $_SERVER['HTTP_USER_AGENT']
        : 'Mozilla/5.0 (compatible; PHP email scraper)';

    curl_setopt($ch, CURLOPT_HEADER, FALSE);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    // For https connections, we do not require SSL verification.
    // NOTE(review): this accepts any certificate, so the fetched page can be
    // tampered with in transit; enable verification if the result is trusted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);

    $content = curl_exec($ch);
    //$error = curl_error($ch);
    //$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // From PHP 8.0 the handle is an object released automatically and
    // curl_close() does nothing, so only call it on older versions.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }
    return $content;
}
?>
@inderomc
Copy link

inderomc commented Nov 3, 2018

I copied this code into a file on localhost, and the output is empty. Can you guide me a little on how to implement it?

@niclowe
Copy link

niclowe commented Aug 6, 2019

Doesn't capture multi-suffixed email addresses such as me@university.nsw.edu.au - try here - https://www.igssyd.nsw.edu.au/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment