Created
February 2, 2014 21:21
-
-
Save Chitrank-Dixit/8775159 to your computer and use it in GitHub Desktop.
Extract email addresses from web pages, text files, etc. using Python's re module (regular expressions)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this is the correct program | |
import re | |
import urllib2 | |
# get_next_target() takes a page's text and returns the email addresses found in it
def get_next_target(page):
    """Return the email addresses found in the text *page*.

    The pattern is a pragmatic approximation, not a full address
    validator: a local part made of word characters, dots and hyphens,
    an ``@``, then a domain of at least two dot-separated labels.
    Requiring the domain to end in a label (not a dot) keeps sentence
    punctuation out of the result, so ``"mail bob@example.com."`` yields
    ``bob@example.com`` and a truncated ``bob@hotmail.`` is skipped.

    Args:
        page: Text to scan. ``None`` or an empty string is treated as
            "nothing to scan".

    Returns:
        A list of the matched addresses in order of appearance, or
        ``None`` when there are no matches (callers test the result
        for truthiness).
    """
    if not page:
        return None
    found = re.findall(r'[\w.-]+@[\w-]+(?:\.[\w-]+)+', page)
    if found:
        return found
    return None
def get_page(s):
    """Download the resource at URL *s* and scan its body.

    The body is read in full, the response is closed, and the text is
    handed to ``print_all_links``; whatever that returns is returned to
    the caller (previously the result was computed and then dropped).

    Args:
        s: URL string passed straight to ``urllib2.urlopen``.

    Returns:
        The value returned by ``print_all_links`` for the page body.
    """
    sourceFile = urllib2.urlopen(s)
    try:
        body = sourceFile.read()
    finally:
        # Release the connection even if the read fails.
        sourceFile.close()
    return print_all_links(body)
def print_all_links(page):
    """Return what ``get_next_target`` finds in *page*, or ``None``.

    Despite the name, nothing is printed here: the text is forwarded to
    ``get_next_target`` and its result is handed back, with any falsy
    result reported as ``None``.
    """
    return get_next_target(page) or None
# Demo: scan a literal string that contains one truncated address
# ("...@hotmail.") and two complete ones, and print the result.
# The single-argument call form of print behaves the same on Python 2
# and is also valid syntax on Python 3.
print(print_all_links('hi this is a chitrankdixit@hotmail. "string" with no chitrankdixit@gmail.com link chitrankdixit@yahoo.com'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment