Skip to content

Instantly share code, notes, and snippets.

@Chitrank-Dixit
Created February 2, 2014 21:21
Show Gist options
  • Save Chitrank-Dixit/8775159 to your computer and use it in GitHub Desktop.
Save Chitrank-Dixit/8775159 to your computer and use it in GitHub Desktop.
Extract email addresses from pages, text files, etc. using Python's re module (regular expressions)
# this is the correct program
import re
import urllib2
# get_next_target() takes a page's text and returns the email-like strings found in it
def get_next_target(page):
    """Return every email-like substring found in *page*.

    A match is a run of word characters, dots, or hyphens, followed by
    ``@``, followed by a domain-like run that must end on a word character.

    Args:
        page: The text to scan.

    Returns:
        A list of the matched strings in order of appearance, or ``None``
        when the text contains no match.
    """
    # The domain part must finish on a word character so that sentence
    # punctuation directly after an address ("mail me at a@b.com.") is not
    # captured as part of the address.
    match = re.findall(r'[\w.-]+@[\w.-]*\w', page)
    if match:
        return match
    # Callers test the result for truthiness; keep the historical None
    # (rather than an empty list) for the "nothing found" case.
    return None
def get_page(s):
    """Download the resource at URL *s* and extract matches from its body.

    Args:
        s: The URL to open with ``urllib2.urlopen``.

    Returns:
        Whatever ``print_all_links`` returns for the downloaded body.
    """
    response = urllib2.urlopen(s)
    try:
        body = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    # Hand the result back to the caller instead of discarding it.
    return print_all_links(body)
def print_all_links(page):
    """Return the result of ``get_next_target(page)``, or ``None`` if it is falsy.

    Despite its name, this function prints nothing; it only passes the
    extraction result back to the caller.
    """
    return get_next_target(page) or None
# Demo run: print what print_all_links extracts from a sample sentence.
print(
    print_all_links(
        'hi this is a chitrankdixit@hotmail. "string" with no chitrankdixit@gmail.com link chitrankdixit@yahoo.com'
    )
)
#url, endpos = print_all_links('this is a <a href="http://udacity.com">link!</a> MOre things happens here with all goes good <a href="http://www.trackleech.in/">')
#print (url,endpos)
'''url, endpos = print_all_links(get_page("http://www.xkcd.com/"))
print (url,endpos)'''
#url,endpos=get_page('http://www.xkcd.com/')
#print url,endpos
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment