Skip to content

Instantly share code, notes, and snippets.

Created July 25, 2014 05:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save anonymous/ea17299fcd86d8af2be5 to your computer and use it in GitHub Desktop.
Save anonymous/ea17299fcd86d8af2be5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
import re
import json
from bs4 import BeautifulSoup as bs
from urllib2 import urlopen
__authors__ = ['haxwithaxe me@haxwithaxe.net']
__license__ = 'GPLv3'
# Credit line embedded in every JSON dump produced by EmailToTxt.json().
_thanks = ''' Special thanks to Ukraine Calling (http://www.ukrainecalling.com) for assembling the list that this script uses as a data source. '''
# Default page to scrape; used as the default ``url`` of EmailToTxt.
_source = 'http://www.ukrainecalling.com/email-to-text.aspx'


class EmailToTxt(object):
    ''' Scrapes a table of email-to-text gateways and exposes it as json.

    The page at ``url`` is downloaded and parsed when the object is
    constructed. The result is kept in ``self.countries``, a dict mapping a
    lowercased country name to a list of gateway entries (see _get_entry).
    '''

    def __init__(self, url=_source):
        ''' downloads and parses the gateway table right away
        @param url page to scrape, defaults to the module level _source
        '''
        self.url = url
        self.soup = None
        self.countries = {}
        # Country whose header row was seen last; gateway rows are filed
        # under it.
        self.country = None
        self.scrape()
        self.parse()

    def json(self, country=None):
        ''' dumps scraped data as json
        @param country optional country name (case insensitive) to dump only
            the values for that country
        @returns json string with the keys 'credits', 'source' and 'data'.
            'data' is null when the requested country is unknown; no
            exception is raised in that case.
        '''
        if country:
            data = self.countries.get(country.lower())
        else:
            data = self.countries
        data = {'credits': _thanks, 'source': self.url, 'data': data}
        return json.dumps(data)

    def __str__(self):
        ''' same as json() without a country '''
        return self.json()

    def scrape(self):
        ''' downloads self.url and stores the parsed document in self.soup '''
        response = urlopen(self.url)
        # The urllib2 response object is not a context manager, so close it
        # explicitly even when read() fails.
        try:
            markup = response.read()
        finally:
            response.close()
        self.soup = bs(markup)

    def parse(self):
        ''' feeds every row of the table with class "tblcn" to parse_row
        NOTE: when the document has no such table the find() result has no
        find_all and this fails with AttributeError.
        '''
        table = self.soup.find('table', class_='tblcn')
        for row in table.find_all('tr'):
            self.parse_row(row)

    def parse_row(self, row):
        ''' dispatches a table row by its attributes
        Rows carrying a "class" attribute start a new country, rows carrying
        an "itemprop" attribute describe a gateway; other rows are ignored.
        @param row a table row element
        '''
        if row.get('class'):
            self._add_country(row)
        elif row.get('itemprop'):
            self._add_gateway(row)

    def _add_country(self, row):
        ''' starts a new (empty) gateway list named after the row's h3 text '''
        self.country = row.find('h3').text.lower()
        self.countries[self.country] = []

    def _add_gateway(self, row):
        ''' appends the row's entry to the current country
        @throws KeyError if no country row has been seen yet
        '''
        self.countries[self.country].append(self._get_entry(row))

    def _get_entry(self, row):
        ''' builds the entry dict for one gateway row
        The first three cells are read as provider, gateway and notes, in
        that order. Missing cells leave their values as None and any cells
        beyond the third are ignored.
        @param row a table row element
        @returns dict with the keys provider, gateway_raw, gateway, notes
            and number_format
        '''
        handlers = (self._provider, self._gateway, self._notes)
        entry = {'provider': None, 'gateway_raw': None, 'gateway': None,
                 'notes': None, 'number_format': None}
        # zip stops at the shorter sequence, so surplus cells cannot index
        # past the handler list.
        for handler, cell in zip(handlers, row.find_all('td')):
            handler(entry, cell)
        return entry

    def _provider(self, entry, item):
        ''' stores the cell text as the provider name '''
        entry['provider'] = item.text

    def _gateway(self, entry, item):
        ''' stores the raw cell text, the address template and the number
        length description derived from the gateway cell '''
        fmt, digits = self._gateway_format(item)
        entry['gateway_raw'] = item.text
        entry['gateway'] = fmt
        entry['number_format'] = '%s digit number' % digits

    def _gateway_format(self, col):
        ''' turns a gateway cell into an address template
        A cell with two children is read as (number, gateway), one with
        three children as (prefix, number, gateway). Any other child count
        yields an empty prefix/gateway and a digit count of "0".
        @param col the gateway cell
        @returns tuple (fmt, digits); fmt contains a literal "%(number)s"
            placeholder, digits is the length of the number child's text as
            a string
        '''
        parts = col.contents
        prefix = number = gateway = ''
        if len(parts) == 2:
            number, gateway = parts[0].text, parts[1]
        elif len(parts) == 3:
            prefix, number, gateway = parts[0], parts[1].text, parts[2]
        # "%%" survives this formatting step as a single "%", leaving a
        # "%(number)s" placeholder in the result for the caller to fill.
        fmt = '%s%%(number)s@%s' % (prefix, gateway)
        digits = str(len(number))
        return fmt, digits

    def _notes(self, entry, item):
        ''' stores the cell text as the notes '''
        entry['notes'] = item.text
if __name__ == '__main__':
    # Run as a script: scrape the default source and write the complete
    # dataset to stdout as json (printing relies on EmailToTxt.__str__).
    scraper = EmailToTxt()
    print(scraper)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment