Instantly share code, notes, and snippets.

Embed
What would you like to do?
Download Fama French industry classifications
import re
from io import BytesIO
from zipfile import ZipFile

import pandas as pd
import requests
def download_ffind_zip(ind_num):
    """Download a Fama-French SIC-code definition file and return its text.

    Fetches ``Siccodes<ind_num>.zip`` from Ken French's FTP directory on the
    Dartmouth server, then reads and decodes the ``Siccodes<ind_num>.txt``
    member of that archive.

    Args:
        ind_num: Number used to build both the archive name and the name of
            the text member inside it (e.g. ``48``).

    Returns:
        The decoded contents of the text member as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
        KeyError: If the archive does not contain the expected text member.
    """
    zip_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes{}.zip'
               .format(ind_num))
    # A timeout prevents the call from blocking forever on a stalled server.
    response = requests.get(zip_url, timeout=30)
    # Fail on HTTP errors here; otherwise an HTML error page would be handed
    # to ZipFile and surface as a confusing BadZipFile.
    response.raise_for_status()
    # The context manager closes the archive once the member has been read.
    with ZipFile(BytesIO(response.content)) as archive:
        return archive.read('Siccodes{}.txt'.format(ind_num)).decode()
def _parse_ffind_text(data, ind_num):
    """Map every SIC code in 0..9999 to an industry record parsed from ``data``.

    Two kinds of lines are recognised after stripping surrounding whitespace:

    * header lines: ``<1-2 digit number> <alphabetic short name> <detail>``,
      which start a new industry;
    * range lines: ``<from>-<to> [notes]`` with 3-4 digit bounds, which assign
      every SIC code in the inclusive range to the most recent industry.

    Codes not covered by any range get the first industry whose short name is
    "other" (case-insensitive); if the text has no such industry, a synthetic
    record numbered ``ind_num`` and named ``'Other'`` is used instead.

    Returns:
        A dict keyed by the integers 0..9999 (in order) whose values are
        dicts with keys ``ff<ind_num>``, ``ff<ind_num>_name`` and ``detail``.
    """
    code_key = 'ff{0}'.format(ind_num)
    name_key = 'ff{0}_name'.format(ind_num)
    re_nameline = re.compile(
        r'^\s*(?P<ff{0}>\d\d?)\s+(?P<ff{0}_name>[a-z]+)\s+(?P<detail>.+)\s*$'
        .format(ind_num), re.I | re.M)
    re_rangeline = re.compile(
        r'^\s*(?P<sicfrom>\d{3,4})-(?P<sicto>\d{3,4})(?P<notes>\s+.+)?\s*$',
        re.I | re.M)

    other_ind = None     # first header named "other", used as the default
    current_ind = None   # industry that subsequent range lines belong to
    assigned = {}        # SIC code -> industry record, explicit ranges only

    for raw_line in data.splitlines():
        # Match on stripped single lines only, so every record (including the
        # default one) is free of trailing blanks / carriage returns and no
        # pattern can match across a line break.
        line = raw_line.strip()

        header = re_nameline.search(line)
        if header:
            current_ind = header.groupdict()
            if other_ind is None and current_ind[name_key].lower() == 'other':
                other_ind = current_ind
            continue

        span = re_rangeline.search(line)
        if not span or current_ind is None:
            # Not a range line, or a range before any header: such codes
            # simply keep the default industry.
            continue
        sic_from = int(span.group('sicfrom'))
        sic_to = int(span.group('sicto'))
        for sic in range(sic_from, sic_to + 1):
            assigned[sic] = current_ind

    if other_ind is None:
        other_ind = {code_key: ind_num, name_key: 'Other', 'detail': ''}

    return {sic: assigned.get(sic, other_ind) for sic in range(10000)}


def get_ffind_df(ind_num):
    """Build a SIC-code -> Fama-French industry lookup table.

    Downloads the definition text for the requested classification via
    ``download_ffind_zip`` and parses it into one row per SIC code.

    Args:
        ind_num: Size of the classification; must be one of
            5, 10, 12, 17, 30, 38, 48 or 49.

    Returns:
        A ``pandas.DataFrame`` with 10000 rows (SIC codes 0..9999) and the
        columns ``sic``, ``ff<ind_num>`` (cast to int), ``ff<ind_num>_name``
        and ``detail``.

    Raises:
        ValueError: If ``ind_num`` is not one of the supported sizes.
    """
    valid_nums = [5, 10, 12, 17, 30, 38, 48, 49]
    if ind_num not in valid_nums:
        raise ValueError('Industry number must be one of {} not {}.'
                         .format(valid_nums, ind_num))

    vals = _parse_ffind_text(download_ffind_zip(ind_num), ind_num)

    df = pd.DataFrame.from_dict(vals, orient='index')
    df.index.name = 'sic'
    # Parsed industry numbers are strings (regex groups); normalise to int.
    code_col = 'ff{0}'.format(ind_num)
    df[code_col] = df[code_col].astype(int)
    return df.reset_index()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment