Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Download Fama French industry classifications
import re
from io import BytesIO
from zipfile import ZipFile

import pandas as pd
import requests


def download_ffind_zip(ind_num, timeout=30):
    """Download a Fama-French SIC-code definition file and return its text.

    Fetches ``Siccodes{ind_num}.zip`` from Ken French's page on the Tuck
    (Dartmouth) web server, opens the archive in memory and returns the
    decoded contents of the ``Siccodes{ind_num}.txt`` member.

    Args:
        ind_num: Number that is substituted into the archive and member
            file names (e.g. ``48`` -> ``Siccodes48.zip``).
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The text of the definition file, decoded with ``bytes.decode()``'s
        default codec (UTF-8).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
        KeyError: If the archive has no ``Siccodes{ind_num}.txt`` member.
    """
    zip_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes{}.zip'
               .format(ind_num))
    # Without a timeout requests will wait indefinitely on a stalled server.
    response = requests.get(zip_url, timeout=timeout)
    # Fail with a clear HTTP error instead of feeding an error page to ZipFile.
    response.raise_for_status()
    # The context manager closes the in-memory archive once the member is read.
    with ZipFile(BytesIO(response.content)) as archive:
        return archive.read('Siccodes{}.txt'.format(ind_num)).decode()
def _parse_ffind_text(data, ind_num):
    """Map every SIC code 0-9999 to an industry record parsed from *data*.

    The text is read line by line (each line stripped of surrounding
    whitespace). A line of the form ``<1-2 digits> <letters> <description>``
    starts a new industry; a line of the form ``<from>-<to> [notes]`` assigns
    that inclusive SIC range to the industry most recently started.

    Args:
        data: Full text of a ``Siccodes{ind_num}.txt`` file.
        ind_num: Industry-classification size; used to build the record keys
            ``'ff{ind_num}'`` and ``'ff{ind_num}_name'``.

    Returns:
        A dict with the integer keys 0..9999. Each value is a dict with the
        keys ``'ff{ind_num}'``, ``'ff{ind_num}_name'`` and ``'detail'``.
        SIC codes not covered by any range (and ranges that appear before the
        first industry header) map to the first industry named "Other"
        (case-insensitive); if the text defines none, a synthetic record
        ``{ff: ind_num, name: 'Other', detail: ''}`` is used.
    """
    code_key = 'ff{0}'.format(ind_num)
    name_key = 'ff{0}_name'.format(ind_num)
    # Patterns are applied to single stripped lines, so no MULTILINE flag.
    re_nameline = re.compile(r'^\s*(?P<ff{0}>\d\d?)\s+(?P<ff{0}_name>[a-z]+)\s+(?P<detail>.+)\s*$'
                             .format(ind_num), re.I)
    re_rangeline = re.compile(r'^\s*(?P<sicfrom>\d{3,4})-(?P<sicto>\d{3,4})(?P<notes>\s+.+)?\s*$', re.I)

    other_ind = None      # first industry whose short name is "Other"
    current_ind = None    # None until the first industry header is seen
    assignments = []      # (sicfrom, sicto, industry-or-None) in file order
    for raw_line in data.split('\n'):
        line = raw_line.strip()
        match = re_nameline.search(line)
        if match:
            current_ind = match.groupdict()
            # Take "Other" from the same stripped-line match used for every
            # other industry, so its 'detail' never carries a trailing '\r'
            # or spaces and is identical wherever the industry appears.
            if other_ind is None and current_ind[name_key].lower() == 'other':
                other_ind = current_ind
            continue
        match = re_rangeline.search(line)
        if match:
            assignments.append((int(match.group('sicfrom')),
                                int(match.group('sicto')),
                                current_ind))

    if other_ind is None:
        other_ind = {code_key: ind_num, name_key: 'Other', 'detail': ''}

    # Start with everything classified as "Other", then overlay the ranges
    # in file order (later ranges win on overlap).
    vals = dict.fromkeys(range(10000), other_ind)
    for sicfrom, sicto, industry in assignments:
        record = other_ind if industry is None else industry
        for sic in range(sicfrom, sicto + 1):
            vals[sic] = record
    return vals


def get_ffind_df(ind_num):
    """Return a SIC -> Fama-French industry lookup table as a DataFrame.

    Downloads the definition file for the requested classification via
    ``download_ffind_zip`` and expands it to one row per SIC code.

    Args:
        ind_num: Number of industries in the classification. Must be one of
            5, 10, 12, 17, 30, 38, 48 or 49.

    Returns:
        A ``pandas.DataFrame`` with 10000 rows (SIC codes 0-9999) and the
        columns ``sic``, ``ff{ind_num}`` (integer industry number),
        ``ff{ind_num}_name`` (short name) and ``detail`` (description).
        SIC codes not listed in the file are assigned to the "Other"
        industry.

    Raises:
        ValueError: If ``ind_num`` is not a supported classification size.
    """
    valid_nums = [5, 10, 12, 17, 30, 38, 48, 49]
    if ind_num not in valid_nums:
        raise ValueError('Industry number must be one of {} not {}.'
                         .format(valid_nums, ind_num))

    vals = _parse_ffind_text(download_ffind_zip(ind_num), ind_num)

    df = pd.DataFrame.from_dict(vals, orient='index')
    df.index.name = 'sic'
    # Industry numbers parsed from the file are strings; normalise to int.
    code_col = 'ff{0}'.format(ind_num)
    df[code_col] = df[code_col].astype(int)
    return df.reset_index()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.