-
-
Save pmlandwehr/f5c3a2dc8de935f5e4bf2404a36340be to your computer and use it in GitHub Desktop.
Read the infodump into a dictionary of dataframes, an HDF5 file, or SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def mefi_strptime(x): | |
from datetime import datetime | |
try: | |
return datetime.strptime(x, '%b %d %Y %I:%M:%S:%f%p') | |
except ValueError: | |
return datetime(1900, 1, 1, 0, 0, 0) | |
def int_with_filled_nans(x): | |
import numpy as np | |
try: | |
return np.int64(x) | |
except ValueError: | |
return -1 | |
def posttitles_to_df(fpath): | |
""" | |
Necessitated by a wonky, long title in posttitles_askme.txt | |
:param str fpath: path to the file | |
:returns: `pandas.DataFrame` -- Dataframe of posttitles file | |
""" | |
import pandas as pd | |
df_dict = {'postid': [], 'title': []} | |
with open(fpath, 'r') as infile: | |
infile.readline() # pre-date | |
infile.readline() # header | |
for line in infile: | |
data = line.strip().split('\t') | |
df_dict['postid'].append(int_with_filled_nans(data[0])) | |
if len(data) < 2: | |
df_dict['title'].append('') | |
elif len(data) == 2: | |
df_dict['title'].append(data[1]) | |
else: | |
df_dict['title'].append(' '.join(data[1:])) | |
return pd.DataFrame(df_dict) | |
def infodump_file_to_df(fpath): | |
""" | |
:param str fpath: path to infodump file | |
:returns: `pandas.DataFrame` -- dataframe made from the infodump file | |
""" | |
import pandas as pd | |
if fpath.find('posttitles_') > -1: | |
return posttitles_to_df(fpath) | |
df = pd.read_table(fpath, | |
skiprows=1, | |
converters={ | |
'above': int_with_filled_nans, | |
'below': int_with_filled_nans, | |
'best answer?': bool, | |
'category': int_with_filled_nans, | |
'comments': int_with_filled_nans, | |
'date': mefi_strptime, | |
'datestamp': mefi_strptime, | |
'deleted': bool, | |
'favorites': int_with_filled_nans, | |
'joindate': mefi_strptime, | |
'link_date': mefi_strptime, | |
'link_id': int_with_filled_nans, | |
'name': str, | |
'postid': int_with_filled_nans, | |
'reason': str, | |
'tag_id': int_with_filled_nans, | |
'tag_name': str, | |
'title': str, | |
'url': int_with_filled_nans, | |
'urldesc': int_with_filled_nans, | |
'userid': int_with_filled_nans | |
}) | |
# All the comment data has a "best answer?" column, but it's meaningless | |
# except for when dealing with AskMe. | |
if 'best answer?' in df.columns and fpath.find('commentdata_askme.txt') == -1: | |
del df['best answer?'] | |
# MetaFilter's post data contains a category column, but it's always 0 | |
# -except, it seems, on 4 broken posts where it comes up as -1- | |
# so should be dropped from this df. | |
if 'category' in df.columns and fpath.find('postdata_mefi.txt') > -1: | |
del df['category'] | |
return df | |
def infodump_to_df_dict(infodump_path='.'): | |
""" | |
:param str infodump_path: path to infodump directory | |
:returns `dict` -- dictionary of dataframes and dictionaries of dataframes | |
""" | |
import os | |
from tqdm import tqdm | |
dfs = {} | |
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt']) | |
for fname in pbar: | |
pbar.desc = fname | |
df = infodump_file_to_df(os.path.join(infodump_path, fname)) | |
if fname.find('_') > -1: | |
data_type, subsite = fname[:-4].split('_') | |
if subsite not in dfs: | |
dfs[subsite] = {} | |
dfs[subsite][data_type] = df | |
else: | |
dfs[fname[:-4]] = df | |
return dfs | |
def infodump_to_hdf5(infodump_path='.', hdf_path='infodump.h5'): | |
""" | |
:param str infodump_path: path to infodump directory | |
:param str hdf_path: path to HDF data store (will be created if missing) | |
:returns: `str` -- path to HDF data store | |
""" | |
import os | |
import pandas as pd | |
from tqdm import tqdm | |
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt']) | |
for fname in pbar: | |
pbar.desc = fname | |
df = infodump_file_to_df(os.path.join(infodump_path, fname)) | |
with pd.HDFStore(hdf_path) as store: | |
if fname.find('_') > -1: | |
data_type, subsite = fname[:-4].split('_') | |
store['/'.join([subsite, data_type])] = df | |
else: | |
store[fname[:-4]] = df | |
return hdf_path | |
def infodump_to_sqlite(infodump_path='.', db_path='infodump.db'): | |
""" | |
:param str infodump_path: path to infodump directory | |
:param str db_path: path to SQLite DB (will be created if missing) | |
:returns: `str` -- path to SQLite DB | |
""" | |
from odo import odo | |
import os | |
from tqdm import tqdm | |
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt']) | |
for fname in pbar: | |
pbar.desc = fname | |
df = infodump_file_to_df(os.path.join(infodump_path, fname)) | |
sql_str = 'sqlite:///{}::{}'.format(db_path, fname[:-4]) | |
odo(df, sql_str) | |
return db_path | |
def infodump_to_csvs(infodump_path='.', csv_dir_path='.'): | |
""" | |
:param str infodump_path: path to infodump directory | |
:param str csv_dir_path: path to the directory where the CSVs should be stored. | |
:returns: `str` -- path to the directory where the CSVs are stored. | |
""" | |
import os | |
import pandas as pd | |
from tqdm import tqdm | |
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt']) | |
for fname in pbar: | |
pbar.desc = fname | |
df = infodump_file_to_df(os.path.join(infodump_path, fname)) | |
if fname.find('_') > -1: | |
data_type, subsite = fname[:-4].split('_') | |
out_fpath = os.path.join(csv_dir_path, '_'.join(subsite, data_type)+'.csv') | |
else: | |
out_fpath = os.path.join(csv_dir_path, fname[:-4]+'.csv') | |
df.to_csv(out_fpath, index=False) | |
return csv_dir_path |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment