Skip to content

Instantly share code, notes, and snippets.

@pmlandwehr
Forked from goingtomaine/infodump_to_others.py
Last active April 11, 2017 01:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pmlandwehr/f5c3a2dc8de935f5e4bf2404a36340be to your computer and use it in GitHub Desktop.
Save pmlandwehr/f5c3a2dc8de935f5e4bf2404a36340be to your computer and use it in GitHub Desktop.
Read the infodump into a dictionary of dataframes, an HDF5 file, or SQLite
def mefi_strptime(x):
from datetime import datetime
try:
return datetime.strptime(x, '%b %d %Y %I:%M:%S:%f%p')
except ValueError:
return datetime(1900, 1, 1, 0, 0, 0)
def int_with_filled_nans(x):
import numpy as np
try:
return np.int64(x)
except ValueError:
return -1
def posttitles_to_df(fpath):
"""
Necessitated by a wonky, long title in posttitles_askme.txt
:param str fpath: path to the file
:returns: `pandas.DataFrame` -- Dataframe of posttitles file
"""
import pandas as pd
df_dict = {'postid': [], 'title': []}
with open(fpath, 'r') as infile:
infile.readline() # pre-date
infile.readline() # header
for line in infile:
data = line.strip().split('\t')
df_dict['postid'].append(int_with_filled_nans(data[0]))
if len(data) < 2:
df_dict['title'].append('')
elif len(data) == 2:
df_dict['title'].append(data[1])
else:
df_dict['title'].append(' '.join(data[1:]))
return pd.DataFrame(df_dict)
def infodump_file_to_df(fpath):
"""
:param str fpath: path to infodump file
:returns: `pandas.DataFrame` -- dataframe made from the infodump file
"""
import pandas as pd
if fpath.find('posttitles_') > -1:
return posttitles_to_df(fpath)
df = pd.read_table(fpath,
skiprows=1,
converters={
'above': int_with_filled_nans,
'below': int_with_filled_nans,
'best answer?': bool,
'category': int_with_filled_nans,
'comments': int_with_filled_nans,
'date': mefi_strptime,
'datestamp': mefi_strptime,
'deleted': bool,
'favorites': int_with_filled_nans,
'joindate': mefi_strptime,
'link_date': mefi_strptime,
'link_id': int_with_filled_nans,
'name': str,
'postid': int_with_filled_nans,
'reason': str,
'tag_id': int_with_filled_nans,
'tag_name': str,
'title': str,
'url': int_with_filled_nans,
'urldesc': int_with_filled_nans,
'userid': int_with_filled_nans
})
# All the comment data has a "best answer?" column, but it's meaningless
# except for when dealing with AskMe.
if 'best answer?' in df.columns and fpath.find('commentdata_askme.txt') == -1:
del df['best answer?']
# MetaFilter's post data contains a category column, but it's always 0
# -except, it seems, on 4 broken posts where it comes up as -1-
# so should be dropped from this df.
if 'category' in df.columns and fpath.find('postdata_mefi.txt') > -1:
del df['category']
return df
def infodump_to_df_dict(infodump_path='.'):
"""
:param str infodump_path: path to infodump directory
:returns `dict` -- dictionary of dataframes and dictionaries of dataframes
"""
import os
from tqdm import tqdm
dfs = {}
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
for fname in pbar:
pbar.desc = fname
df = infodump_file_to_df(os.path.join(infodump_path, fname))
if fname.find('_') > -1:
data_type, subsite = fname[:-4].split('_')
if subsite not in dfs:
dfs[subsite] = {}
dfs[subsite][data_type] = df
else:
dfs[fname[:-4]] = df
return dfs
def infodump_to_hdf5(infodump_path='.', hdf_path='infodump.h5'):
"""
:param str infodump_path: path to infodump directory
:param str hdf_path: path to HDF data store (will be created if missing)
:returns: `str` -- path to HDF data store
"""
import os
import pandas as pd
from tqdm import tqdm
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
for fname in pbar:
pbar.desc = fname
df = infodump_file_to_df(os.path.join(infodump_path, fname))
with pd.HDFStore(hdf_path) as store:
if fname.find('_') > -1:
data_type, subsite = fname[:-4].split('_')
store['/'.join([subsite, data_type])] = df
else:
store[fname[:-4]] = df
return hdf_path
def infodump_to_sqlite(infodump_path='.', db_path='infodump.db'):
"""
:param str infodump_path: path to infodump directory
:param str db_path: path to SQLite DB (will be created if missing)
:returns: `str` -- path to SQLite DB
"""
from odo import odo
import os
from tqdm import tqdm
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
for fname in pbar:
pbar.desc = fname
df = infodump_file_to_df(os.path.join(infodump_path, fname))
sql_str = 'sqlite:///{}::{}'.format(db_path, fname[:-4])
odo(df, sql_str)
return db_path
def infodump_to_csvs(infodump_path='.', csv_dir_path='.'):
"""
:param str infodump_path: path to infodump directory
:param str csv_dir_path: path to the directory where the CSVs should be stored.
:returns: `str` -- path to the directory where the CSVs are stored.
"""
import os
import pandas as pd
from tqdm import tqdm
pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
for fname in pbar:
pbar.desc = fname
df = infodump_file_to_df(os.path.join(infodump_path, fname))
if fname.find('_') > -1:
data_type, subsite = fname[:-4].split('_')
out_fpath = os.path.join(csv_dir_path, '_'.join(subsite, data_type)+'.csv')
else:
out_fpath = os.path.join(csv_dir_path, fname[:-4]+'.csv')
df.to_csv(out_fpath, index=False)
return csv_dir_path
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment