pmlandwehr/infodump_to_others.py

## infodump_to_others.py
def mefi_strptime(x):
    from datetime import datetime
    try:
        return datetime.strptime(x, '%b %d %Y %I:%M:%S:%f%p')
    except ValueError:
        return datetime(1900, 1, 1, 0, 0, 0)


def int_with_filled_nans(x):
    import numpy as np
    try:
        return np.int64(x)
    except ValueError:
        return -1


def posttitles_to_df(fpath):
    """
    Necessitated by a wonky, long title in posttitles_askme.txt
    :param str fpath: path to the file
    :returns: `pandas.DataFrame` -- Dataframe of posttitles file
    """
    import pandas as pd
    df_dict = {'postid': [], 'title': []}
    with open(fpath, 'r') as infile:
        infile.readline()  # pre-date
        infile.readline()  # header
        for line in infile:
            data = line.strip().split('\t')
            df_dict['postid'].append(int_with_filled_nans(data[0]))
            if len(data) < 2:
                df_dict['title'].append('')
            elif len(data) == 2:
                df_dict['title'].append(data[1])
            else:
                df_dict['title'].append(' '.join(data[1:]))

    return pd.DataFrame(df_dict)


def infodump_file_to_df(fpath):
    """
    :param str fpath: path to infodump file
    :returns: `pandas.DataFrame` -- dataframe made from the infodump file
    """
    import pandas as pd
    if fpath.find('posttitles_') > -1:
        return posttitles_to_df(fpath)

    df = pd.read_table(fpath,
                         skiprows=1,
                         converters={
                             'above': int_with_filled_nans,
                             'below': int_with_filled_nans,
                             'best answer?': bool,
                             'category': int_with_filled_nans,
                             'comments': int_with_filled_nans,
                             'date': mefi_strptime,
                             'datestamp': mefi_strptime,
                             'deleted': bool,
                             'favorites': int_with_filled_nans,
                             'joindate': mefi_strptime,
                             'link_date': mefi_strptime,
                             'link_id': int_with_filled_nans,
                             'name': str,
                             'postid': int_with_filled_nans,
                             'reason': str,
                             'tag_id': int_with_filled_nans,
                             'tag_name': str,
                             'title': str,
                             'url': int_with_filled_nans,
                             'urldesc': int_with_filled_nans,
                             'userid': int_with_filled_nans
                         })

    # All the comment data has a "best answer?" column, but it's meaningless
    # except for when dealing with AskMe.
    if 'best answer?' in df.columns and fpath.find('commentdata_askme.txt') == -1:
        del df['best answer?']

    # MetaFilter's post data contains a category column, but it's always 0
    # -except, it seems, on 4 broken posts where it comes up as -1-
    # so should be dropped from this df.
    if 'category' in df.columns and fpath.find('postdata_mefi.txt') > -1:
        del df['category']

    return df


def infodump_to_df_dict(infodump_path='.'):
    """
    :param str infodump_path: path to infodump directory
    :returns `dict` -- dictionary of dataframes and dictionaries of dataframes
    """
    import os
    from tqdm import tqdm
    dfs = {}
    pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
    for fname in pbar:
        pbar.desc = fname
        df = infodump_file_to_df(os.path.join(infodump_path, fname))

    if fname.find('_') > -1:
        data_type, subsite = fname[:-4].split('_')
        if subsite not in dfs:
            dfs[subsite] = {}
        dfs[subsite][data_type] = df
    else:
        dfs[fname[:-4]] = df

    return dfs


def infodump_to_hdf5(infodump_path='.', hdf_path='infodump.h5'):
    """
    :param str infodump_path: path to infodump directory
    :param str hdf_path: path to HDF data store (will be created if missing)
    :returns: `str` -- path to HDF data store
    """
    import os
    import pandas as pd
    from tqdm import tqdm
    pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
    for fname in pbar:
        pbar.desc = fname
        df = infodump_file_to_df(os.path.join(infodump_path, fname))

        with pd.HDFStore(hdf_path) as store:
            if fname.find('_') > -1:
                data_type, subsite = fname[:-4].split('_')
                store['/'.join([subsite, data_type])] = df
            else:
                store[fname[:-4]] = df

    return hdf_path


def infodump_to_sqlite(infodump_path='.', db_path='infodump.db'):
    """
    :param str infodump_path: path to infodump directory
    :param str db_path: path to SQLite DB (will be created if missing)
    :returns: `str` -- path to SQLite DB
    """
    from odo import odo
    import os
    from tqdm import tqdm
    pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
    for fname in pbar:
        pbar.desc = fname
        df = infodump_file_to_df(os.path.join(infodump_path, fname))
        sql_str = 'sqlite:///{}::{}'.format(db_path, fname[:-4])
        odo(df, sql_str)

    return db_path


def infodump_to_csvs(infodump_path='.', csv_dir_path='.'):
    """
    :param str infodump_path: path to infodump directory
    :param str csv_dir_path: path to the directory where the CSVs should be stored.
    :returns: `str` -- path to the directory where the CSVs are stored.
    """
    import os
    import pandas as pd
    from tqdm import tqdm
    pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
    for fname in pbar:
        pbar.desc = fname
        df = infodump_file_to_df(os.path.join(infodump_path, fname))

        if fname.find('_') > -1:
            data_type, subsite = fname[:-4].split('_')
            out_fpath = os.path.join(csv_dir_path, '_'.join(subsite, data_type)+'.csv')
        else:
            out_fpath = os.path.join(csv_dir_path, fname[:-4]+'.csv')

        df.to_csv(out_fpath, index=False)

    return csv_dir_path
	def mefi_strptime(x):
	from datetime import datetime
	try:
	return datetime.strptime(x, '%b %d %Y %I:%M:%S:%f%p')
	except ValueError:
	return datetime(1900, 1, 1, 0, 0, 0)


	def int_with_filled_nans(x):
	import numpy as np
	try:
	return np.int64(x)
	except ValueError:
	return -1


	def posttitles_to_df(fpath):
	"""
	Necessitated by a wonky, long title in posttitles_askme.txt
	:param str fpath: path to the file
	:returns: `pandas.DataFrame` -- Dataframe of posttitles file
	"""
	import pandas as pd
	df_dict = {'postid': [], 'title': []}
	with open(fpath, 'r') as infile:
	infile.readline() # pre-date
	infile.readline() # header
	for line in infile:
	data = line.strip().split('\t')
	df_dict['postid'].append(int_with_filled_nans(data[0]))
	if len(data) < 2:
	df_dict['title'].append('')
	elif len(data) == 2:
	df_dict['title'].append(data[1])
	else:
	df_dict['title'].append(' '.join(data[1:]))

	return pd.DataFrame(df_dict)


	def infodump_file_to_df(fpath):
	"""
	:param str fpath: path to infodump file
	:returns: `pandas.DataFrame` -- dataframe made from the infodump file
	"""
	import pandas as pd
	if fpath.find('posttitles_') > -1:
	return posttitles_to_df(fpath)

	df = pd.read_table(fpath,
	skiprows=1,
	converters={
	'above': int_with_filled_nans,
	'below': int_with_filled_nans,
	'best answer?': bool,
	'category': int_with_filled_nans,
	'comments': int_with_filled_nans,
	'date': mefi_strptime,
	'datestamp': mefi_strptime,
	'deleted': bool,
	'favorites': int_with_filled_nans,
	'joindate': mefi_strptime,
	'link_date': mefi_strptime,
	'link_id': int_with_filled_nans,
	'name': str,
	'postid': int_with_filled_nans,
	'reason': str,
	'tag_id': int_with_filled_nans,
	'tag_name': str,
	'title': str,
	'url': int_with_filled_nans,
	'urldesc': int_with_filled_nans,
	'userid': int_with_filled_nans
	})

	# All the comment data has a "best answer?" column, but it's meaningless
	# except for when dealing with AskMe.
	if 'best answer?' in df.columns and fpath.find('commentdata_askme.txt') == -1:
	del df['best answer?']

	# MetaFilter's post data contains a category column, but it's always 0
	# -except, it seems, on 4 broken posts where it comes up as -1-
	# so should be dropped from this df.
	if 'category' in df.columns and fpath.find('postdata_mefi.txt') > -1:
	del df['category']

	return df


	def infodump_to_df_dict(infodump_path='.'):
	"""
	:param str infodump_path: path to infodump directory
	:returns `dict` -- dictionary of dataframes and dictionaries of dataframes
	"""
	import os
	from tqdm import tqdm
	dfs = {}
	pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
	for fname in pbar:
	pbar.desc = fname
	df = infodump_file_to_df(os.path.join(infodump_path, fname))

	if fname.find('_') > -1:
	data_type, subsite = fname[:-4].split('_')
	if subsite not in dfs:
	dfs[subsite] = {}
	dfs[subsite][data_type] = df
	else:
	dfs[fname[:-4]] = df

	return dfs


	def infodump_to_hdf5(infodump_path='.', hdf_path='infodump.h5'):
	"""
	:param str infodump_path: path to infodump directory
	:param str hdf_path: path to HDF data store (will be created if missing)
	:returns: `str` -- path to HDF data store
	"""
	import os
	import pandas as pd
	from tqdm import tqdm
	pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
	for fname in pbar:
	pbar.desc = fname
	df = infodump_file_to_df(os.path.join(infodump_path, fname))

	with pd.HDFStore(hdf_path) as store:
	if fname.find('_') > -1:
	data_type, subsite = fname[:-4].split('_')
	store['/'.join([subsite, data_type])] = df
	else:
	store[fname[:-4]] = df

	return hdf_path


	def infodump_to_sqlite(infodump_path='.', db_path='infodump.db'):
	"""
	:param str infodump_path: path to infodump directory
	:param str db_path: path to SQLite DB (will be created if missing)
	:returns: `str` -- path to SQLite DB
	"""
	from odo import odo
	import os
	from tqdm import tqdm
	pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
	for fname in pbar:
	pbar.desc = fname
	df = infodump_file_to_df(os.path.join(infodump_path, fname))
	sql_str = 'sqlite:///{}::{}'.format(db_path, fname[:-4])
	odo(df, sql_str)

	return db_path


	def infodump_to_csvs(infodump_path='.', csv_dir_path='.'):
	"""
	:param str infodump_path: path to infodump directory
	:param str csv_dir_path: path to the directory where the CSVs should be stored.
	:returns: `str` -- path to the directory where the CSVs are stored.
	"""
	import os
	import pandas as pd
	from tqdm import tqdm
	pbar = tqdm([x for x in os.listdir(infodump_path) if x[-4:] == '.txt'])
	for fname in pbar:
	pbar.desc = fname
	df = infodump_file_to_df(os.path.join(infodump_path, fname))

	if fname.find('_') > -1:
	data_type, subsite = fname[:-4].split('_')
	out_fpath = os.path.join(csv_dir_path, '_'.join(subsite, data_type)+'.csv')
	else:
	out_fpath = os.path.join(csv_dir_path, fname[:-4]+'.csv')

	df.to_csv(out_fpath, index=False)

	return csv_dir_path