# roblesch/parsingutils.py

## parsingutils.py
import pandas as pd
import numpy as np
import glob
import re

def readfile(file_name):
    """Read a newline-delimited UTF-8 text file.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per line, without line terminators. The text
        is split on newline characters, so a file that ends with a newline
        yields a final empty string.
    """
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving that to the garbage collector.
    with open(file_name, encoding='utf8') as handle:
        return handle.read().split('\n')

def parse_line(line):
    """Split one transcript line into user, timestamp and message.

    A chat line is recognised by the first ``t=<digits>>`` marker in it.
    The user is the text from the second character of the line up to (but
    excluding) the character just before the marker, the timestamp is the
    marker's digits (returned as a string), and the message is everything
    after the marker.

    Returns [user, t, message], or [None, None, None] when the line holds
    no marker.
    """
    stamp = re.search(r't=\d+>', line)
    if stamp is None:
        # no timestamp marker: this line is not a chat message
        return [None, None, None]

    marker_from, marker_to = stamp.span()
    user = line[1:marker_from - 1]
    # skip the leading 't=' and the trailing '>'
    t = line[marker_from + 2:marker_to - 1]
    message = line[marker_to:]
    return [user, t, message]


def txt_file_to_df(filename):
    """Read a .txt conversation file as a dataframe.

    Every line of the file is run through ``parse_line``; lines that are not
    chat messages are dropped.

    Args:
        filename: Path of the file to read. Backslash separators are
            converted to forward slashes.

    Returns:
        A DataFrame with columns ``id`` (the file name without directory or
        extension), ``user``, ``t`` (int, shifted so that the first chat
        message of the file is at 0) and ``message``. Rows keep the line
        number they had in the file as their index label. A file without
        any chat line gives an empty frame.
    """
    from pathlib import PurePosixPath

    # sanitize path
    filename = filename.replace('\\', '/')
    # collect friendly id from the file name itself, so it does not depend
    # on the length of the directory prefix or of the extension
    fileid = PurePosixPath(filename).stem

    # read data
    rows = [parse_line(line) for line in readfile(filename)]
    df = pd.DataFrame(data=rows, columns=['user', 't', 'message'])

    # remove rows for lines that were not chat messages
    df = df.dropna()

    # prepend file id
    df.insert(0, 'id', fileid)

    # fix timestamps
    df['t'] = df['t'].astype(int)
    if not df.empty:
        # Positional access: after dropna() the index labels are the original
        # line numbers, so a fixed label such as 1 may be missing or may not
        # be the first message.
        df['t'] -= df['t'].iloc[0]

    return df

def read_txt_in_dir(dirname):
    """Read all the .txt files in a dir.

    Args:
        dirname: Directory to scan; sub-directories are not searched.

    Returns:
        One DataFrame with the rows of every file stacked vertically, files
        taken in sorted path order. The originating file is identified by
        the ``id`` column; index labels are per-file and therefore repeat
        across files.

    Raises:
        ValueError: If the directory contains no .txt file.
    """
    # glob returns matches in arbitrary order; sort for a reproducible result
    filenames = sorted(glob.glob(dirname + '/*.txt'))
    if not filenames:
        raise ValueError('no .txt files found in {!r}'.format(dirname))
    return pd.concat([txt_file_to_df(name) for name in filenames])


# load every transcript found under the data directory into one frame
dirname = '../data/txt'
data = read_txt_in_dir(dirname)
# preview of the last rows; the result is not stored or printed
data.tail()

# keep only the rows that came from the '0-ImminentRisk' file
ir = data[data['id'].eq('0-ImminentRisk')]
# preview of the first rows; the result is not stored or printed
ir.head()
	import pandas as pd
	import numpy as np
	import glob
	import re

	def readfile(file_name):
	    """Read a newline-delimited UTF-8 text file.

	    Args:
	        file_name: Path of the file to read.

	    Returns:
	        A list with one string per line, without line terminators. The
	        text is split on newline characters, so a file that ends with a
	        newline yields a final empty string.
	    """
	    # The context manager closes the handle as soon as the text is read,
	    # instead of leaving that to the garbage collector.
	    with open(file_name, encoding='utf8') as handle:
	        return handle.read().split('\n')

	def parse_line(line):
	    """Parse a line of conversation.

	    Looks for the first ``t=<digits>>`` marker in the line.

	    Returns:
	        [user, t, message] as strings, where user is the text from the
	        second character up to the character before the marker, t is the
	        marker's digits and message is everything after the marker; or
	        [None, None, None] when the line has no marker.
	    """
	    # search for a timestamp
	    match = re.search(r't=\d+>', line)
	    # if there is none, line is not a chat msg
	    if not match:
	        return [None, None, None]
	    # else, return [user, time, message]
	    start, end = match.start(), match.end()
	    return [line[1:start - 1], line[start + 2:end - 1], line[end:]]


	def txt_file_to_df(filename):
	    """Read a .txt conversation file as a dataframe.

	    Every line of the file is run through ``parse_line``; lines that are
	    not chat messages are dropped.

	    Args:
	        filename: Path of the file to read. Backslash separators are
	            converted to forward slashes.

	    Returns:
	        A DataFrame with columns ``id`` (the file name without directory
	        or extension), ``user``, ``t`` (int, shifted so that the first
	        chat message of the file is at 0) and ``message``. Rows keep the
	        line number they had in the file as their index label. A file
	        without any chat line gives an empty frame.
	    """
	    from pathlib import PurePosixPath

	    # sanitize path
	    filename = filename.replace('\\', '/')
	    # collect friendly id from the file name itself, so it does not
	    # depend on the length of the directory prefix or of the extension
	    fileid = PurePosixPath(filename).stem

	    # read data
	    rows = [parse_line(line) for line in readfile(filename)]
	    df = pd.DataFrame(data=rows, columns=['user', 't', 'message'])

	    # remove rows for lines that were not chat messages
	    df = df.dropna()

	    # prepend file id
	    df.insert(0, 'id', fileid)

	    # fix timestamps
	    df['t'] = df['t'].astype(int)
	    if not df.empty:
	        # Positional access: after dropna() the index labels are the
	        # original line numbers, so a fixed label such as 1 may be
	        # missing or may not be the first message.
	        df['t'] -= df['t'].iloc[0]

	    return df

	def read_txt_in_dir(dirname):
	    """Read all the .txt files in a dir.

	    Args:
	        dirname: Directory to scan; sub-directories are not searched.

	    Returns:
	        One DataFrame with the rows of every file stacked vertically,
	        files taken in sorted path order. The originating file is
	        identified by the ``id`` column; index labels are per-file and
	        therefore repeat across files.

	    Raises:
	        ValueError: If the directory contains no .txt file.
	    """
	    # glob returns matches in arbitrary order; sort for a reproducible result
	    filenames = sorted(glob.glob(dirname + '/*.txt'))
	    if not filenames:
	        raise ValueError('no .txt files found in {!r}'.format(dirname))
	    return pd.concat([txt_file_to_df(name) for name in filenames])


	# load every transcript found under the data directory into one frame
	dirname = '../data/txt'
	data = read_txt_in_dir(dirname)
	# preview of the last rows; the result is not stored or printed
	data.tail()

	# keep only the rows that came from the '0-ImminentRisk' file
	ir = data[data['id'].eq('0-ImminentRisk')]
	# preview of the first rows; the result is not stored or printed
	ir.head()