Created
July 27, 2020 17:35
-
-
Save roblesch/362bd9084ff232ef6866848f544ea7a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import os
import re

import numpy as np
import pandas as pd
def readfile(file_name):
    """Read a newline-delimited UTF-8 text file.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per ``'\\n'``-separated line. A file that
        ends with a newline therefore yields a trailing empty string.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(file_name, encoding='utf8') as handle:
        return handle.read().split('\n')
def parse_line(line):
    """Parse one line of a conversation log.

    The line is searched for the first timestamp marker of the form
    ``t=<digits>>``. Relative to that marker the line is sliced as:

    * user    -- the text after the first character of the line, up to
      (not including) the single character that precedes the marker,
    * t       -- the digits of the marker, still as a string,
    * message -- everything after the marker's closing ``>``.

    For example ``'<alice t=100>hello'`` gives
    ``['alice', '100', 'hello']``.

    Args:
        line: One line of text from a conversation file.

    Returns:
        ``[user, t, message]`` as a list of strings, or
        ``[None, None, None]`` when the line contains no marker.
    """
    match = re.search(r't=\d+>', line)
    # No timestamp marker means the line is not a chat message.
    if not match:
        return [None, None, None]

    start, end = match.span()
    # Clamp the slice end: with the marker at column 0, ``start - 1`` would
    # be -1 and the slice would wrap around to the end of the line.
    user = line[1:max(start - 1, 0)]
    # Skip the leading "t=" and the trailing ">" of the marker.
    timestamp = line[start + 2:end - 1]
    message = line[end:]
    return [user, timestamp, message]
def txt_file_to_df(filename):
    """Read a conversation .txt file as a dataframe.

    Every line of the file is passed through ``parse_line``; lines that do
    not parse as chat messages are dropped. Timestamps are converted to
    integers and shifted so that the first remaining message is at t == 0.

    Args:
        filename: Path of the file to read. Backslashes are treated as
            path separators.

    Returns:
        A DataFrame with columns ``['id', 'user', 't', 'message']`` where
        ``id`` is the file name without its directory and extension. The
        frame has no rows when the file contains no chat messages.
    """
    # Normalise Windows separators so the basename is found consistently.
    filename = filename.replace('\\', '/')

    # Friendly id: derive it from the path itself instead of relying on
    # fixed character offsets that only fit one particular directory.
    fileid = os.path.splitext(os.path.basename(filename))[0]

    rows = [parse_line(line) for line in readfile(filename)]
    df = pd.DataFrame(data=rows, columns=['user', 't', 'message'])

    # Non-chat lines come back as all-None rows; discard them.
    df = df.dropna()
    df.insert(0, 'id', fileid)

    df['t'] = df['t'].astype(int)
    # Re-base on the first surviving row by position. A label lookup such
    # as ``df['t'][1]`` fails when line 1 of the file was dropped and picks
    # the wrong row when line 0 is itself a message.
    if not df.empty:
        df['t'] -= df['t'].iloc[0]
    return df
def read_txt_in_dir(dirname):
    """Read all the .txt files in a directory into one dataframe.

    Each file is loaded with ``txt_file_to_df`` and the per-file frames are
    stacked vertically; the ``id`` column identifies the source file.

    Args:
        dirname: Directory to scan (not recursive).

    Returns:
        The concatenation of the per-file DataFrames, in sorted file-name
        order.

    Raises:
        ValueError: Raised by ``pd.concat`` when the directory contains no
            ``.txt`` files.
    """
    # Restrict the glob to .txt files so stray files are not parsed, and
    # sort because glob returns entries in an OS-dependent order.
    filenames = sorted(glob.glob(dirname + '/*.txt'))
    return pd.concat([txt_file_to_df(name) for name in filenames])
# Load every conversation file under the data directory into one frame.
dirname = '../data/txt'
data = read_txt_in_dir(dirname)
data.tail()

# Keep only the rows that belong to one conversation, selected by file id.
ir = data.loc[data['id'] == '0-ImminentRisk']
ir.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment