Skip to content

Instantly share code, notes, and snippets.

@roblesch
Created July 27, 2020 17:35
Show Gist options
  • Save roblesch/362bd9084ff232ef6866848f544ea7a3 to your computer and use it in GitHub Desktop.
Save roblesch/362bd9084ff232ef6866848f544ea7a3 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import glob
import re
def readfile(file_name):
    """Read a newline delimited file.

    Args:
        file_name: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one string per line, split on the newline character.
        A file that ends with a newline yields a trailing empty string.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it open until the garbage collector gets to it.
    with open(file_name, encoding='utf8') as handle:
        return handle.read().split('\n')
def parse_line(line):
    """Parse a line of conversation.

    Searches the line for the first timestamp marker of the form
    ``t=<digits>>`` and splits the line around it.

    Args:
        line: One raw line of text.

    Returns:
        ``[user, t, message]`` as strings when a marker is found:
        ``user`` is the text between the first character of the line and
        the character just before the marker (both excluded), ``t`` is
        the run of digits, and ``message`` is everything after the
        marker. ``[None, None, None]`` when the line has no marker.
    """
    # search for a timestamp; the group captures just the digits
    match = re.search(r't=(\d+)>', line)
    # if there is none, line is not a chat msg
    if match is None:
        return [None, None, None]
    start, end = match.start(), match.end()
    # Clamp the stop index: with the marker at column 0, ``start - 1``
    # would be -1 and the slice would wrap around to the end of the line.
    user = line[1:max(start - 1, 1)]
    return [user, match.group(1), line[end:]]
def txt_file_to_df(filename):
    """Read a .txt file as a dataframe.

    Parses the lines of the given file into a dataframe with columns
    ``[id, user, t, message]``. Lines that are not chat messages are
    dropped. Timestamps are shifted so the first message is at ``t == 0``.

    Args:
        filename: Path to the text file; backslashes are accepted as
            path separators.

    Returns:
        A dataframe with one row per chat message, keeping the original
        line numbers as its index. It is empty when the file contains no
        chat messages.
    """
    # sanitize path
    filename = filename.replace('\\', '/')
    # collect friendly id: the file name without directory or '.txt'
    # suffix, derived from the path itself rather than a fixed prefix
    # length so any directory works
    basename = filename.rsplit('/', 1)[-1]
    fileid = basename[:-4] if basename.lower().endswith('.txt') else basename
    # read data
    rows = [parse_line(line) for line in readfile(filename)]
    df = pd.DataFrame(data=rows, columns=['user', 't', 'message'])
    # remove empty rows
    df = df.dropna()
    # append file id
    df.insert(0, 'id', fileid)
    # fix timestamps
    df['t'] = df['t'].astype(int)
    if not df.empty:
        # Subtract the first remaining timestamp by position; a label
        # lookup would fail whenever that line had been dropped.
        df['t'] -= df['t'].iloc[0]
    return df
def read_txt_in_dir(dirname):
    """Read all the files in a dir as conversation data.

    Every entry matched by ``dirname + '/*'`` is parsed with
    ``txt_file_to_df`` and the results are stacked vertically; the
    ``id`` column tells the source files apart.

    Args:
        dirname: Directory containing the conversation text files.

    Returns:
        One dataframe with columns ``[id, user, t, message]``. It is
        empty when the directory has no entries.
    """
    # glob returns entries in arbitrary order; sort for reproducible output
    filenames = sorted(glob.glob(dirname + '/*'))
    if not filenames:
        # pd.concat raises ValueError on an empty sequence
        return pd.DataFrame(columns=['id', 'user', 't', 'message'])
    return pd.concat([txt_file_to_df(name) for name in filenames])
# Load every conversation found under the data directory.
dirname = '../data/txt'
data = read_txt_in_dir(dirname)
data.tail()
# Keep only the rows that came from a single conversation file.
ir = data.loc[data['id'].eq('0-ImminentRisk')]
ir.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment