Skip to content

Instantly share code, notes, and snippets.

@adithya-badidey
Created October 12, 2021 21:02
Show Gist options
  • Save adithya-badidey/77000887845ba35c600d71def77219ef to your computer and use it in GitHub Desktop.
Save adithya-badidey/77000887845ba35c600d71def77219ef to your computer and use it in GitHub Desktop.
LiwcUtils class for Python
import csv
import re

import liwc
import numpy as np
import pandas as pd
import seaborn as sns
def printProgress(message, progress, limit):
    """Overwrite the current terminal line with ``message (progress/limit)``.

    The text is left-aligned and space-padded to 100 characters so that a
    shorter status fully covers a longer one printed earlier; text longer
    than 100 characters is printed in full. A leading carriage return moves
    the cursor back to the start of the line and no newline is emitted.
    """
    status = f"{message} ({progress}/{limit})"
    padded = status.ljust(100)
    print("\r" + padded, end="")
class LiwcUtils:
    """Count LIWC category hits in text using a ``liwc`` token parser.

    Attributes:
        cat_names: Category names returned by ``liwc.load_token_parser``.
        cat_dict: Mapping of category name -> column index, read from the
            config CSV.
        numcats: Number of categories (``len(cat_names)``); the width of
            every count vector produced by this class.
    """

    # Tokens are maximal runs of word characters; compiled once and shared.
    _TOKEN_RE = re.compile(r"\w+")

    def __init__(self, path, configpath):
        """Load the LIWC dictionary and the category/column mapping.

        Args:
            path: Dictionary file handed to ``liwc.load_token_parser``.
            configpath: CSV file with one header line followed by rows whose
                first field is an integer column index and whose second
                field is the category name.
        """
        self._parse, self.cat_names = liwc.load_token_parser(path)
        self.cat_dict = {}
        # newline='' is what the csv module requires of files it reads.
        with open(configpath, 'r', newline='') as config:
            config.readline()  # skip the header line
            for row in csv.reader(config):
                if not row:
                    # Blank lines (e.g. a trailing newline) carry no mapping.
                    continue
                self.cat_dict[row[1]] = int(row[0])
        self.numcats = len(self.cat_names)

    def getCats(self):
        """Return a copy of the category name -> column index mapping."""
        return self.cat_dict.copy()

    def tokenize(self, string):
        """Yield the lower-cased word tokens of ``string`` in order."""
        for match in self._TOKEN_RE.finditer(string.lower()):
            yield match.group(0)

    def parse(self, string, arr=None, unrecognized=None):
        """Accumulate category counts for one piece of text.

        Args:
            string: Text to tokenize and look up.
            arr: Optional 1-D count vector of length ``numcats`` that is
                updated in place; a fresh zero vector is created if omitted.
            unrecognized: Optional dict of token -> frequency, updated in
                place with tokens that matched no category; a fresh dict is
                created if omitted.

        Returns:
            Tuple ``(arr, unrecognized, total)`` where ``total`` is the
            number of tokens found in ``string``.
        """
        if arr is None:
            arr = np.zeros(self.numcats)
        if unrecognized is None:
            unrecognized = {}
        total = 0
        for token in self.tokenize(string):
            total += 1
            matched = False
            for cat in self._parse(token):
                arr[self.cat_dict[cat]] += 1
                matched = True
            if not matched:
                unrecognized[token] = unrecognized.get(token, 0) + 1
        return arr, unrecognized, total

    def parseSeries(self, series, normalizeRows=True, normalizeCols=False):
        """Build a (len(series), numcats) matrix of category counts.

        Progress and a summary of unrecognized tokens are printed to stdout.

        Args:
            series: ``pandas.Series`` or ``list`` of strings.
            normalizeRows: If true, divide each row by its L1 norm.
            normalizeCols: If true, divide each column by its L1 norm
                (applied after row normalization when both are set).

        Returns:
            The count matrix as a ``numpy.ndarray``.

        Raises:
            TypeError: If ``series`` is neither a ``pandas.Series`` nor a
                ``list``.
        """
        if not isinstance(series, (pd.Series, list)):
            raise TypeError("This accepts only pandas.Series or Lists")

        size = len(series)
        print("(LIWCUtils) Parsing Series of size", size)
        arr = np.zeros((size, self.numcats))
        unrecognized = {}
        count = 0
        # Iterate positionally: ``series[i]`` on a pandas.Series is a label
        # lookup and breaks when the index is not 0..n-1 (e.g. after a filter).
        for i, text in enumerate(series):
            # arr[i] is a view of row i, so parse() fills the matrix in place.
            _, _, n_tokens = self.parse(text, arr[i], unrecognized)
            count += n_tokens
            if i % 1000 == 0:
                printProgress("(LIWCUtils) Parsed", i, size)
        printProgress("(LIWCUtils) Parsed", size, size)
        print()

        print(f"(LIWCUtils) Total number of tokens unrecognized = {sum(unrecognized.values())}/{count}")
        print("(LIWCUtils) Top ten unrecognized tokens are:")
        print(f"{'Token':20}Freq")
        print(f"{'-----':20}----")
        ranked = sorted(unrecognized.items(), key=lambda item: item[1], reverse=True)
        # Ten rows, matching the "Top ten" heading printed above.
        for token, freq in ranked[:10]:
            print(f"{token:20}{freq}")

        if normalizeRows:
            print("(LIWCUtils) Normalizing Rows")
            # Adding a very small number to avoid divide by zero errors
            row_norms = np.linalg.norm(arr, ord=1, axis=1) + 1e-12
            arr = arr / row_norms[:, None]
        if normalizeCols:
            print("(LIWCUtils) Normalizing Columns")
            col_norms = np.linalg.norm(arr, ord=1, axis=0) + 1e-12
            arr = arr / col_norms
        print("(LIWCUtils) Done!")
        return arr
# Example usage.
# Initialize using this line: the first argument is the LIWC dictionary file
# and the second is the CSV mapping category names to column indices.
# NOTE(review): both paths look specific to the author's machine - adjust them.
lc = LiwcUtils('/fastdata/LIWC2015_English.dic', '/notebooks/adithya/liwc-experiments/config/liwc_cats.csv')
# Parse series using this command
# NOTE(review): `processed` is not defined in this file; it has to be a
# pandas.Series or list of strings created before this line runs.
res = lc.parseSeries(processed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment