Created
October 12, 2021 21:02
-
-
Save adithya-badidey/77000887845ba35c600d71def77219ef to your computer and use it in GitHub Desktop.
LiwcUtils class for Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re

import liwc
import numpy as np
import pandas as pd
import seaborn as sns
def printProgress(message, progress, limit):
    """Rewrite the current console line with ``message (progress/limit)``.

    The text is preceded by a carriage return and left-aligned in a field of
    100 characters, so a shorter update fully covers a longer previous one.
    No newline is written; the caller prints one when the progress is done.

    Args:
        message: Text shown before the counter.
        progress: Number of items handled so far.
        limit: Total number of items.
    """
    output = f"{message} ({progress}/{limit})"
    # Without a trailing newline the text can sit in the stdout buffer, so
    # flush explicitly to make each update visible as soon as it is printed.
    print(f"\r{output: <100}", end="", flush=True)
class LiwcUtils:
    """Count LIWC category hits in text using a ``liwc`` token parser.

    The dictionary file is loaded with ``liwc.load_token_parser``; a separate
    CSV file supplies the column index used for each category name in the
    count arrays produced by :meth:`parse` and :meth:`parseSeries`.
    """

    # Runs of word characters; compiled once and reused by every tokenize call.
    _TOKEN_RE = re.compile(r'\w+')

    # How many of the most frequent unrecognized tokens parseSeries reports.
    _TOP_UNRECOGNIZED = 20

    def __init__(self, path, configpath):
        """Load the LIWC dictionary and the category-to-column mapping.

        Args:
            path: Dictionary file handed to ``liwc.load_token_parser``.
            configpath: CSV file with one header row followed by rows whose
                first field is an integer column index and whose second field
                is the category name.
        """
        self._parse, self.cat_names = liwc.load_token_parser(path)
        self.cat_dict = {}
        # newline='' is what the csv module asks for when it reads a file.
        with open(configpath, 'r', newline='') as config:
            rows = csv.reader(config)
            next(rows, None)  # skip the header row
            for row in rows:
                if not row:
                    # csv.reader yields [] for a blank line; ignore it
                    # instead of failing on row[1].
                    continue
                self.cat_dict[row[1]] = int(row[0])
        self.numcats = len(self.cat_names)

    def getCats(self):
        """Return a copy of the category-name -> column-index mapping."""
        return self.cat_dict.copy()

    def tokenize(self, string):
        """Yield each run of word characters in ``string``, lower-cased."""
        for match in self._TOKEN_RE.finditer(string.lower()):
            yield match.group(0)

    def parse(self, string, arr=None, unrecognized=None):
        """Add the category counts of one text to ``arr``.

        Args:
            string: Text to tokenize and count.
            arr: Array indexed by category column; updated in place. A new
                zero array of length ``numcats`` is created when omitted.
            unrecognized: Dict mapping tokens that matched no category to the
                number of times they were seen; updated in place. A new dict
                is created when omitted.

        Returns:
            A tuple ``(arr, unrecognized, total)`` where ``total`` is the
            number of tokens found in ``string``.
        """
        if arr is None:
            arr = np.zeros(self.numcats)
        if unrecognized is None:
            unrecognized = {}
        total = 0
        for token in self.tokenize(string):
            total += 1
            matched = False
            for cat in self._parse(token):
                arr[self.cat_dict[cat]] += 1
                matched = True
            if not matched:
                unrecognized[token] = unrecognized.get(token, 0) + 1
        return arr, unrecognized, total

    def parseSeries(self, series, normalizeRows=True, normalizeCols=False):
        """Build a (texts x categories) count matrix for a collection of texts.

        Progress and a summary of unrecognized tokens are printed to stdout.

        Args:
            series: A ``pandas.Series`` or a list of strings.
            normalizeRows: When true, divide every row by its L1 norm.
            normalizeCols: When true, divide every column by its L1 norm
                (applied after row normalization if both are requested).

        Returns:
            A float array of shape ``(len(series), numcats)``.

        Raises:
            TypeError: If ``series`` is neither a ``pandas.Series`` nor a list.
        """
        if not isinstance(series, (pd.Series, list)):
            raise TypeError("This accepts only pandas.Series or Lists")
        l = len(series)
        print("(LIWCUtils) Parsing Series of size", l)
        arr = np.zeros((l, self.numcats))
        unrecognized = {}
        count = 0
        # Iterate by position: series[i] on a pandas Series is a label lookup
        # and fails (or picks the wrong row) when the index is not 0..l-1.
        for i, text in enumerate(series):
            _, _, c = self.parse(text, arr[i], unrecognized)
            count += c
            if i % 1000 == 0:
                printProgress("(LIWCUtils) Parsed", i, l)
        printProgress("(LIWCUtils) Parsed", l, l)
        print()
        print(f"(LIWCUtils) Total number of tokens unrecognized = {sum(unrecognized.values())}/{count}")
        # The header is derived from the same constant as the slice below so
        # the announced count always matches the number of rows printed.
        print(f"(LIWCUtils) Top {self._TOP_UNRECOGNIZED} unrecognized tokens are:")
        print(f"{'Token':20}Freq")
        print(f"{'-----':20}----")
        most_frequent = sorted(unrecognized.items(), key=lambda x: x[1], reverse=True)
        for token, freq in most_frequent[:self._TOP_UNRECOGNIZED]:
            print(f"{token:20}{freq}")
        if normalizeRows:
            print(f"(LIWCUtils) Normalizing Rows")
            # Adding a very small number to avoid divide by zero errors
            s = np.linalg.norm(arr, ord=1, axis=1) + 1e-12
            arr = arr / s[:, None]
        if normalizeCols:
            print(f"(LIWCUtils) Normalizing Columns")
            s = np.linalg.norm(arr, ord=1, axis=0) + 1e-12
            arr = arr / s
        print(f"(LIWCUtils) Done!")
        return arr
# Example usage: build the helper from the LIWC dictionary file and the
# category-index CSV.
lc = LiwcUtils(
    '/fastdata/LIWC2015_English.dic',
    '/notebooks/adithya/liwc-experiments/config/liwc_cats.csv',
)
# Example usage: count categories for every text in `processed`. That name is
# not defined in the visible code and must be supplied before this line runs.
res = lc.parseSeries(processed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment