# adithya-badidey/LiwcUtils.py

## LiwcUtils.py
import csv
import re

import liwc
import numpy as np
import pandas as pd
import seaborn as sns

def printProgress(message, progress, limit):
    """Rewrite the current console line with ``message (progress/limit)``.

    The text is left-aligned and padded with spaces to 100 characters so a
    shorter status fully overwrites a longer previous one.  The leading
    carriage return moves the cursor back to the start of the line and no
    newline is emitted, so successive calls update the same line.
    """
    status_line = f"{message} ({progress}/{limit})"
    padded = "\r{: <100}".format(status_line)
    print(padded, end="")

class LiwcUtils:
    """Count LIWC category hits in text using a ``liwc`` token parser.

    The dictionary file is loaded with ``liwc.load_token_parser``; a CSV
    config file supplies the mapping from category name to the column index
    used when filling count vectors.
    """

    # A token is a maximal run of word characters.
    _TOKEN_RE = re.compile(r'\w+', re.UNICODE)
    # How many unrecognized tokens the parseSeries report lists.
    _TOP_UNRECOGNIZED = 10

    def __init__(self, path, configpath):
        """Load the LIWC dictionary and the category-index config.

        Args:
            path: dictionary file handed to ``liwc.load_token_parser``.
            configpath: CSV file with one header line, then rows whose first
                field is an integer index and second field a category name.
        """
        self._parse, self.cat_names = liwc.load_token_parser(path)
        self.cat_dict = {}
        # newline='' is what the csv module asks for when reading from a file.
        with open(configpath, 'r', newline='') as config:
            config.readline()  # skip the header line
            for row in csv.reader(config):
                if not row:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                self.cat_dict[row[1]] = int(row[0])
        # NOTE(review): vector length comes from the dictionary's categories
        # while column indices come from the config file; the two are assumed
        # to agree -- confirm against the config in use.
        self.numcats = len(self.cat_names)

    def getCats(self):
        """Return a shallow copy of the category-name -> index mapping."""
        return self.cat_dict.copy()

    def tokenize(self, string):
        """Yield the lower-cased word tokens (``\\w+`` runs) of ``string``."""
        for match in self._TOKEN_RE.finditer(string.lower()):
            yield match.group(0)

    def parse(self, string, arr=None, unrecognized=None):
        """Count category hits for every token of ``string``.

        Args:
            string: text to tokenize and look up.
            arr: optional count vector that is incremented in place; a fresh
                zero vector of length ``numcats`` is created when omitted.
            unrecognized: optional dict (token -> frequency) updated in place
                with tokens that matched no category; created when omitted.

        Returns:
            ``(arr, unrecognized, total)`` where ``total`` is the number of
            tokens seen in ``string``.
        """
        if arr is None:
            arr = np.zeros(self.numcats)
        if unrecognized is None:
            unrecognized = {}
        total = 0
        for token in self.tokenize(string):
            total += 1
            matched = False
            for cat in self._parse(token):
                arr[self.cat_dict[cat]] += 1
                matched = True
            if not matched:
                unrecognized[token] = unrecognized.get(token, 0) + 1
        return arr, unrecognized, total

    def parseSeries(self, series, normalizeRows=True, normalizeCols=False):
        """Parse every text in ``series`` into a matrix of category counts.

        Progress and a summary of unrecognized tokens are printed to stdout.

        Args:
            series: ``pandas.Series`` or ``list`` of strings.
            normalizeRows: divide each row by its L1 norm.
            normalizeCols: divide each column by its L1 norm (applied after
                row normalization when both are requested).

        Returns:
            Array of shape ``(len(series), numcats)``.

        Raises:
            TypeError: if ``series`` is neither a ``pandas.Series`` nor a list.
        """
        if not isinstance(series, (pd.Series, list)):
            raise TypeError("This accepts only pandas.Series or Lists")

        size = len(series)
        print("(LIWCUtils) Parsing Series of size", size)
        arr = np.zeros((size, self.numcats))
        unrecognized = {}
        count = 0
        # Iterate values positionally: series[i] is label-based on a
        # pandas.Series and breaks when the index is not 0..n-1.
        for i, text in enumerate(series):
            _, _, seen = self.parse(text, arr[i], unrecognized)
            count += seen
            if i % 1000 == 0:
                printProgress("(LIWCUtils) Parsed", i, size)

        printProgress("(LIWCUtils) Parsed", size, size)
        print()

        print(f"(LIWCUtils) Total number of tokens unrecognized = {sum(unrecognized.values())}/{count}")

        print("(LIWCUtils) Top ten unrecognized tokens are:")
        print(f"{'Token':20}Freq")
        print(f"{'-----':20}----")
        ranked = sorted(unrecognized.items(), key=lambda item: item[1], reverse=True)
        for token, freq in ranked[:self._TOP_UNRECOGNIZED]:
            print(f"{token:20}{freq}")

        if normalizeRows:
            print("(LIWCUtils) Normalizing Rows")
            # The tiny offset avoids division by zero for all-zero rows.
            s = np.linalg.norm(arr, ord=1, axis=1) + 1e-12
            arr = arr / s[:, None]

        if normalizeCols:
            print("(LIWCUtils) Normalizing Columns")
            s = np.linalg.norm(arr, ord=1, axis=0) + 1e-12
            arr = arr / s

        print("(LIWCUtils) Done!")
        return arr

# Initialize using this line
# The first argument is the LIWC dictionary file passed to
# liwc.load_token_parser; the second is the CSV config mapping category
# names to indices.  Both are absolute paths, and this statement runs
# whenever the file is executed or imported.
lc = LiwcUtils('/fastdata/LIWC2015_English.dic', '/notebooks/adithya/liwc-experiments/config/liwc_cats.csv')

# Parse series using this command
# NOTE(review): `processed` is not defined anywhere in this file; presumably
# it is a pandas.Series or list of strings supplied by the calling
# notebook/session -- confirm before running this file as a script.
res = lc.parseSeries(processed)
	import liwc
	import csv
	import numpy as np
	import re
	import seaborn as sns

	def printProgress(message, progress, limit):
	    """Rewrite the current console line with ``message (progress/limit)``.

	    The text is left-aligned and padded with spaces to 100 characters so a
	    shorter status fully overwrites a longer previous one.  The leading
	    carriage return moves the cursor back to the start of the line and no
	    newline is emitted, so successive calls update the same line.
	    """
	    output = f"{message} ({progress}/{limit})"
	    print("\r{: <100}".format(output), end="")

	class LiwcUtils:
	    """Count LIWC category hits in text using a ``liwc`` token parser.

	    The dictionary file is loaded with ``liwc.load_token_parser``; a CSV
	    config file supplies the mapping from category name to the column index
	    used when filling count vectors.
	    """

	    # A token is a maximal run of word characters.
	    _TOKEN_RE = re.compile(r'\w+', re.UNICODE)
	    # How many unrecognized tokens the parseSeries report lists.
	    _TOP_UNRECOGNIZED = 10

	    def __init__(self, path, configpath):
	        """Load the LIWC dictionary and the category-index config.

	        Args:
	            path: dictionary file handed to ``liwc.load_token_parser``.
	            configpath: CSV file with one header line, then rows whose first
	                field is an integer index and second field a category name.
	        """
	        self._parse, self.cat_names = liwc.load_token_parser(path)
	        self.cat_dict = {}
	        # newline='' is what the csv module asks for when reading from a file.
	        with open(configpath, 'r', newline='') as config:
	            config.readline()  # skip the header line
	            for row in csv.reader(config):
	                if not row:
	                    continue  # tolerate blank lines (e.g. a trailing newline)
	                self.cat_dict[row[1]] = int(row[0])
	        # NOTE(review): vector length comes from the dictionary's categories
	        # while column indices come from the config file; the two are assumed
	        # to agree -- confirm against the config in use.
	        self.numcats = len(self.cat_names)

	    def getCats(self):
	        """Return a shallow copy of the category-name -> index mapping."""
	        return self.cat_dict.copy()

	    def tokenize(self, string):
	        """Yield the lower-cased word tokens (``\\w+`` runs) of ``string``."""
	        for match in self._TOKEN_RE.finditer(string.lower()):
	            yield match.group(0)

	    def parse(self, string, arr=None, unrecognized=None):
	        """Count category hits for every token of ``string``.

	        Args:
	            string: text to tokenize and look up.
	            arr: optional count vector that is incremented in place; a fresh
	                zero vector of length ``numcats`` is created when omitted.
	            unrecognized: optional dict (token -> frequency) updated in place
	                with tokens that matched no category; created when omitted.

	        Returns:
	            ``(arr, unrecognized, total)`` where ``total`` is the number of
	            tokens seen in ``string``.
	        """
	        if arr is None:
	            arr = np.zeros(self.numcats)
	        if unrecognized is None:
	            unrecognized = {}
	        total = 0
	        for token in self.tokenize(string):
	            total += 1
	            matched = False
	            for cat in self._parse(token):
	                arr[self.cat_dict[cat]] += 1
	                matched = True
	            if not matched:
	                unrecognized[token] = unrecognized.get(token, 0) + 1
	        return arr, unrecognized, total

	    def parseSeries(self, series, normalizeRows=True, normalizeCols=False):
	        """Parse every text in ``series`` into a matrix of category counts.

	        Progress and a summary of unrecognized tokens are printed to stdout.

	        Args:
	            series: ``pandas.Series`` or ``list`` of strings.
	            normalizeRows: divide each row by its L1 norm.
	            normalizeCols: divide each column by its L1 norm (applied after
	                row normalization when both are requested).

	        Returns:
	            Array of shape ``(len(series), numcats)``.

	        Raises:
	            TypeError: if ``series`` is neither a ``pandas.Series`` nor a list.
	        """
	        if not isinstance(series, (pd.Series, list)):
	            raise TypeError("This accepts only pandas.Series or Lists")

	        size = len(series)
	        print("(LIWCUtils) Parsing Series of size", size)
	        arr = np.zeros((size, self.numcats))
	        unrecognized = {}
	        count = 0
	        # Iterate values positionally: series[i] is label-based on a
	        # pandas.Series and breaks when the index is not 0..n-1.
	        for i, text in enumerate(series):
	            _, _, seen = self.parse(text, arr[i], unrecognized)
	            count += seen
	            if i % 1000 == 0:
	                printProgress("(LIWCUtils) Parsed", i, size)

	        printProgress("(LIWCUtils) Parsed", size, size)
	        print()

	        print(f"(LIWCUtils) Total number of tokens unrecognized = {sum(unrecognized.values())}/{count}")

	        print("(LIWCUtils) Top ten unrecognized tokens are:")
	        print(f"{'Token':20}Freq")
	        print(f"{'-----':20}----")
	        ranked = sorted(unrecognized.items(), key=lambda item: item[1], reverse=True)
	        for token, freq in ranked[:self._TOP_UNRECOGNIZED]:
	            print(f"{token:20}{freq}")

	        if normalizeRows:
	            print("(LIWCUtils) Normalizing Rows")
	            # The tiny offset avoids division by zero for all-zero rows.
	            s = np.linalg.norm(arr, ord=1, axis=1) + 1e-12
	            arr = arr / s[:, None]

	        if normalizeCols:
	            print("(LIWCUtils) Normalizing Columns")
	            s = np.linalg.norm(arr, ord=1, axis=0) + 1e-12
	            arr = arr / s

	        print("(LIWCUtils) Done!")
	        return arr

	# Initialize using this line
	# The first argument is the LIWC dictionary file passed to
	# liwc.load_token_parser; the second is the CSV config mapping category
	# names to indices.  Both are absolute paths.
	lc = LiwcUtils('/fastdata/LIWC2015_English.dic', '/notebooks/adithya/liwc-experiments/config/liwc_cats.csv')

	# Parse series using this command
	# NOTE(review): `processed` is not defined anywhere in this file; presumably
	# it is a pandas.Series or list of strings supplied by the calling
	# notebook/session -- confirm before running.
	res = lc.parseSeries(processed)