Skip to content

Instantly share code, notes, and snippets.

@fauxneticien
Created September 3, 2022 17:35
Show Gist options
  • Save fauxneticien/a75ff73c79a6ca3371c1006d88ce11e6 to your computer and use it in GitHub Desktop.
Save fauxneticien/a75ff73c79a6ca3371c1006d88ce11e6 to your computer and use it in GitHub Desktop.
Script to process vocabulary
import pandas as pd
from collections import Counter
from tqdm.contrib.concurrent import process_map
def get_vocab(texts_list, ids_list=None, chunksize=1000):
    """Count the characters of every text and of the collection as a whole.

    Each text is converted to a ``Counter`` of its elements (characters,
    when the texts are strings) with ``process_map``, which distributes
    the work over worker processes and shows a progress bar.

    Args:
        texts_list: Iterable of texts, one entry per document.
        ids_list: Optional identifiers, one per text. When omitted, the
            positions ``0 .. len(texts_list) - 1`` are used.
        chunksize: Number of texts handed to a worker at a time; passed
            straight through to ``process_map``.

    Returns:
        A ``(char_aggs, text_char_df)`` tuple where ``char_aggs`` is a
        ``Counter`` with the totals over all texts and ``text_char_df`` is
        a DataFrame with the columns ``id`` and ``text`` followed by one
        integer count column per distinct character.

    Raises:
        ValueError: If ``ids_list`` and ``texts_list`` differ in length.
    """
    def sum_counters(counter_list):
        """Merge all counters into a single new ``Counter``.

        Accumulating in place with ``update`` touches every entry exactly
        once, whereas repeated ``Counter + Counter`` builds a fresh
        Counter (and re-copies the running total) at every step. The
        per-text counts are always positive, so both approaches yield the
        same totals.
        """
        total = Counter()
        for counter in counter_list:
            total.update(counter)
        return total

    # Plain lists keep both frames below on a default RangeIndex; a pandas
    # Series with a custom index would otherwise misalign in the concat.
    texts_list = list(texts_list)
    if ids_list is None:
        ids_list = range(len(texts_list))
    else:
        ids_list = list(ids_list)
        # Fail before the expensive parallel pass rather than after it.
        if len(ids_list) != len(texts_list):
            raise ValueError(
                "ids_list and texts_list must have the same length "
                f"(got {len(ids_list)} and {len(texts_list)})"
            )

    char_counts = process_map(Counter, texts_list, chunksize=chunksize)

    # Document-character counts matrix: characters absent from a text
    # come out of the DataFrame constructor as NaN, hence fillna(0).
    text_char_df = pd.concat([
        pd.DataFrame({'id': ids_list, 'text': texts_list}),
        pd.DataFrame(char_counts).fillna(0).astype(int),
    ], axis=1)

    # Aggregates
    char_aggs = sum_counters(char_counts)

    return char_aggs, text_char_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment