import subprocess
import time
import json
import requests
import os
import glob
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import re
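
# Scans the NER-annotated transcript JSON files in lc-gov-audio-transcripts-ner/,
# counts named entities and n-grams, maps each hit back to the matching .mp3 in
# lc-gov-audio-data/, and writes the most frequent ones to do_the_damn_ner.json
# and do_the_damn_gram.json. good_ner_words and good_bigrams act as whitelists;
# stop_words feeds an optional (currently disabled) filter for n-grams made up
# entirely of stop words.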
good_ner_words = ['united states','the united states','washington','congress','american','america','senate','americans','this morning','the district court','district court','the supreme court','supreme court','california','democrats','house','iraq','d. c.','new york','obama','republicans','john','state','summary judgment','bush',"i'm talking","i'm glad",'texas','etcetera','memory','trust','good morning','u. s','white house','medicare','cdc','i apologize','legislature','the u. s.','english','court','corn','usda','fbi','roll','republican','afghanistan','depression','europe','more than one','siri','the white house','china','clinton',"the district court's",'chris','missouri','internet','florida','democrat','mexico','iran','canada','battle','mike','smith','mark','one percent','rock','counsel','clause','u. s. department of agriculture','the u. s. department of agriculture','one hundred percent','cancer','weii','george','bird','jim','gay','aids','jack','martin','u. s.','earth','minnesota','joe','ford','two hundred','democracy','okay','ohio','constitution','germany','india','five minutes','pennsylvania','honor','afternoon','africa','the house of representatives','john mccain','barack obama','billions of dollars','twenty twenty','democratic party','the democratic party','fifty million dollars']
stop_words = ["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b","back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near",
"nearly","necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular","particularly","past","per","perhaps","placed","please","plus","poorly","possible","possibly","potentially","pp","predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"]
good_bigrams = ["the court","the government","this court","the state","Your Honor","the District","the president","District Court","the statute","the evidence","your honor","with respect","Supreme Court","health care","the law","the jury","the country","the Senate","the language","the person","the defendant","Good morning","the judge","thousand dollars","this bill","cancer screening","entitled to","in Washington","the public","New York","Honor I","the money","that Congress","point is","lung cancer","nineteen eighty","an opportunity","million dollars","North Carolina","my client","state court","the military","final judgment","the defendants","you guys","the authority","substantial assistance","Wall Street","Excuse me","claims that"]
remove_stop_words = True      # intended toggle for the stop-word filter (the loop below is currently commented out)
words = {}                    # named entity -> total occurrence count
all_grams = {}                # n -> {n-gram string -> count}
all_grams_file_lookup = {}    # n-gram string -> list of mp3 files it appears in
all_ner_lookup = {}           # named entity -> list of mp3 files it appears in
gram_size_start = 2
gram_size_end = 16            # collect n-grams of length 2 through 15
all_files = list(glob.glob('lc-gov-audio-transcripts-ner/*.json'))
for file in all_files:
    datafile = json.load(open(file))
    # Count each non-date named entity and record which mp3 files it appears in.
    for i in datafile['ner']:
        w = i.strip().lower()
        if 'date' in datafile['ner'][i]['typeMode']:
            continue
        # print(datafile['ner'][i])
        if w not in words:
            words[w] = 0
        if w not in all_ner_lookup:
            all_ner_lookup[w] = []
        # map the transcript path to its source audio file
        file = file.replace('lc-gov-audio-transcripts-ner','lc-gov-audio-data').replace('.json','.mp3')
        if file not in all_ner_lookup[w]:
            all_ner_lookup[w].append(file)
        words[w] += 1
    # Rebuild the full transcript text, strip punctuation, and tokenize it.
    transcript = ""
    for x in datafile['results']['transcripts']:
        transcript = transcript + x['transcript']
    transcript = transcript.strip()
    transcript = re.sub(r'[^\w\s]', '', transcript)
    tokenize = word_tokenize(transcript)
    # Collect n-grams of each size from the tokenized transcript; bigrams are
    # kept only if they appear in the good_bigrams whitelist.
    grams = {}
    for n in range(gram_size_start, gram_size_end):
        if n not in all_grams:
            all_grams[n] = {}
        grams[n] = {}
        these_grams = list(ngrams(tokenize, n))
        for g in these_grams:
            all_stop_words = True
            # for w in g:
            #     if w.lower() not in stop_words:
            #         all_stop_words = False
            # if all_stop_words:
            #     continue
            g = " ".join(g)
            if n == 2 and g not in good_bigrams:
                continue
            if g not in all_grams[n]:
                all_grams[n][g] = 0
            all_grams[n][g] += 1
            if g not in all_grams_file_lookup:
                all_grams_file_lookup[g] = []
            file = file.replace('lc-gov-audio-transcripts-ner','lc-gov-audio-data').replace('.json','.mp3')
            if file not in all_grams_file_lookup[g]:
                all_grams_file_lookup[g].append(file)
# print(all_grams_file_lookup)
# print(all_grams[7])
# Keep the most frequent n-grams (count > 2) of each size, with the files they occur in.
do_the_damn_gram = {}
for nn in range(gram_size_start, gram_size_end):
    sorted_all_grams = sorted(all_grams[nn].items(), key=lambda kv: kv[1], reverse=True)
    for g in sorted_all_grams[:500]:
        # print(g[0])
        if g[1] > 2:
            do_the_damn_gram[g[0]] = {'count': g[1], 'files': all_grams_file_lookup[g[0]]}
            # print(do_the_damn_gram[g[0]])
json.dump(do_the_damn_gram, open('do_the_damn_gram.json', 'w'), indent=2)
# Keep the most frequent named entities that appear in the good_ner_words whitelist.
do_the_damn_ner = {}
sorted_words = sorted(words.items(), key=lambda kv: kv[1], reverse=True)
for x in sorted_words[:1000]:
    if x[0] in good_ner_words:
        do_the_damn_ner[x[0]] = {'count': x[1], 'files': all_ner_lookup[x[0]]}
json.dump(do_the_damn_ner, open('do_the_damn_ner.json', 'w'), indent=2)
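
# A minimal sketch (not part of the original script) of how the dumped JSON might
# be inspected afterwards; the filenames and the 'count'/'files' keys are exactly
# the ones written above:
#
#   top_grams = json.load(open('do_the_damn_gram.json'))
#   for phrase, info in list(top_grams.items())[:10]:
#       print(phrase, info['count'], len(info['files']))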