Skip to content

Instantly share code, notes, and snippets.

@vanatteveldt
Created January 6, 2020 22:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vanatteveldt/7fdb225c2f22b6f5d030e600377dca30 to your computer and use it in GitHub Desktop.
Save vanatteveldt/7fdb225c2f22b6f5d030e600377dca30 to your computer and use it in GitHub Desktop.
import re
import csv
import sys
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from collections import Counter
# Count word frequencies over the 'title' and 'text' fields of every document
# whose 'sets' field matches `setid` in the given Elasticsearch index, then
# write the result to stdout as CSV (columns: n, word), most frequent first.
es = Elasticsearch("localhost:9201")
index = "amcat_vu"
setid = 1021
query = {"query": {"term": {"sets": setid}}}

# Raw string: "\w" inside a plain string literal is an invalid escape sequence.
# Compiled once here so the pattern lookup is hoisted out of the document loop.
word_re = re.compile(r"\w+")

freqs = Counter()
for i, result in enumerate(scan(es, query, index=index)):
    # Progress indicator goes to stderr so it does not pollute the CSV on stdout.
    if not i % 1000:
        print(i, file=sys.stderr)
    doc = result['_source']
    for field in ('title', 'text'):
        t = doc.get(field)
        # Skip absent/null/empty fields rather than crashing on them.
        if t:
            freqs.update(word_re.findall(t.lower()))

o = csv.writer(sys.stdout)
o.writerow(["n", "word"])
# most_common() with no argument yields every (word, count) pair, sorted by
# descending count.
for w, n in freqs.most_common():
    o.writerow([n, w])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment