Skip to content

Instantly share code, notes, and snippets.

@anishthite
Created December 15, 2020 05:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anishthite/c38718bf241697f3455ec7fc2bab070c to your computer and use it in GitHub Desktop.
Save anishthite/c38718bf241697f3455ec7fc2bab070c to your computer and use it in GitHub Desktop.
profanity
import lm_dataformat as lmd
from glob import glob
import os
import json
import collections
from tqdm import tqdm
import re
from best_download import download_file
import fasttext
import zstandard
import multiprocessing as mp
from profanity_check import predict
# Directory whose entries are globbed and processed as input files.
in_path = 'pile'
# Directory that receives the 'tmp_analysis_*' and 'analysis_*' output files.
out_path = 'pile_analysis'
def profanity(doc):
    """Count the profane three-word windows in a document.

    The document is split on runs of whitespace, and every run of three
    consecutive tokens is joined with single spaces and passed to
    ``profanity_check.predict``.

    Args:
        doc: Document text as a string.

    Returns:
        A dict with the single key ``'num_profane'`` holding a built-in
        ``int``: the sum of the per-window predictions. Documents with fewer
        than three tokens have no windows and yield 0.
    """
    words = re.split(r'\s+', doc)
    # zip stops at the shortest input, so this yields exactly the
    # len(words) - 2 sliding trigrams without needing itertools.
    trigrams = [' '.join(window) for window in zip(words, words[1:], words[2:])]
    if not trigrams:
        # Nothing to classify; do not hand the classifier an empty batch.
        return {'num_profane': 0}
    # Collapse the predictions to a built-in int so the result can be passed
    # straight to json.dumps.
    return {'num_profane': int(sum(predict(trigrams)))}
def writef(f, lines):
    """Stream byte chunks into a zstandard-compressed file.

    Args:
        f: Path of the file to create; it is opened in binary write mode.
        lines: Iterable of bytes objects. Each item is written to the
            compressed stream exactly as given, with no separator added
            between items. Progress is reported through tqdm.
    """
    with open(f, 'wb') as raw_out:
        zstd_writer = zstandard.ZstdCompressor(level=3, threads=8).stream_writer(raw_out)
        for chunk in tqdm(lines):
            zstd_writer.write(chunk)
        # Finish the zstd frame so buffered data reaches the file before it closes.
        zstd_writer.flush(zstandard.FLUSH_FRAME)
def analyze(ob):
    """Run every registered metric on one document and serialise the result.

    Args:
        ob: A ``(doc, meta)`` pair; ``meta`` must contain the key
            ``'pile_set_name'``.

    Returns:
        UTF-8 encoded JSON bytes of a dict holding ``'pile_set_name'`` plus
        the keys returned by each callable in the module-level ``metrics``
        list. When two metrics return the same key, the later one wins.
    """
    text, info = ob
    record = {'pile_set_name': info['pile_set_name']}
    for compute in metrics:
        record.update(compute(text))
    return json.dumps(record).encode('utf-8')
# Metric callables applied to every document by analyze(); each takes the
# document text and returns a dict that is merged into the output record.
metrics = [
    profanity,
]

# Guard the driver so that worker processes started with the "spawn" method
# can import this module without launching pools of their own.
if __name__ == '__main__':
    # writef() opens its target directly and does not create directories.
    os.makedirs(out_path, exist_ok=True)

    # The context manager tears the workers down even if a file fails midway.
    with mp.Pool(24) as pool:
        for f in tqdm(sorted(glob(in_path + '/*'))):
            name = os.path.basename(f)
            final_path = out_path + '/analysis_' + name
            # Inputs whose final output already exists are skipped, so an
            # interrupted run can be restarted.
            if os.path.exists(final_path):
                continue

            tmp_path = out_path + '/tmp_analysis_' + name
            rdr = lmd.Reader(f)
            # imap yields results lazily, in input order, as writef consumes them.
            writef(tmp_path, pool.imap(analyze, rdr.stream_data(get_meta=True)))
            # Rename only after a complete write so a partial file is never
            # mistaken for a finished one.
            os.rename(tmp_path, final_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment