Skip to content

Instantly share code, notes, and snippets.

@espio999
Created September 2, 2023 14:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save espio999/c724a19eba91a0fc2602a27d03085ab9 to your computer and use it in GitHub Desktop.
Save espio999/c724a19eba91a0fc2602a27d03085ab9 to your computer and use it in GitHub Desktop.
chatLZMA-Twitter
import json
import lzma
import nltk
import random
# "chatLZMA": prime a raw LZMA decoder with a corpus of tweets, then feed it
# random bytes and print whatever the decoder emits.

# LZMA2 at the highest preset; shared by the compressor and the decompressor,
# which must agree on the filter chain when FORMAT_RAW is used.
my_filters = [
    {"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME},
]

# Number of bytes dropped from the end of the flushed compressor output before
# it is handed to the decompressor.
TAIL_TRIM = 50


def build_corpus(texts):
    """Return every text JSON-encoded, concatenated, and UTF-8 encoded.

    Args:
        texts: iterable of strings (the ``text`` field of each tweet).

    Returns:
        bytes: the JSON string literals joined back to back with no separator.
    """
    # str.join avoids the quadratic cost of growing one string with ``+=``.
    return "".join(json.dumps(text) for text in texts).encode()


def prime_decompressor(corpus, filters=None):
    """Return a raw LZMA decompressor that has already consumed *corpus*.

    The corpus is compressed, and the compressed stream minus its last
    ``TAIL_TRIM`` bytes is fed to a fresh decompressor. The decoded output of
    that priming step is discarded; only the decoder's internal state matters.

    Args:
        corpus: bytes to compress and replay into the decompressor.
        filters: LZMA filter chain for both directions; defaults to
            ``my_filters``.

    Returns:
        lzma.LZMADecompressor: the primed decompressor.
    """
    if filters is None:
        filters = my_filters

    compressor = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=filters)
    body = compressor.compress(corpus)
    tail = compressor.flush()

    decompressor = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=filters)
    decompressor.decompress(body)
    # NOTE(review): the tail is cut short presumably so the decoder never sees
    # the end of the stream and keeps accepting further input -- confirm.
    decompressor.decompress(tail[:-TAIL_TRIM])
    return decompressor


def babble(decompressor, chunk_size=10):
    """Feed random bytes to *decompressor* and print what it decodes.

    Each round prints the round index, a tab, and the decoded chunk (bytes
    that are not valid UTF-8 are dropped). The loop stops at the first round
    the decompressor rejects, after printing the error.

    Args:
        decompressor: an ``lzma.LZMADecompressor``.
        chunk_size: number of random bytes supplied per round.

    Returns:
        int: the number of rounds that decoded successfully.
    """
    i = 0
    while True:
        try:
            decoded = decompressor.decompress(random.randbytes(chunk_size))
        except (lzma.LZMAError, EOFError) as e:
            # LZMAError: the random bytes were not a valid continuation.
            # EOFError: the decoder had already reached the end of its stream.
            print(e)
            break
        print(i, '\t', decoded.decode(errors="ignore"))
        i += 1
    return i


def main():
    """Download the Twitter sample corpus and run the random-byte generator."""
    nltk.download('twitter_samples')
    # The packaged corpus loader locates the download directory itself, so the
    # script no longer depends on a hard-coded /root/nltk_data path.
    tweets = nltk.corpus.twitter_samples.docs()
    corpus = build_corpus(tweet['text'] for tweet in tweets)
    babble(prime_decompressor(corpus))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment