# Gist: load SignalMedia JSONL headlines/descriptions and cache the result via pickle.
from nltk.tokenize import word_tokenize
import pickle
import pprint
import json
"""
(heads, descs, keywords) = ([headline], [description], )
"""
DATA_FILE = 'data/signalmedia-1m.jsonl.test'
PICKLE_FILE = 'data/tokens.pkl'
def write_to_pickle(filename, data):
    """Serialize *data* to *filename* using pickle (binary mode)."""
    with open(filename, 'wb') as sink:
        pickle.dump(data, sink)
def read_from_pickle(filename):
    """Load and return the pickled object stored in *filename*.

    Opens the file in binary mode ('rb'): pickle data is bytes, and the
    original text-mode 'r' open makes pickle.load fail on Python 3.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)
def get_json_data(filename):
    """Parse a JSONL file of articles into parallel lists.

    Args:
        filename: path to a newline-delimited JSON file; each line must be
            an object with at least "title" and "content" keys.

    Returns:
        A tuple (heads, descs, None): the list of titles, the list of
        contents, and a placeholder for keywords (not extracted here).

    Raises:
        KeyError: if a line lacks the "title" or "content" key.
        json.JSONDecodeError: if a line is not valid JSON.
    """
    HEADER = "title"
    DESCRIPTION = "content"
    heads = []
    descs = []
    with open(filename, 'r') as json_data:
        for line in json_data:
            # Parse each line exactly once; the original called
            # json.loads twice per record, doing double the work.
            article = json.loads(line)
            heads.append(article[HEADER])
            descs.append(article[DESCRIPTION])
    return heads, descs, None
def _main():
    """Extract (heads, descs, None) from the corpus, round-trip it through
    the pickle cache, and pretty-print the result."""
    data = get_json_data(DATA_FILE)
    write_to_pickle(PICKLE_FILE, data)
    # Read back from the cache to exercise the full round trip.
    data = read_from_pickle(PICKLE_FILE)
    pprint.pprint(data)


# Guard the entry point so importing this module no longer performs
# file I/O as a side effect (the original ran unconditionally).
if __name__ == "__main__":
    _main()
# (end of gist)