Created
August 18, 2015 21:56
-
-
Save jcrist/fce59a8afd89d2e20015 to your computer and use it in GitHub Desktop.
Convert reddit comment data to castra
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ujson | |
from datetime import datetime | |
import pandas as pd | |
from toolz import partition_all, dissoc | |
from castra import Castra | |
def to_json(line):
    """Parse one line of the comment dump into a flat record dict.

    ``line`` is a single JSON document. Its ``created_utc`` value, and its
    ``edited`` value when truthy, are interpreted as POSIX timestamps in
    seconds and converted to naive ``datetime`` objects expressed in UTC.
    A falsy ``edited`` value becomes ``pd.NaT``. The ``id``,
    ``retrieved_on`` and ``subreddit_id`` keys are dropped from the
    returned dict.
    """
    blob = ujson.loads(line)

    # utcfromtimestamp, not fromtimestamp: the latter converts to the local
    # timezone of whichever machine runs the script, so the values stored
    # under 'created_utc' would shift by the host's UTC offset.
    blob['created_utc'] = datetime.utcfromtimestamp(int(blob['created_utc']))

    edited = blob['edited']
    if edited:
        blob['edited'] = datetime.utcfromtimestamp(int(edited))
    else:
        blob['edited'] = pd.NaT

    return dissoc(blob, 'id', 'retrieved_on', 'subreddit_id')
def to_df(batch, columns=None, index=None, p=None):
    """Convert a batch of raw JSON lines into a DataFrame.

    Parameters
    ----------
    batch : iterable
        Raw lines; each one is parsed with ``to_json``.
    columns : list, optional
        Forwarded to ``pd.DataFrame.from_records`` as ``columns``.
    index : optional
        Column label handed to ``set_index``. When ``None`` the default
        index produced by ``from_records`` is kept.
    p : optional
        Object exposing ``map(func, iterable)`` (the script passes a
        ``multiprocessing.Pool``). When ``None`` the batch is parsed
        serially in the current process.

    Returns
    -------
    pandas.DataFrame
    """
    if p is None:
        # The signature allows omitting the pool; fall back to in-process
        # parsing instead of failing on ``None.map``.
        blobs = [to_json(line) for line in batch]
    else:
        blobs = p.map(to_json, batch)

    frame = pd.DataFrame.from_records(blobs, columns=columns)
    if index is None:
        # ``set_index(None)`` raises, so leave the frame as built.
        return frame
    return frame.set_index(index)
# Record fields kept in the output frame, in column order. The script
# section passes this list to to_df() as ``columns``.
columns = [
    'archived',
    'author',
    'author_flair_css_class',
    'author_flair_text',
    'body',
    'controversiality',
    'created_utc',
    'distinguished',
    'downs',
    'edited',
    'gilded',
    'link_id',
    'name',
    'parent_id',
    'removal_reason',
    'score',
    'score_hidden',
    'subreddit',
    'ups',
]

# Column names handed to Castra through its ``categories`` argument.
categories = [
    'distinguished',
    'subreddit',
    'removal_reason',
]
if __name__ == '__main__':
    # Change this to point to the input and output files
    IN_FILE = 'RC_2015-05.bz2'
    OUT_FILE = 'reddit_data.castra'

    from multiprocessing import Pool
    from bz2 import BZ2File

    # Worker pool used by to_df() to parse JSON lines in parallel.
    p = Pool()
    try:
        with BZ2File(IN_FILE) as fil:
            # Stream the compressed file in batches of 200000 lines.
            batches = partition_all(200000, fil)

            first = next(batches, None)
            if first is None:
                # An empty input would otherwise surface as a bare
                # StopIteration from next().
                raise SystemExit('%s contains no records' % IN_FILE)

            # The first batch doubles as the template passed to Castra
            # when the store is created, so it is built before the loop.
            df = to_df(first, columns, 'created_utc', p)
            castra = Castra(OUT_FILE, template=df, categories=categories)
            castra.extend(df)

            for bat in batches:
                castra.extend(to_df(bat, columns, 'created_utc', p))
    finally:
        # Always release the worker processes, including on failure.
        p.close()
        p.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment