Skip to content

Instantly share code, notes, and snippets.

@jcrist
Created August 18, 2015 21:56
Show Gist options
  • Save jcrist/fce59a8afd89d2e20015 to your computer and use it in GitHub Desktop.
Save jcrist/fce59a8afd89d2e20015 to your computer and use it in GitHub Desktop.
Convert Reddit comment data (a bz2-compressed file of JSON lines) to a Castra store
import ujson
from datetime import datetime
import pandas as pd
from toolz import partition_all, dissoc
from castra import Castra
def to_json(line):
    """Decode one JSON-encoded comment line into a record dict.

    Parameters
    ----------
    line : str or bytes
        A single line holding one JSON object.

    Returns
    -------
    dict
        The decoded object with ``created_utc`` and ``edited`` replaced by
        naive datetimes expressed in UTC (``edited`` becomes ``pd.NaT`` when
        it is falsy), and with the ``id``, ``retrieved_on`` and
        ``subreddit_id`` keys removed.

    Raises
    ------
    KeyError
        If the object has no ``created_utc`` or ``edited`` key.
    """
    blob = ujson.loads(line)

    # The timestamps are epoch seconds.  Convert them with the UTC variant so
    # the result does not depend on the timezone of the machine running the
    # conversion (``fromtimestamp`` would yield local wall-clock time, which
    # also jumps backwards across DST changes).
    # NOTE(review): assumes the epoch values are UTC, as the field name
    # ``created_utc`` suggests -- confirm against the data source.
    blob['created_utc'] = datetime.utcfromtimestamp(int(blob['created_utc']))

    edited = blob['edited']
    # A falsy ``edited`` value means there is no edit time to record.
    blob['edited'] = (datetime.utcfromtimestamp(int(edited))
                      if edited else pd.NaT)

    return dissoc(blob, 'id', 'retrieved_on', 'subreddit_id')
def to_df(batch, columns=None, index=None, p=None):
    """Convert a batch of raw JSON lines into a pandas DataFrame.

    Parameters
    ----------
    batch : iterable
        Raw lines, each passed to ``to_json``.
    columns : list of str, optional
        Columns to keep, forwarded to ``DataFrame.from_records``.
    index : str, optional
        Column to use as the index.  When ``None`` the frame is returned
        with its default index.
    p : object with a ``map(func, iterable)`` method, optional
        Worker pool used to parse the lines.  When ``None`` the lines are
        parsed serially in the calling process.

    Returns
    -------
    pandas.DataFrame
    """
    if p is None:
        # No pool supplied: fall back to in-process parsing instead of
        # failing on ``None.map``.
        blobs = [to_json(line) for line in batch]
    else:
        blobs = p.map(to_json, batch)

    frame = pd.DataFrame.from_records(blobs, columns=columns)
    if index is None:
        return frame
    return frame.set_index(index)
# Fields kept from each decoded record, in the column order of the frame.
columns = [
    'archived',
    'author',
    'author_flair_css_class',
    'author_flair_text',
    'body',
    'controversiality',
    'created_utc',
    'distinguished',
    'downs',
    'edited',
    'gilded',
    'link_id',
    'name',
    'parent_id',
    'removal_reason',
    'score',
    'score_hidden',
    'subreddit',
    'ups',
]

# Columns passed to Castra as ``categories``.
categories = [
    'distinguished',
    'subreddit',
    'removal_reason',
]
if __name__ == '__main__':
    # Change this to point to the input and output files
    IN_FILE = 'RC_2015-05.bz2'
    OUT_FILE = 'reddit_data.castra'

    from multiprocessing import Pool
    from bz2 import BZ2File

    # Worker pool used to parse each batch of lines in parallel.
    p = Pool()
    try:
        with BZ2File(IN_FILE) as fil:
            # Stream the compressed file in batches of 200000 lines so the
            # whole input is never held in memory at once.
            batches = partition_all(200000, fil)

            first = next(batches, None)
            if first is None:
                # Without a first batch there is no frame to use as the
                # template, so fail with a clear message rather than a bare
                # StopIteration.
                raise SystemExit('No records found in %s' % IN_FILE)

            # The first batch doubles as the template passed to Castra.
            df = to_df(first, columns, 'created_utc', p)
            castra = Castra(OUT_FILE, template=df, categories=categories)
            castra.extend(df)

            for bat in batches:
                castra.extend(to_df(bat, columns, 'created_utc', p))
    finally:
        # Always release the worker processes, including on error.
        p.close()
        p.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment