| import psycopg2, re, gzip | |
# NB. Fill in the psycopg2.connect() credentials and PUBLIC_SUBREDDIT_IDS below
| # Outputs two tab separated files (possibly gzip'd) where the first 3 | |
| # fields of each line are: | |
| # | |
| # 1. id of the item (base 36) | |
| # 2. the date (UTC time) | |
| # 3. total votes | |
| # | |
| # The rest of the line consists of each LW link separated by tabs | |
| # (including the possibility of no links, in which case there is no | |
| # trailing tab) | |
# Database connection -- the keyword values are intentionally blank;
# fill in the credentials before running.
connection = psycopg2.connect(database=,
                              user=,
                              password=,
                              host=)
# Output files (can be skipped by setting to None)
ARTICLE_FILENAME = 'article-links.txt'
COMMENT_FILENAME = 'comment-links.txt'
# use gzip to automatically compress the output?
# (appends .gz to file name)
GZ=False
# the ids of the public subreddits (Main and Discussion) -- fill in
# the numeric sr_id values as a tuple, e.g. (1, 2)
PUBLIC_SUBREDDIT_IDS=(,)
| # CODE | |
# CODE
cursor = connection.cursor()
# Take everything up to the first whitespace (trimming non-link
# information can be done later).
# The '.' before 'com' is escaped so the pattern matches only the
# literal domain "lesswrong.com", not e.g. "lesswrongXcom".
SEARCH_RE = re.compile(r'lesswrong\.com/\S*')
| # convert to the base-36 identifier used | |
# alphabet for the base-36 identifiers used in reddit-style permalinks
digits = '0123456789abcdefghijklmnopqrstuvwxyz'

def int2str(i):
    """Return the base-36 string representation of a non-negative int.

    Raises ValueError for negative input (the original loop would never
    terminate on negatives, since divmod(-1, 36) stays at -1 forever).
    """
    if i < 0:
        raise ValueError('int2str requires a non-negative integer')
    if i == 0:
        return '0'  # the original returned '' for 0
    build = []
    while i:
        i, rem = divmod(i, 36)
        build.append(digits[rem])
    # digits were collected least-significant first; reverse on join
    return ''.join(build[::-1])
def go(fname, query):
    """Run `query` and write one tab-separated line per result row to `fname`.

    Each row from the cursor is expected to be (id, date, karma, data);
    the line written is "id<TAB>date<TAB>karma[<TAB>link ...]" where the
    links are every SEARCH_RE match found in `data`.

    Honors the module-level GZ flag: when true, writes `fname`.gz through
    gzip instead (mode 'w' is kept for Python 2 text compatibility).
    """
    if GZ:
        f = gzip.open(fname + '.gz', 'w')
    else:
        f = open(fname, 'w')
    # try/finally so the output file is closed even if the query or row
    # processing raises (the original leaked the handle on error)
    try:
        cursor.execute(query)
        for item_id, date, karma, data in cursor:  # renamed: `id` shadows the builtin
            links = SEARCH_RE.findall(data)
            # convert to microsecondless, UTC time
            utc_date = date.replace(microsecond=0, tzinfo=None) - date.utcoffset()
            # print a tab sep line with "post-id date score[ link ...]"
            f.write('\t'.join([int2str(item_id), utc_date.isoformat(), str(karma)] +
                              links) + '\n')
    finally:
        f.close()
# Articles (filtered by subreddit, keeping only those specified as
# "public" above)
if ARTICLE_FILENAME:
    # Two self-joins on reddit_data_link: `data` supplies the article
    # body ('article' key) and `data2` restricts to rows whose 'sr_id'
    # value is one of the public subreddits.  The IN (...) list is
    # %-interpolated from the trusted PUBLIC_SUBREDDIT_IDS constant
    # (not user input), so string formatting is acceptable here.
    go(ARTICLE_FILENAME,
       '''SELECT thing.thing_id, thing.date, thing.ups - thing.downs, data.value
          FROM reddit_thing_link AS thing
          INNER JOIN reddit_data_link AS data
            ON thing.thing_id = data.thing_id
          INNER JOIN reddit_data_link AS data2
            ON thing.thing_id = data2.thing_id
          WHERE NOT thing.deleted AND NOT thing.spam AND data.key='article' AND
                data2.key='sr_id' AND data2.value IN (%s)''' \
       % ','.join("'%d'" % id for id in PUBLIC_SUBREDDIT_IDS))
# Comments (I think all comments are public, so no filtering needed?)
if COMMENT_FILENAME:
    # Same shape as the article query, but against the comment tables
    # and taking the 'body' key; no subreddit filter is applied.
    go(COMMENT_FILENAME,
       '''SELECT thing.thing_id, thing.date, thing.ups - thing.downs, data.value
          FROM reddit_thing_comment AS thing
          INNER JOIN reddit_data_comment AS data
            ON thing.thing_id = data.thing_id
          WHERE NOT thing.deleted AND NOT thing.spam AND data.key='body' ''')