Skip to content

Instantly share code, notes, and snippets.

View dpenfoldbrown's full-sized avatar

Dunc PB dpenfoldbrown

View GitHub Profile
@dpenfoldbrown
dpenfoldbrown / date_extract.py
Created October 21, 2016 22:58
Date extracting
def date_to_datetime(date_str, fstrs=None):
    """
    Given date string, return datetime object.

    Tries each strptime format in order and returns the first successful parse.

    2015 format: 2/8/13 0:00
    2016 format: 02/08/2013 00:00
                 02/12/2013 20:05

    :param date_str: date/time string to parse
    :param fstrs: optional list of strptime format strings to try in order
    :returns: datetime.datetime from the first matching format
    :raises ValueError: if no format in fstrs matches date_str
    """
    if fstrs is None:
        # Defaults cover both 2-digit (%y) and 4-digit (%Y) year exports.
        fstrs = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
    for f in fstrs:
        try:
            return datetime.strptime(date_str, f)
        except ValueError:
            # This format didn't match; try the next one.
            continue
    raise ValueError("No format in {0} matches date string {1!r}".format(fstrs, date_str))
@dpenfoldbrown
dpenfoldbrown / argparse_sample.py
Created September 11, 2013 20:46
Argparse argument example
# Minimal argparse demo: one required integer option parsed from sys.argv.
import argparse

parser = argparse.ArgumentParser(description="Sample usage of argparse")

# Every add_argument knob is shown at once for illustration only; in real code
# use just the ones you need (and for boolean flags prefer action="store_true").
parser.add_argument(
    "-a",
    "--first",
    action="store",
    type=int,
    dest="first_arg",
    required=True,
    default=23,
    help="An example first argument. Generally use only one of required or default. Leave out type for default (str).",
)

# parse_args() consumes sys.argv by default; an explicit list can be passed too.
args = parser.parse_args()
# Do this to update URL documents (assuming you already have url_dic and all other lists and dicts)
# NOTE(review): snippet truncated in this capture — the body of the final
# `if url_string in short_urls:` is missing, and `collection` / `short_urls`
# are defined elsewhere; confirm against the original gist before reuse.
# Walks every user document, then each tweet's url objects, to update URL docs.
# Do this to update URL documents (assuming you already have url_dic and all other lists and dicts)
for user in collection.find():
    for tweet in user['tweets']:
        urls = tweet['urls']
        for url in urls:
            url_string = url['url'] # DO NOT FORGET THIS. url is the url OBJECT, not the url string
            if url_string in short_urls:
@dpenfoldbrown
dpenfoldbrown / url_domain_annotate.py
Created August 28, 2013 18:27
Regex URLs to determine political leaning via labelled sources
# List of urls (pretend like it's populated)
urls = []

# Patterns to match in urls. In some cases the .org or .com suffix is included
# to avoid matching common words or letters (eg for npr or slate or today).
# Bug fix: dots are escaped (\.) because an unescaped "." matches ANY character
# in a regex, so r"npr.org" would also have matched e.g. "nprXorg".
# Add whatever other domains you want to match to the re OR (|) string
left_pattern = r"(?P<domain>nytimes|washingtonpost|npr\.org|abcnews|nbcnews|huffingtonpost|slate\.com|today\.com)"
center_pattern = r"(?P<domain>cnn|bbc\.co\.uk|yahoo)"
right_pattern = r"(?P<domain>foxnews|washingtontimes|usnews|chicagotribune)"
@dpenfoldbrown
dpenfoldbrown / dict_to_file.py
Created August 27, 2013 20:43
Dictionary write to file
# Write each key/value pair of a dict to a file, one tab-separated pair per line.
d = { 'a':1, 'b':2, 'c':3, 'd':4 }

# Bug fix: use a context manager instead of a bare open()/close() pair so the
# file handle is closed even if a write raises.
with open("dictfile.txt", 'w') as outhandle:
    for (key, val) in d.items():
        outhandle.write("{0}\t{1}\n".format(key, val))
@dpenfoldbrown
dpenfoldbrown / tweet_url_byday_count.py
Last active December 21, 2015 20:08
Code to annotate all URLs with liberal, conservative, center, or unknown based on known news sources (eg fox, abc, npr with known affiliations/leanings). Also counts the number of domains found for each in list
# Count tweets (and, per the gist title, URLs) per day across a Mongo collection.
# NOTE(review): snippet truncated in this capture — only the start of the
# aggregation loop is visible, and `collection` is set up elsewhere; confirm
# against the original gist before reuse.
import pymongo
# Set up DB client, db, collection
# Running counters keyed by the tweet's created_at date string.
date_tweet_count = {}
date_url_count = {}
for user in collection.find():
    for tweet in user['tweets']:
        datestr = tweet['created_at']
@dpenfoldbrown
dpenfoldbrown / user_tweet_url_count.py
Created August 27, 2013 20:13
User count aggregate things (# tweets, # urls, etc)
import pymongo

# Set up mongo DB here (get client, get database, get collection)
# Bug fix: only `pymongo` is imported, so a bare `MongoClient` raises
# NameError — qualify it with the module name.
client = pymongo.MongoClient("smapp", 27011)
dbh = client['GunControl']
collection = dbh['GunTweetUsers_zephyr']

# Aggregate counters (per the gist title: # tweets, # urls per user).
# NOTE(review): the keying scheme isn't visible here — confirm downstream use.
url_count = {}
tweet_count = {}
# Simple re examples in python
import re
# Filenames whose leading alphabetic chunk encodes a category (eg "re", "lrr").
files = ("re_pro_p00012.txt", "re_neg_p00014.txt", "lrr_p00014.txt")
# Captures the leading letters before the first underscore as group "category".
category_pattern = r"(?P<category>[a-zA-Z]+)_.*"
# NOTE(review): snippet may be truncated here — `match` is bound but unused in
# the visible lines; the continuation presumably reads match.group("category").
for file in files:
    match = re.match(category_pattern, file)