Last active
August 29, 2015 14:07
-
-
Save Kalli/790e7dc52953383838d5 to your computer and use it in GitHub Desktop.
Find Facebook newsfeed posts related to the "10 influential records" meme and parse the record and artist titles from them.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import facebook
import datetime
import urlparse
import json
import traceback
import time
import re

# Get an auth token with feed-reading permissions from
# https://developers.facebook.com/tools/explorer/ and paste it below
# in place of the placeholder.
graph = facebook.GraphAPI("$AUTH_TOKEN")

# Date range for the crawl: 15 September 2014 is roughly when the
# "10 influential records" trend reignited in Iceland.
startdate = datetime.date(2014, 9, 15)
postdate = datetime.datetime.now().date()

# Output file for the raw matched posts (written once, at the end of
# the crawl loop below).
f = open("/fbdata.json", "w")
def post_match(post):
    """Heuristically decide whether a feed post is a "10 influential
    records" list.

    A post matches when its message contains one of the known Icelandic
    or English meme keywords, or looks like a long dash-separated list
    (10+ line breaks and 10+ dashes), or contains more than ten lines
    that start with an enumeration like "1." / "2)" / "3,".

    post -- dict with at least a "message" key holding the post text.
    Returns True for a match, False otherwise.
    """
    message = post["message"]
    # Icelandic and English phrases that typically introduce the meme.
    keywords = [u"plötu", "influential albums", u"breiðskífur", "geisladiskar"]
    if any(word in message for word in keywords):
        return True
    # "Artist - Album" style lists: at least 10 lines and 10 dashes.
    if message.count("\n") > 9 and message.count("-") > 9:
        return True
    # Numbered lists. Raw string so "\d" reaches the regex engine intact
    # instead of relying on it surviving string-escape parsing.
    return len(re.findall(r"\n\d+(\.|\)|,)", message)) > 10
# set the time offset to the current time | |
until = int(time.time()) | |
data = [] | |
try: | |
while postdate > startdate: | |
print postdate | |
# get the 25 newsfeed posts leading up til the date | |
feed = graph.get_object("me/home", **{"until":until}) | |
# parse the feed data we've loaded | |
for post in feed["data"]: | |
if "message" in post and post_match(post): | |
print "found record related post from " + post["from"]["name"] | |
entry = {"name": post["from"]["name"], "message": post["message"]} | |
data.append(entry) | |
# get the parameters for the next round of data | |
until = urlparse.parse_qs(urlparse.urlparse(feed["paging"]["next"]).query)["until"][0] | |
postdate = datetime.datetime.strptime(feed["data"][0]["created_time"][:10], "%Y-%m-%d").date() | |
except Exception: | |
print "Last data fetched: "+str(until) | |
print(traceback.format_exc()) | |
f.write(json.dumps(data)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re

# Parse the raw posts dumped by the collector into a tab-separated
# "Artist<TAB>Album" listing, one record per line, sorted alphabetically.
albums = []
with open("/fbdata.json") as data_file:
    data = json.load(data_file)

for post in data:
    for line in post["message"].split("\n"):
        # Only "Artist - Album" style lines are of interest.
        if '-' in line:
            line = line.replace(u'\u2022', "")
            # Strip leading enumeration such as "1.", "2)" or "3,".
            # Raw string keeps the "\d" escape out of string parsing.
            line = re.sub(r"^\d+(\.|\)|,)", "", line)
            line = line.title().strip()
            l = line.split("-")[0] + "\t" + line.split("-")[1]
            albums.append(l.strip() + "\n")

# Open the output only once the rows exist, and let the context manager
# close (and flush) it.
with open("/fbdata-parsed.csv", "w") as f:
    f.writelines(l.encode("UTF-8") for l in sorted(albums))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment