Last active
August 29, 2015 14:07
-
-
Save Uberi/a373b3f666272de75d52 to your computer and use it in GitHub Desktop.
Facebook chat data processing helper script. Run `normalize.py` to normalize the raw JSON data into cleaner JSON, then run `process.py` to calculate some interesting stats.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input: raw Facebook chat export (JSON); output: cleaned, time-sorted JSON.
path = "data.json"
new_path = "normalized_data.json"
from urllib.parse import urlparse, parse_qs
import json
# Map of Facebook user ids ("fbid:<number>") to display names; get_entry
# uses this to replace each message's author id with a readable name.
# KeyError on an unknown id is intentional: it surfaces missing mappings.
users = {
    "fbid:100001608518631": "Anthony Zhang",
    "fbid:1538370516": "Keri Warr",
    "fbid:814604197": "Daniel Hopper",
    "fbid:100002673677412": "Elvin Yung",
    "fbid:651304852": "Thomas Hansen",
    "fbid:1060246902": "Jordan Verasamy",
    "fbid:1259212291": "Joey Rideout",
    "fbid:1270271322": "Joey Pereira",
    "fbid:1599706473": "Hart Andrin",
    "fbid:1617756404": "Enoch Lo",
    "fbid:100000058532370": "Kushal Kumarasinghe",
    "fbid:100000060523766": "Nerman Nicholas",
    "fbid:606323988": "Vidur Kumar",
    "fbid:100003050986762": "Viktor Chynarov",
    "fbid:1112552782": "Leon Li",
}
def get_attachment(attach):
    """Normalize one attachment dict into a single-element list holding a
    "type:url" description string."""
    link = attach["url"]
    if link.startswith("/ajax/mercury/attachments/photo/view"):
        # Photo-viewer links wrap the real image URL in a "uri" query param.
        query = parse_qs(urlparse(link).query)
        link = query["uri"][0]
    elif link.startswith("/"):
        # Site-relative link: make it absolute.
        link = "https://facebook.com" + link
    return [attach["attach_type"] + ":" + link]
def get_body(entry):
    """Return the display text for a raw chat entry.

    Regular messages store their text under "body"; conversation log events
    (renames, member adds, etc.) store it under "log_message_body".

    Raises:
        ValueError: if the entry has neither field.
    """
    for key in ("body", "log_message_body"):
        if key in entry:
            return entry[key]
    # ValueError instead of bare Exception; it is still an Exception
    # subclass, so any existing broad handler keeps working.
    raise ValueError("Bad entry:\n" + str(entry))
def get_entry(entry):
    """Serialize one raw chat entry as a compact JSON array string:
    [unix timestamp, author display name, body text, attachment descriptions]."""
    attachments = []
    if "attachments" in entry:
        # Some exports put non-dict placeholders in "attachments"; skip those.
        attachments = [get_attachment(a) for a in entry["attachments"] if isinstance(a, dict)]
    record = [
        entry["timestamp"],      # unix timestamp
        users[entry["author"]],  # resolve fbid -> display name
        get_body(entry),         # message text
        attachments,
    ]
    return json.dumps(record)
# Load the raw export and sort chronologically so output is ordered by time.
# Context managers close the handles promptly (the originals were leaked).
with open(path, "r") as f:
    data = json.load(f)
data = sorted(data, key=lambda entry: entry["timestamp"])
# Output the normalized message data: one JSON array with each entry
# (already serialized by get_entry) on its own line, for easy diffing.
messages = [get_entry(entry) for entry in data]
with open(new_path, "w") as f:
    f.write("[\n" + ",\n".join(messages) + "\n]\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stats over the normalized [timestamp, author, body, attachments] entries
# produced by normalize.py.
path = "normalized_data.json"
import json
import sys
import re
from collections import defaultdict
# Context manager closes the handle (the original leaked the file object).
with open(path, "r") as f:
    data = json.load(f)
# Tally totals per author in one pass over the normalized entries.
user_characters = defaultdict(int)  # author -> total characters typed
user_messages = defaultdict(int)    # author -> number of messages sent
for entry in data:
    author, body = entry[1], entry[2]
    user_characters[author] += len(body)
    user_messages[author] += 1
print("Characters typed by user, highest first:")
print(sorted(user_characters.items(), key=lambda item: -item[1]))
print("Number of messages by user, highest first:")
print(sorted(user_messages.items(), key=lambda item: -item[1]))
print("Average characters per message by user, highest first:")
user_average_chars = {
    author: chars / user_messages[author]
    for author, chars in user_characters.items()
}
print(sorted(user_average_chars.items(), key=lambda item: -item[1]))
# Count expletive-containing words per author. Substring matching is
# deliberate so derivatives/compounds (e.g. "bullshit", "fucking") count too.
EXPLETIVES = ("shit", "piss", "fuck", "cunt", "cocksucker", "motherfucker", "tits")
words = defaultdict(int)  # author -> expletive word count
for entry in data:
    for word in entry[2].split():
        if any(expletive in word for expletive in EXPLETIVES):
            words[entry[1]] += 1
# Normalize by message count so chatty users aren't unfairly ranked.
frequencies = sorted([(k, v / user_messages[k]) for k, v in words.items()], key=lambda x: -x[1])
result = "\n".join([k + ":\t" + str(v) for k, v in frequencies])
print("Expletives per message:\n" + result)
# Word frequencies across all messages (case-insensitive), written to disk.
words = defaultdict(int)  # lowercased word -> occurrence count
for entry in data:
    for word in entry[2].split():
        words[word.lower()] += 1
frequencies = sorted(words.items(), key=lambda x: -x[1])
result = "\n".join([k + ":\t" + str(v) for k, v in frequencies])
# Binary mode with an explicit UTF-8 encode avoids platform default-encoding
# surprises; the context manager closes the handle (originally leaked).
with open("word_freqs.txt", "wb") as f:
    f.write(result.encode("UTF-8"))
# Preliminary funniness detection: a message starting with a rating like
# "8/10" is assumed to be a reaction, so capture it plus up to the 6
# preceding messages for context.
funny = re.compile(r"\d+/10")  # raw string so \d is a regex escape, not a string escape
result = ""
for i, entry in enumerate(data):
    if funny.match(entry[2]):
        # Clamp at 0: with the original range(i - 6, ...), a match in the
        # first 6 messages produced negative indices that wrapped around
        # to the END of the list.
        for j in range(max(0, i - 6), i + 1):
            result += data[j][1] + ":\t" + data[j][2] + "\n"
        result += "\n"
#print(result.encode(sys.stdout.encoding, errors="replace").decode(sys.stdout.encoding))
# Count smiley usage per author (the original comment said "word
# frequencies", copy-pasted from the block above).
pattern = re.compile(r":\)|:D|:\(|:/|:O|:\$")  # raw string for the regex escapes
words = defaultdict(int)  # author -> smiley count
for entry in data:
    for word in entry[2].split():
        # match() anchors at the start, so only words BEGINNING with a
        # smiley are counted.
        if pattern.match(word):
            words[entry[1]] += 1
frequencies = sorted(words.items(), key=lambda x: -x[1])
result = "\n".join([k + ":\t" + str(v) for k, v in frequencies])
print("Smilies per user:\n" + result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment