Facebook chat data processing helper scripts. Run `normalize.py` to normalize the raw JSON chat export into cleaner JSON, then run `process.py` to calculate some interesting stats (per-user message counts, word frequencies, expletive and smiley usage).
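For reference, `normalize.py` expects each raw entry to carry at least `timestamp`, `author`, and either `body` or `log_message_body`; this shape is inferred from the fields the script reads, and the values below are placeholders rather than real data:

# raw entry, as found in data.json (assumed shape, placeholder values):
# {"timestamp": 1412345678, "author": "fbid:100001608518631", "body": "hello",
#  "attachments": [{"attach_type": "photo",
#                   "url": "/ajax/mercury/attachments/photo/view?uri=https%3A%2F%2Fexample.com%2Fpic.jpg"}]}
#
# corresponding normalized entry, as emitted by get_entry() below:
# [1412345678, "Anthony Zhang", "hello", [["photo:https://example.com/pic.jpg"]]]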
# normalize.py: convert the raw JSON chat export into a clean, chronologically sorted JSON array.
import json
from urllib.parse import urlparse, parse_qs

path = "data.json"
new_path = "normalized_data.json"

# map Facebook user IDs to display names
users = {
    "fbid:100001608518631": "Anthony Zhang",
    "fbid:1538370516": "Keri Warr",
    "fbid:814604197": "Daniel Hopper",
    "fbid:100002673677412": "Elvin Yung",
    "fbid:651304852": "Thomas Hansen",
    "fbid:1060246902": "Jordan Verasamy",
    "fbid:1259212291": "Joey Rideout",
    "fbid:1270271322": "Joey Pereira",
    "fbid:1599706473": "Hart Andrin",
    "fbid:1617756404": "Enoch Lo",
    "fbid:100000058532370": "Kushal Kumarasinghe",
    "fbid:100000060523766": "Nerman Nicholas",
    "fbid:606323988": "Vidur Kumar",
    "fbid:100003050986762": "Viktor Chynarov",
    "fbid:1112552782": "Leon Li",
}
def get_attachment(attach):
    """Describe an attachment as a single "type:url" string wrapped in a list."""
    url = attach["url"]
    if url.startswith("/ajax/mercury/attachments/photo/view"):
        # photo viewer links carry the real image URL in their "uri" query parameter
        url = parse_qs(urlparse(url).query)["uri"][0]
    elif url.startswith("/"):
        url = "https://facebook.com" + url
    return [attach["attach_type"] + ":" + url]
def get_body(entry):
    """Return the message text; some entries store it under "log_message_body" instead of "body"."""
    if "body" in entry:
        return entry["body"]
    elif "log_message_body" in entry:
        return entry["log_message_body"]
    else:
        raise Exception("Bad entry:\n" + str(entry))
def get_entry(entry):
    result = [
        entry["timestamp"], # unix timestamp
        users[entry["author"]], # message author
        get_body(entry), # message value
        [get_attachment(attach) for attach in entry["attachments"] if isinstance(attach, dict)] if "attachments" in entry else [],
    ]
    #if "coordinates" in entry:
    #    result.append(entry["coordinates"])
    return json.dumps(result)
with open(path, "r") as f:
    data = json.load(f)
data = sorted(data, key=lambda entry: entry["timestamp"])

# output the normalized message data, one JSON-encoded message per line
messages = [get_entry(entry) for entry in data]
# note: json.dump(messages, ...) would double-encode, since get_entry() already returns JSON strings
with open(new_path, "w") as f:
    f.write("[\n" + ",\n".join(messages) + "\n]\n")
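Both scripts assume Python 3 (`urllib.parse`, and true division in the per-user averages). Assuming the raw export is saved as `data.json` next to the scripts, a typical run is:

python3 normalize.py    # writes normalized_data.json
python3 process.py      # prints the stats and writes word_freqs.txt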
path = "normalized_data.json"
import json, sys, re
from collections import defaultdict
data = json.load(open(path, "r"))
user_characters = defaultdict(int)
user_messages = defaultdict(int)
for entry in data:
    user_characters[entry[1]] += len(entry[2])
    user_messages[entry[1]] += 1
print("Characters typed by user, highest first:")
print(sorted(dict(user_characters).items(), key=lambda x: -x[1]))
print("Number of messages by user, highest first:")
print(sorted(dict(user_messages).items(), key=lambda x: -x[1]))
print("Average characters per message by user, highest first:")
user_average_chars = {}
for k, v in user_characters.items(): user_average_chars[k] = v / user_messages[k]
print(sorted(user_average_chars.items(), key=lambda x: -x[1]))
# count expletive usage per user (substring matches, so e.g. "bullshit" counts)
expletives = ("shit", "piss", "fuck", "cunt", "cocksucker", "motherfucker", "tits")
swears_by_user = defaultdict(int)
for entry in data:
    for word in entry[2].split():
        if any(swear in word for swear in expletives):
            swears_by_user[entry[1]] += 1
frequencies = sorted([(k, v / user_messages[k]) for k, v in swears_by_user.items()], key=lambda x: -x[1])
result = "\n".join([k + ":\t" + str(v) for k, v in frequencies])
print("Expletives per message:\n" + result)
# get word frequencies (case-insensitive) across all messages
words = defaultdict(int)
for entry in data:
    for word in entry[2].split():
        words[word.lower()] += 1
frequencies = sorted(words.items(), key=lambda x: -x[1])
result = "\n".join([k + ":\t" + str(v) for k, v in frequencies])
with open("word_freqs.txt", "wb") as f:
    f.write(result.encode("UTF-8"))
# preliminary funniness detection: a message like "9/10" suggests the preceding
# messages were being rated, so collect each rating plus the 6 messages before it
funny = re.compile(r"\d+/10")
result = ""
for i, entry in enumerate(data):
    if funny.match(entry[2]):
        for j in range(max(i - 6, 0), i + 1):  # clamp to 0 so early hits don't wrap to the end of the list
            result += data[j][1] + ":\t" + data[j][2] + "\n"
        result += "\n"
#print(result.encode(sys.stdout.encoding, errors="replace").decode(sys.stdout.encoding))
# count smilies per user
pattern = re.compile(r":\)|:D|:\(|:/|:O|:\$")
smilies_by_user = defaultdict(int)
for entry in data:
    for word in entry[2].split():
        if pattern.match(word):
            smilies_by_user[entry[1]] += 1
frequencies = sorted(smilies_by_user.items(), key=lambda x: -x[1])
result = "\n".join([k + ":\t" + str(v) for k, v in frequencies])
print("Smilies per user:\n" + result)