-
-
Save dylanmckay/2b191a10068bd87d0fffba242db44b52 to your computer and use it in GitHub Desktop.
#! /usr/bin/env ruby
# NOTE: Requires Ruby 2.1 or greater.
# This script can be used to parse and dump the information from
# the 'html/contact_info.htm' file in a Facebook user data ZIP download.
#
# It prints all cell phone call + SMS message + MMS records, plus a summary of each.
#
# It also dumps all of the records into CSV files inside a 'csv' folder, which is created
# in whatever the working directory of the program is when executed.
#
# Place this script inside the extracted Facebook data download folder
# alongside the 'html' folder.
#
# This script requires Ruby and the Nokogiri library to be installed.
#
# Open source licensing
# ---------------------
#
# Dual-licensed under the MIT and Apache 2.0 open source licenses. Either license can be chosen
# by any user of the program.
#
# The MIT license is duplicated here, the Apache 2.0 license can be found here
# https://opensource.org/licenses/Apache-2.0
#
# The MIT License (MIT)
# Copyright (c) 2018 Dylan McKay
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
require 'fileutils'
require 'time'

require 'nokogiri'
# Prints a horizontal rule (a line of 24 dashes) to standard output.
def hr
  $stdout.puts "-" * 24
end
# Writes the indentation prefix for the given nesting level to standard
# output, without a trailing newline.
#
# level - 1-based nesting level; level 1 prints nothing, and each further
#         level adds one space. Levels below 1 are treated as level 1
#         instead of raising on a negative repeat count.
#
# Returns the result of $stdout.flush (the stream itself, which is truthy),
# so callers in this file can chain `indent(level) and <print something>`.
def indent(level = 1)
  $stdout.print " " * [level - 1, 0].max
  $stdout.flush
end
# Prints a titled section: a rule, the title, a blank line, whatever the
# given block prints, then a closing rule and a blank line. Every line
# printed by this method itself is prefixed with the indentation for +level+.
#
# title - text printed as the section heading.
# level - nesting level handed to indent (defaults to 1).
#
# Yields once, with no arguments, between the heading and the closing rule.
def section(title, level: 1)
  indent(level)
  hr
  indent(level)
  $stdout.puts title
  indent(level)
  $stdout.puts

  yield

  indent(level)
  hr
  indent(level)
  $stdout.puts
end
# Converts one table cell's text into a Time when it looks like a timestamp.
# Fields without ' at ', or that fail to parse, are returned unchanged.
def parse_metadata_field(field)
  return field unless field.include?(' at ') # only date/time fields contain ' at '

  # Time example: "Wednesday, 14 June 2017 at 19:02 UTC+12"
  Time.strptime(field, "%A, %e %B %Y at %R UTC%z")
rescue ArgumentError # not a timestamp after all; keep the raw text
  field
end

# Extracts metadata from a call/text/sms/mms table.
#
# metadata_table - an object responding to #css (in this script, the nested
#                  table node of a contact); its first 'tr' supplies the 'th'
#                  headings and the remaining 'tr' rows supply 'td' fields.
#
# Returns nil if there is no metadata in this table (nil table, no rows, or
# too few record rows).
# Otherwise returns a 2D list of rows/columns: the headings row first, then
# one row per record, with timestamp-looking fields converted to Time.
def extract_table_metadata(metadata_table)
  return nil if metadata_table.nil?

  rows = metadata_table.css('tr')
  return nil if rows.empty?

  headings = rows.first.css('th').map(&:text).map(&:chomp)
  records = rows[1..-1]
  # Many tables are empty (excluding headings).
  # NOTE(review): `records` already excludes the headings row, so this also
  # discards tables holding exactly one record - confirm that is intended.
  return nil if records.size <= 1

  [headings] + records.map do |call_record|
    call_record.css('td').map(&:text).map(&:chomp).map { |field| parse_metadata_field(field) }
  end
end
# Collects the metadata tables found under one history container.
#
# container - the node that holds one kind of history (calls, SMS, MMS), or
#             nil. If a specific type of metadata is missing, the container
#             div will simply not be present, so nil is expected.
#
# Returns a list of tables as produced by extract_table_metadata, keeping
# only tables that have at least one row besides the headings. Returns []
# when there is no container.
def dig_out_metadata(container:)
  return [] if container.nil?

  contact_tables = container.children.select { |child| child.name == "table" }

  extracted = contact_tables.map do |contact_table|
    metadata_table = contact_table.css('table')[0]
    # A contact table without a nested table has nothing to extract; skip it
    # rather than handing nil to the extractor.
    extract_table_metadata(metadata_table) unless metadata_table.nil?
  end

  extracted.compact.select { |table| table.size > 1 } # must include non-header rows
end
# Prints every table of one metadata kind inside a titled section.
#
# metadata       - list of tables; each table is a list of rows and each row
#                  a list of fields (the shape dig_out_metadata returns).
# metadata_title - heading for the section.
#
# Each table is introduced by an "Another phone number" line, followed by
# one comma-separated line per row.
def print_metadata(metadata, metadata_title:)
  section(metadata_title) do
    metadata.each do |phone_records|
      puts
      indent(2)
      puts "Another phone number"
      puts

      phone_records.each do |record|
        indent(2)
        puts record.join(", ")
      end
    end
  end
end
# Prints the date range covered by the records of one metadata kind.
#
# metadata      - list of tables; each table is a headings row followed by
#                 record rows whose second column (index 1) is read as the
#                 timestamp.
# metadata_name - singular noun used in the printed sentence.
#
# Every record of every table is considered (heading rows are skipped).
# Values that are already Time objects are used as-is; other values are
# parsed with Time.parse, and blank or unparseable ones are ignored so they
# cannot break the min/max comparison. Prints nothing when no timestamp is
# available.
def print_timestamps(metadata, metadata_name:)
  records = metadata.flat_map { |table| table.drop(1) } # drop each table's headings row

  timestamps = records.map do |record|
    value = record[1]
    next value if value.is_a?(Time)

    text = value.to_s.strip
    next nil if text.empty?

    begin
      Time.parse(text)
    rescue ArgumentError # do not use the timestamp if unparseable
      nil
    end
  end.compact

  return if timestamps.empty?

  puts "The oldest #{metadata_name} is from #{timestamps.min.to_date}, the most recent at #{timestamps.max.to_date}"
end
# Prints how many records there are per status for one metadata kind.
#
# metadata      - list of tables; each table is a headings row followed by
#                 record rows whose first column is used as the status.
# metadata_name - plural noun appended to every count.
#
# Heading rows are excluded so that the heading text is not reported as if
# it were a status. Prints nothing when there are no records.
def print_status_breakdown(metadata, metadata_name:)
  records = metadata.flat_map { |table| table.drop(1) } # drop each table's headings row
  grouped_statuses = records.group_by(&:first)
  return if grouped_statuses.empty?

  breakdown = grouped_statuses.map do |status, status_records|
    # to_s guards against a row with no fields, whose status is nil
    "#{status_records.size} #{status.to_s.downcase} #{metadata_name}"
  end

  puts "This includes " + breakdown.join(", ")
end
# Renders one field for CSV output. Fields containing a comma, a double
# quote or a line break are wrapped in double quotes, with embedded double
# quotes doubled; all other fields are emitted unchanged.
def csv_escape_field(field)
  text = field.to_s
  return text unless text =~ /[",\r\n]/

  "\"#{text.gsub('"', '""')}\""
end

# Serializes metadata tables as CSV text.
#
# metadata - list of tables; each table is a list of rows and each row a
#            list of fields.
#
# Returns a String with one line per row (all tables concatenated, heading
# rows included), fields separated by commas and lines separated by "\n",
# without a trailing newline.
def metadata_to_csv(metadata)
  metadata.flatten(1).map do |record|
    record.map { |field| csv_escape_field(field) }.join(',')
  end.join("\n")
end
# Writes the call, SMS and MMS metadata found in the document to
# csv/call.csv, csv/sms.csv and csv/mms.csv, creating the "csv" directory
# (relative to the current working directory) if needed.
#
# html_doc - parsed document responding to #xpath.
def dump_metadata_csv(html_doc)
  # Looks up the container following the given <h2> heading, extracts its
  # metadata and writes it to the named file inside the csv directory.
  write_history = lambda do |file_name, heading|
    container = html_doc.xpath("//h2[text()='#{heading}']/following-sibling::div")[0]
    metadata = dig_out_metadata(:container => container)
    File.write(File.join("csv", file_name), metadata_to_csv(metadata))
  end

  FileUtils.mkdir_p("csv")

  write_history.call("call.csv", "Call History")
  write_history.call("sms.csv", "SMS History")
  write_history.call("mms.csv", "MMS History")
end
# Prints a human-readable report of the document's call, SMS and MMS
# metadata: the full records of each kind, the list of phone numbers, and a
# short summary (record counts, date range and status breakdown per kind).
#
# html_doc - parsed document responding to #xpath.
def print_metadata_human(html_doc)
  kinds = [
    { :title => "Call History", :singular => "cell phone call", :plural => "cell phone calls" },
    { :title => "SMS History",  :singular => "SMS message",     :plural => "SMS messages" },
    { :title => "MMS History",  :singular => "MMS message",     :plural => "MMS messages" }
  ]

  # Locate each kind's container (nil when that history is absent) and
  # extract its tables once, for use by every part of the report.
  histories = kinds.map do |kind|
    container = html_doc.xpath("//h2[text()='#{kind[:title]}']/following-sibling::div")[0]
    kind.merge(:container => container, :metadata => dig_out_metadata(:container => container))
  end

  call_history_container = histories.first[:container]
  phone_numbers =
    if call_history_container
      # NOTE(review): a path starting with '//' is evaluated from the document
      # root, so this likely matches 'Number:' labels outside the call history
      # container as well - confirm whether './/b' was intended.
      call_history_container.xpath("//b[text()='Number:']/following-sibling::text()")
                            .map(&:text).sort.uniq
    else
      []
    end

  histories.each do |history|
    print_metadata(history[:metadata], :metadata_title => history[:title])
  end

  section("The full list of phone numbers that have stored data") do
    phone_numbers.each_slice(8) do |group|
      indent(2)
      $stdout.puts group.join(", ")
    end
  end

  $stdout.puts "A brief summary of phone records"
  hr
  $stdout.puts "There are phone records for #{phone_numbers.size} distinct phone numbers"

  histories.each do |history|
    metadata = history[:metadata]
    # Every table starts with a headings row, which is not a record.
    record_count = metadata.inject(0) { |total, table| total + table.size - 1 }

    $stdout.puts "There are records of #{record_count} distinct #{history[:plural]}"
    indent(2)
    print_timestamps(metadata, :metadata_name => history[:singular])
    indent(2)
    print_status_breakdown(metadata, :metadata_name => history[:plural])
  end

  hr
end
# Script entry point: parse the contact info page from the Facebook data
# download, print the human-readable report, then write the CSV files.
contact_info_path = 'html/contact_info.htm'

# Fail with a readable message instead of a raw ENOENT backtrace when the
# script is run from the wrong directory or the download has another layout.
unless File.file?(contact_info_path)
  abort "Could not find '#{contact_info_path}' in #{Dir.pwd}; " \
        "run this script from inside the extracted Facebook data download folder."
end

html_doc = Nokogiri::HTML(File.read(contact_info_path))

print_metadata_human(html_doc)

$stdout.puts
hr

# Announce the dump only after it has actually been written.
dump_metadata_csv(html_doc)
$stdout.puts "dumped metadata to CSV files at #{Dir.pwd}/csv"
I downloaded a new copy of my FB data today (I'm in Europe), and the structure of the files and folders has completely changed since the last time I did it (and obviously the script isn't working, since the file the script tries to parse does not exist).
Am I alone in this situation?
The place to download one's fb data has changed. It seems to change often. Three ways that currently work:
At the moment, you can save steps and time: A & B: go straight to step 3.1. C:
- Start at almost any facebook page; click on the "?" in the top right.
- Click "Privacy Shortcuts" (third from the bottom), which opens https://www.facebook.com/privacy/ . You are in a twisty maze of passages, all almost alike.
- Click "Access your information" - (which is first under the "Your Facebook Information" (fourth) section (which states: View or download your Facebook information at any time."; I had to laugh when I noticed later that the short URL https://www.facebook.com/dyi/ redirects there)) in order to ...
3.1. A: Load https://www.facebook.com/your_information/ (or, if that fails, B: load https://www.facebook.com/dyi/ and ...) - Click "download your information" - which is A) in the first box of text, near the bottom, OR B) in the second row. (This opens a unique URL like https://www.facebook.com/dyi/?x=Admungedmungedei ... which you can bookmark - once you have the URL with the value of x that works for your account.)
- There, one is given the CHOICE of HTML or JSON format, and many other choices; with defaults:
Date Range:All of my data
Format:HTML
Media Quality:Medium
Among the dozens of items (Your Information: 20, Information About You: 6) is this one (#23):
Calls and Messages
Logs of your calls and messages that you've chosen to share in your device settings !!!
Accessed while in California, today.
It would be interesting to learn what fb goes by when it decides what to store, and changes what it makes available (and/or how) - (e.g. current IP geolocation, stated country of residence, usual IP geolocation, etc.) Clearly dates are important, as the survey results show. Speaking of those results - see my following comment. (It's a separate subtopic, so I'm posting separately.)
And THANK YOU, Dylan.
Dylan, do you want to write / would you like/accept a crowdfunded X$ for a new script (and thanks for the old one)?
The google form (questionnaire) has been closed*; results: http://archive.is/yb7xS; I like the cute "This form was created inside of Dylan." tweak to it. Wait--super freaky. The archive is showing different data! The live page and archive both show: Q1: Is there any call, text, or MMS metadata? 1,716 responses, 76% no, and Q2 data, but the archive lacks all the other data, and FALSELY displays "No responses yet for this question". (Not surprisingly, the IA / archive.org won't save it, even manually, because it's programmed to consider itself a robot/respect robots.txt; It doesn't make an exception when run manually.)
Select the type(s) of metadata that were collected 865 responses
Cellular calls 277 (32%)
MMS messages 269 (31%)
None 194 (22%)
ALL 763 (88%)
can't leave blank 7 ( 1%) ...
*"The form Facebook data dump contact information is no longer accepting responses.
Try contacting the owner of the form if you think this is a mistake."
I later found a partial archive at https://web-beta.archive.org/web/20190215193157/https://docs.google.com/forms/d/1g_K4yurJqCvBSknQHMdkvX-0zEEA2vMs5ZKnScaQ1mg/viewanalytics
-It shows (for me, currently) the answers to Q3 and Q4 - When does the metadata [start|end] (if at all) - Most common starts: 7:1/1/09. 10:1/1/18. 7:3/26/18.(data entry error?) (rest: <5). Common ends:8 days in Mar 2018, 3/21-3/29.
(I then pointed WebCite at it, and obtained http://www.webcitation.org/76DCI2a2u which has 0 question answers.
I then tried again, and obtained http://archive.fo/FrWv4 which was like the archive.is result.)
After I requested a download, facebook asked me:
We'd like to hear from you!
We're asking a small group of people about their experience here. Could you help us out by letting us know your reasons for downloading your information today? You can choose any or all of the following:
- To understand what information is available to be downloaded
- To keep a record of what you've shared on facebook
- To transfer your information to another service or app
If none of these apply, let us know your reason here:
Thanks for helping us improve Facebook!
I just closed the window.
I've been a Facebook user since 2007, in the US, with the FB and Messenger app on both iOS and Android phones for much of that time. My data contains none of the phone records in the initial @dylanmckay example. I'm guessing it's a phone or messaging service I never opted into?
My total Facebook data download was about 230 MB uncompressed. In comparison, I just requested my Google data dump and it's 321 GB, of compressed data. I can't wait to learn what that one has in it.
Thanks for putting this script together, @dylanmckay.