# ilimugur/migrateFacebookNotesDataToCSV.py

## migrateFacebookNotesDataToCSV.py
import csv
from datetime import datetime
import json

def processInput(filePath):
    """Load a notes export file and return its list of notes.

    Args:
        filePath: Path to a JSON file whose top-level object has a
            'notes' key.

    Returns:
        The value stored under the 'notes' key of the parsed JSON.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the parsed JSON object has no 'notes' key.
    """
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # platform's default text encoding.
    with open(filePath, 'r', encoding='utf-8') as f:
        return json.load(f)['notes']

def getDiagnostics(notes):
    """Print summary statistics about the raw notes.

    Reports the number of notes, how empty/non-empty titles combine with
    empty/non-empty texts and with the presence of a cover photo, and the
    longest and shortest text lengths.

    Args:
        notes: Sequence of (sortKey, note) pairs. Each note is a dict with
            'title' and 'text' entries and, optionally, a 'cover_photo'
            entry; a cover photo counts as present only when that entry
            contains a 'uri' key.

    Returns:
        None. All results are printed to standard output.
    """
    print("Found " + str(len(notes)) + " notes.")

    # Tallies keyed by (hasTitle, hasText) and by (hasCoverPhoto, hasTitle).
    titleTextCounts = {}
    coverTitleCounts = {}
    maxTextLength = 0
    # None means "no non-empty text seen yet". A numeric sentinel would be
    # reported as if it were a real length and would cap longer texts.
    minTextLength = None

    for _, note in notes:
        hasTitle = len(note['title']) > 0
        textLength = len(note['text'])
        hasCoverPhoto = 'cover_photo' in note and 'uri' in note['cover_photo']

        titleTextKey = (hasTitle, textLength > 0)
        titleTextCounts[titleTextKey] = titleTextCounts.get(titleTextKey, 0) + 1
        coverTitleKey = (hasCoverPhoto, hasTitle)
        coverTitleCounts[coverTitleKey] = coverTitleCounts.get(coverTitleKey, 0) + 1

        maxTextLength = max(maxTextLength, textLength)
        # Empty texts are deliberately excluded from the "shortest" statistic.
        if textLength > 0 and (minTextLength is None or textLength < minTextLength):
            minTextLength = textLength

    if minTextLength is None:
        # No non-empty text was found at all.
        minTextLength = 0

    print("Found " + str(titleTextCounts.get((False, False), 0)) + " notes with empty title AND text.")
    print("Found " + str(titleTextCounts.get((False, True), 0)) + " notes with empty title BUT nonempty text.")
    print("Found " + str(titleTextCounts.get((True, False), 0)) + " notes with nonempty title BUT empty text.")
    print("Found " + str(titleTextCounts.get((True, True), 0)) + " notes with nonempty title AND nonempty text.")
    print("Longest text length is: " + str(maxTextLength))
    print("Shortest text length is: " + str(minTextLength))

    print("Found " + str(coverTitleCounts.get((False, False), 0)) + " notes with empty cover photo AND title.")
    print("Found " + str(coverTitleCounts.get((False, True), 0)) + " notes with empty cover photo BUT nonempty title.")
    print("Found " + str(coverTitleCounts.get((True, False), 0)) + " notes with nonempty cover photo BUT empty title.")
    print("Found " + str(coverTitleCounts.get((True, True), 0)) + " notes with nonempty cover photo AND nonempty title.")

def _repairMojibake(value):
    """Undo UTF-8-read-as-Latin-1 mojibake, leaving other text untouched."""
    try:
        return value.encode('latin1').decode('utf8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The string is not mojibake (it has characters outside Latin-1, or
        # its Latin-1 bytes are not valid UTF-8): keep it unchanged instead
        # of aborting the whole migration.
        return value


def parseInput(notesInput, authorUsername, postType, postStatus):
    """Convert raw notes into date-sorted CSV row dicts.

    Also prints diagnostics about the raw notes via getDiagnostics().

    Args:
        notesInput: Iterable of note dicts, each with 'created_timestamp',
            'title' and 'text' entries.
        authorUsername: Value stored in every row's 'post_author' field.
        postType: Value stored in every row's 'post_type' field.
        postStatus: Value stored in every row's 'post_status' field.

    Returns:
        A list of (datetime, row) tuples sorted by datetime, where row is a
        dict with the keys 'post_author', 'post_date', 'post_type',
        'post_status', 'post_title' and 'post_content'.
    """
    notes = []
    notesInputForDiagnostic = []
    for noteInput in notesInput:
        # WARNING: Timestamp provided by Facebook pages' timestamp data
        # does not seem to be in GMT/UTC. It may be a localized timestamp
        # based on the local timezone of the page/profile account.
        # This part needs updating if your local timezone differs from
        # the timezone of the system to which you'll feed this data.
        timestampDatetime = datetime.fromtimestamp(noteInput['created_timestamp'])

        noteCSVData = {
            'post_author': authorUsername,
            'post_date': timestampDatetime.strftime('%Y-%m-%d %H:%M:%S'),
            'post_type': postType,
            'post_status': postStatus,
            'post_title': _repairMojibake(noteInput['title']),
            'post_content': _repairMojibake(noteInput['text']),
        }
        notes.append((timestampDatetime, noteCSVData))
        notesInputForDiagnostic.append((timestampDatetime, noteInput))

    # Sort on the datetime only. Sorting the bare tuples falls back to
    # comparing the dicts when two notes share a timestamp, which raises
    # TypeError.
    def byDatetime(noteInfo):
        return noteInfo[0]

    notes.sort(key=byDatetime)
    notesInputForDiagnostic.sort(key=byDatetime)
    getDiagnostics(notesInputForDiagnostic)
    return notes

def migrateToCSV(notes, outputFilePath):
    """Write the notes to a CSV file, preceded by a header row.

    Every field is quoted except numeric values (csv.QUOTE_NONNUMERIC).

    Args:
        notes: Iterable of sequences whose element at index 1 is a row dict
            keyed by 'post_author', 'post_date', 'post_type',
            'post_status', 'post_title' and 'post_content'.
        outputFilePath: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If the output file cannot be opened for writing.
        ValueError: If a row dict contains a key that is not a known field.
    """
    fieldnames = ['post_author', 'post_date', 'post_type', 'post_status', 'post_title', 'post_content']
    # newline='' leaves line endings to the csv module, so embedded newlines
    # and row terminators are not translated by the text layer. UTF-8 keeps
    # non-ASCII text writable regardless of the platform default encoding.
    with open(outputFilePath, mode='w', newline='', encoding='utf-8') as outputFile:
        dictWriter = csv.DictWriter(outputFile,
                                    fieldnames=fieldnames,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_NONNUMERIC)
        dictWriter.writeheader()
        dictWriter.writerows(note[1] for note in notes)

# --- Settings: adjust these before running the script ---
# TODO: Change it to the path of your notes.json file
notesFilePath = './notes.json'
# TODO: Change it to the path of your desired output file location
outputFilePath = 'outputFacebook.csv'
# TODO: Change it to Wordpress username for the author of the posts
authorUsername = 'WPAuthorUsername'
# Type and status assigned to every generated post.
postType, postStatus = 'post', 'publish'

# --- Pipeline: read the notes, convert them to rows, write the CSV ---
notesInput = processInput(notesFilePath)
notes = parseInput(notesInput, authorUsername, postType, postStatus)
migrateToCSV(notes, outputFilePath)
	import csv
	from datetime import datetime
	import json

	def processInput(filePath):
	    """Load a notes export file and return its list of notes.

	    Args:
	        filePath: Path to a JSON file whose top-level object has a
	            'notes' key.

	    Returns:
	        The value stored under the 'notes' key of the parsed JSON.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	        KeyError: If the parsed JSON object has no 'notes' key.
	    """
	    # Decode explicitly as UTF-8 so that parsing does not depend on the
	    # platform's default text encoding.
	    with open(filePath, 'r', encoding='utf-8') as f:
	        return json.load(f)['notes']

	def getDiagnostics(notes):
	    """Print summary statistics about the raw notes.

	    Reports the number of notes, how empty/non-empty titles combine with
	    empty/non-empty texts and with the presence of a cover photo, and the
	    longest and shortest text lengths.

	    Args:
	        notes: Sequence of (sortKey, note) pairs. Each note is a dict with
	            'title' and 'text' entries and, optionally, a 'cover_photo'
	            entry; a cover photo counts as present only when that entry
	            contains a 'uri' key.

	    Returns:
	        None. All results are printed to standard output.
	    """
	    print("Found " + str(len(notes)) + " notes.")

	    # Tallies keyed by (hasTitle, hasText) and by (hasCoverPhoto, hasTitle).
	    titleTextCounts = {}
	    coverTitleCounts = {}
	    maxTextLength = 0
	    # None means "no non-empty text seen yet". A numeric sentinel would be
	    # reported as if it were a real length and would cap longer texts.
	    minTextLength = None

	    for _, note in notes:
	        hasTitle = len(note['title']) > 0
	        textLength = len(note['text'])
	        hasCoverPhoto = 'cover_photo' in note and 'uri' in note['cover_photo']

	        titleTextKey = (hasTitle, textLength > 0)
	        titleTextCounts[titleTextKey] = titleTextCounts.get(titleTextKey, 0) + 1
	        coverTitleKey = (hasCoverPhoto, hasTitle)
	        coverTitleCounts[coverTitleKey] = coverTitleCounts.get(coverTitleKey, 0) + 1

	        maxTextLength = max(maxTextLength, textLength)
	        # Empty texts are deliberately excluded from the "shortest" statistic.
	        if textLength > 0 and (minTextLength is None or textLength < minTextLength):
	            minTextLength = textLength

	    if minTextLength is None:
	        # No non-empty text was found at all.
	        minTextLength = 0

	    print("Found " + str(titleTextCounts.get((False, False), 0)) + " notes with empty title AND text.")
	    print("Found " + str(titleTextCounts.get((False, True), 0)) + " notes with empty title BUT nonempty text.")
	    print("Found " + str(titleTextCounts.get((True, False), 0)) + " notes with nonempty title BUT empty text.")
	    print("Found " + str(titleTextCounts.get((True, True), 0)) + " notes with nonempty title AND nonempty text.")
	    print("Longest text length is: " + str(maxTextLength))
	    print("Shortest text length is: " + str(minTextLength))

	    print("Found " + str(coverTitleCounts.get((False, False), 0)) + " notes with empty cover photo AND title.")
	    print("Found " + str(coverTitleCounts.get((False, True), 0)) + " notes with empty cover photo BUT nonempty title.")
	    print("Found " + str(coverTitleCounts.get((True, False), 0)) + " notes with nonempty cover photo BUT empty title.")
	    print("Found " + str(coverTitleCounts.get((True, True), 0)) + " notes with nonempty cover photo AND nonempty title.")

	def _repairMojibake(value):
	    """Undo UTF-8-read-as-Latin-1 mojibake, leaving other text untouched."""
	    try:
	        return value.encode('latin1').decode('utf8')
	    except (UnicodeEncodeError, UnicodeDecodeError):
	        # The string is not mojibake (it has characters outside Latin-1, or
	        # its Latin-1 bytes are not valid UTF-8): keep it unchanged instead
	        # of aborting the whole migration.
	        return value


	def parseInput(notesInput, authorUsername, postType, postStatus):
	    """Convert raw notes into date-sorted CSV row dicts.

	    Also prints diagnostics about the raw notes via getDiagnostics().

	    Args:
	        notesInput: Iterable of note dicts, each with 'created_timestamp',
	            'title' and 'text' entries.
	        authorUsername: Value stored in every row's 'post_author' field.
	        postType: Value stored in every row's 'post_type' field.
	        postStatus: Value stored in every row's 'post_status' field.

	    Returns:
	        A list of (datetime, row) tuples sorted by datetime, where row is a
	        dict with the keys 'post_author', 'post_date', 'post_type',
	        'post_status', 'post_title' and 'post_content'.
	    """
	    notes = []
	    notesInputForDiagnostic = []
	    for noteInput in notesInput:
	        # WARNING: Timestamp provided by Facebook pages' timestamp data
	        # does not seem to be in GMT/UTC. It may be a localized timestamp
	        # based on the local timezone of the page/profile account.
	        # This part needs updating if your local timezone differs from
	        # the timezone of the system to which you'll feed this data.
	        timestampDatetime = datetime.fromtimestamp(noteInput['created_timestamp'])

	        noteCSVData = {
	            'post_author': authorUsername,
	            'post_date': timestampDatetime.strftime('%Y-%m-%d %H:%M:%S'),
	            'post_type': postType,
	            'post_status': postStatus,
	            'post_title': _repairMojibake(noteInput['title']),
	            'post_content': _repairMojibake(noteInput['text']),
	        }
	        notes.append((timestampDatetime, noteCSVData))
	        notesInputForDiagnostic.append((timestampDatetime, noteInput))

	    # Sort on the datetime only. Sorting the bare tuples falls back to
	    # comparing the dicts when two notes share a timestamp, which raises
	    # TypeError.
	    def byDatetime(noteInfo):
	        return noteInfo[0]

	    notes.sort(key=byDatetime)
	    notesInputForDiagnostic.sort(key=byDatetime)
	    getDiagnostics(notesInputForDiagnostic)
	    return notes

	def migrateToCSV(notes, outputFilePath):
	    """Write the notes to a CSV file, preceded by a header row.

	    Every field is quoted except numeric values (csv.QUOTE_NONNUMERIC).

	    Args:
	        notes: Iterable of sequences whose element at index 1 is a row dict
	            keyed by 'post_author', 'post_date', 'post_type',
	            'post_status', 'post_title' and 'post_content'.
	        outputFilePath: Path of the CSV file to create or overwrite.

	    Raises:
	        OSError: If the output file cannot be opened for writing.
	        ValueError: If a row dict contains a key that is not a known field.
	    """
	    fieldnames = ['post_author', 'post_date', 'post_type', 'post_status', 'post_title', 'post_content']
	    # newline='' leaves line endings to the csv module, so embedded newlines
	    # and row terminators are not translated by the text layer. UTF-8 keeps
	    # non-ASCII text writable regardless of the platform default encoding.
	    with open(outputFilePath, mode='w', newline='', encoding='utf-8') as outputFile:
	        dictWriter = csv.DictWriter(outputFile,
	                                    fieldnames=fieldnames,
	                                    delimiter=',',
	                                    quotechar='"',
	                                    quoting=csv.QUOTE_NONNUMERIC)
	        dictWriter.writeheader()
	        dictWriter.writerows(note[1] for note in notes)

	# --- Settings: adjust these before running the script ---
	# TODO: Change it to the path of your notes.json file
	notesFilePath = './notes.json'
	# TODO: Change it to the path of your desired output file location
	outputFilePath = 'outputFacebook.csv'
	# TODO: Change it to Wordpress username for the author of the posts
	authorUsername = 'WPAuthorUsername'
	# Type and status assigned to every generated post.
	postType, postStatus = 'post', 'publish'

	# --- Pipeline: read the notes, convert them to rows, write the CSV ---
	notesInput = processInput(notesFilePath)
	notes = parseInput(notesInput, authorUsername, postType, postStatus)
	migrateToCSV(notes, outputFilePath)