Created
September 9, 2024 02:21
-
-
Save karpatic/90b261eabc26efb04d6c49dc4dc663f7 to your computer and use it in GitHub Desktop.
Facebook Conversation Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Side of the conversation the most recently processed message belongs to:
// "right" for rows headed "You sent", "left" for everything else.
let talker = 'left';
// Collected { talker, text } records, in the order they were scraped.
const messageData = [];
/**
 * Returns a promise that resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
// Select the third element matching div[role="grid"] (index 2, zero-based).
// NOTE(review): presumably this is the open conversation's message list; the
// index depends on the current page layout, so confirm it before running.
let grid = document.querySelectorAll('div[role="grid"]')[2];
if (!grid) {
  // Say so now rather than failing later with an opaque "cannot read properties of undefined".
  console.warn('Expected at least three div[role="grid"] elements; `grid` is undefined.');
}
/**
 * Reports whether a message with exactly this text has already been collected.
 * Only the text is compared, so the same text from either talker counts as a match.
 *
 * @param {string} text - Message text to look up.
 * @returns {boolean} True when some entry of `messageData` has the same `text`.
 */
const isMessageInList = (text) => messageData.some((message) => message.text === text);
/**
 * Visits every chat row currently matched inside `grid`, scrolls it into view,
 * and appends its text to `messageData` unless that text is empty or already
 * recorded. Rows whose header reads "You sent" are tagged `talker: "right"`;
 * all other rows are tagged "left". Waits one second after each row.
 *
 * @returns {Promise<Array<{talker: string, text: string}>>} Resolves with
 *   `messageData` once every row has been visited.
 * @throws {TypeError} If `grid` is not set.
 */
async function processMessages() {
  if (!grid) {
    throw new TypeError('Message grid not found: `grid` is undefined.');
  }

  // NOTE(review): these class selectors look auto-generated and may change
  // whenever the page is redeployed; verify them against the live DOM.
  const chats = grid.querySelectorAll('.x78zum5');

  for (const chat of chats) {
    const text = chat.querySelector('.x1gslohp')?.innerText ?? '';

    // Scroll the row into view before moving on to the next one.
    chat.scrollIntoView();

    // "You sent" in the row header marks an outgoing message (right side).
    const sentByYou = chat.querySelector('h4 span')?.innerText === 'You sent';
    talker = sentByYou ? 'right' : 'left';

    if (!text) {
      // Rows without message text are not duplicates; report them accurately.
      console.log({ text, reason: 'Empty message skipped' });
    } else if (isMessageInList(text)) {
      console.log({ text, reason: 'Duplicate message skipped' });
    } else {
      messageData.push({ talker, text });
    }

    // Pause for one second between rows.
    await delay(1000);
  }

  console.log(messageData);
  return messageData;
}
// Start the scrape; report any failure instead of leaving the rejection unhandled.
processMessages().catch((error) => {
  console.error('Facebook conversation scrape failed:', error);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import tensorflow as tf | |
from tensorflow.keras.preprocessing.text import Tokenizer | |
from tensorflow.keras.preprocessing.sequence import pad_sequences | |
# `data` is not defined in this snippet; it must already exist in the session.
# NOTE(review): presumably the list of {talker, text} records produced by the
# scraper -- confirm against whatever loads it.
df = pd.DataFrame(data)

# Fail early with a clear message if the columns used below are absent.
missing_columns = {'talker', 'text'} - set(df.columns)
if missing_columns:
    raise KeyError(f"data is missing required field(s): {sorted(missing_columns)}")

# Number of whitespace-separated words in each message.
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

# Total words written by each talker.
word_count_by_speaker = df.groupby('talker')['word_count'].sum()

# Number of messages sent by each talker.
chat_distribution = df['talker'].value_counts()
# Turn the message text into fixed-length integer sequences for the emotion model.
max_words = 10000  # Vocabulary cap passed to the tokenizer
max_len = 100  # Every sequence is padded or truncated to this many tokens

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['text'])

# One list of integer token ids per message.
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad/truncate so every row has length max_len (pad_sequences pads and
# truncates at the start of the sequence by default).
data_pad = pad_sequences(sequences, maxlen=max_len)
# Emotion classifier: embedding -> LSTM -> softmax over 8 classes.
# WARNING: the network built here is freshly initialised. Unless it is trained
# with emotion_model.fit(...) or real weights are loaded (see the commented
# line at the end of this block), its predictions are effectively random.
emotion_model = tf.keras.models.Sequential([
    # Declare the input shape with an Input layer; the Embedding `input_length`
    # argument is deprecated in recent Keras releases.
    tf.keras.Input(shape=(max_len,)),
    tf.keras.layers.Embedding(max_words, 128),
    tf.keras.layers.LSTM(64, return_sequences=False),
    tf.keras.layers.Dense(8, activation='softmax'),  # one output unit per emotion class
])

# categorical_crossentropy expects one-hot encoded labels if the model is trained.
emotion_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Placeholder: point this at real weights before trusting any prediction.
# emotion_model.load_weights("path_to_pretrained_model_weights.h5")
# Score every padded message; the result has one row per message and one
# column per output class of the model.
emotion_predictions = emotion_model.predict(data_pad)

# Class index -> emotion name. The order must match the model's output units.
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise', 'disgust', 'frustration']

# Guard against a model/label mismatch, which would otherwise surface as an
# IndexError or, worse, silently mislabelled rows.
if emotion_predictions.shape[1] != len(emotion_labels):
    raise ValueError(
        f"Model outputs {emotion_predictions.shape[1]} classes but "
        f"{len(emotion_labels)} emotion labels are defined"
    )

# Pick the highest-scoring class for each message in a single vectorised call.
predicted_indices = np.argmax(emotion_predictions, axis=1)
df['emotion'] = [emotion_labels[index] for index in predicted_indices]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# Count messages per (talker, emotion) pair: rows are talkers, columns are
# emotions, and combinations that never occur are filled with 0.
emotion_distribution_by_speaker = df.groupby(['talker', 'emotion']).size().unstack(fill_value=0)

# Draw on an explicit axes. Calling plt.figure() and then DataFrame.plot()
# without `ax` makes pandas open a second figure and leaves the first one blank.
fig, ax = plt.subplots(figsize=(12, 8))
emotion_distribution_by_speaker.plot(kind='bar', stacked=True, ax=ax)
ax.set_title("Emotion Distribution by Speaker")
ax.set_xlabel("Speaker")
ax.set_ylabel("Emotion Count")
# Anchor the legend outside the plotting area so it does not cover the bars.
ax.legend(title="Emotion", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()
# One bar chart per talker showing how often each emotion was predicted.
for speaker in df['talker'].unique():
    emotion_counts = df.loc[df['talker'] == speaker, 'emotion'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=emotion_counts.index, y=emotion_counts.values, ax=ax)
    ax.set_title(f"Emotion Distribution for {speaker}")
    ax.set_xlabel("Emotion")
    ax.set_ylabel("Count")
    # Tilt the emotion names so they do not overlap.
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment