Skip to content

Instantly share code, notes, and snippets.

@adrianhorning08
Last active December 24, 2024 17:34
Show Gist options
  • Save adrianhorning08/42d4e4c0826c2a11a084cd1513b130b0 to your computer and use it in GitHub Desktop.
Save adrianhorning08/42d4e4c0826c2a11a084cd1513b130b0 to your computer and use it in GitHub Desktop.
Scrape Facebook Group
// Shared accumulator for every scraped row (posts and comments alike).
// Filled by the XHR load handler installed in interceptRequests() and
// written out by run() via createCSV().
const allContent = []
/**
 * Serialises `data` as CSV with a fixed column order and triggers a browser
 * download of the result.
 *
 * @param {Object[]} data - Row objects; only the known column keys are read.
 * @param {string} [fileName] - Name for the download. The anchor-based path
 *   falls back to 'data.csv' when this is falsy.
 */
function createCSV(data, fileName) {
  const columns = [
    'id',
    'email',
    'firstName',
    'lastName',
    'postId',
    'postText',
    'postAuthor',
    'postAuthorId',
    'postAuthorUrl',
    'commentId',
    'commentText',
    'commentAuthorName',
    'commentAuthorId',
    'commentAuthorUrl',
    'timestamp',
    'commentUrl',
  ]

  // null becomes the literal text null; every string is double-quoted with
  // inner quotes doubled; anything else is left for Array#join to stringify
  // (which renders undefined as an empty cell).
  const toCell = (cell) => {
    if (cell === null) return 'null'
    if (typeof cell !== 'string') return cell
    return `"${cell.replace(/"/g, '""')}"`
  }
  const toLine = (record) =>
    columns.map((column) => toCell(record[column])).join(',')

  const csvText = [columns.join(',')]
    .concat(data.map((record) => toLine(record)))
    .join('\n')

  const blob = new Blob([csvText], { type: 'text/csv;charset=utf-8;' })
  const anchor = document.createElement('a')

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, fileName)
    return
  }

  // Everything else: click a temporary <a download> that points at the blob.
  const objectUrl = URL.createObjectURL(blob)
  anchor.setAttribute('href', objectUrl)
  anchor.setAttribute('download', fileName || 'data.csv')
  document.body.appendChild(anchor)
  anchor.click()
  document.body.removeChild(anchor)
  URL.revokeObjectURL(objectUrl)
}
/**
 * Scrolls the window down by one fixed step, then pauses before returning.
 *
 * Sequence: wait 400 ms, call window.scrollBy(0, 800), wait a further
 * 1000 ms. The previous version expressed the first wait as a setInterval
 * that was cleared on its first tick and carried several unused locals; the
 * timing and the scroll call are unchanged.
 *
 * @returns {Promise<void>} Resolves after the scroll and the trailing pause.
 */
async function scrollDown() {
  const scrollStepPx = 800
  const delayBeforeScrollMs = 400
  const pauseAfterScrollMs = 1000

  await new Promise((resolve) => setTimeout(resolve, delayBeforeScrollMs))
  window.scrollBy(0, scrollStepPx)
  await new Promise((resolve) => setTimeout(resolve, pauseAfterScrollMs))
}
/**
 * Returns the first email-looking substring found in `text`.
 *
 * @param {string|null|undefined} text - Text to scan; null/undefined is allowed.
 * @returns {string} The first match, or '' when there is none.
 */
function getEmailFromText(text) {
  const EMAIL_PATTERN = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
  const matches = text?.match(EMAIL_PATTERN)
  if (!matches) {
    return ''
  }
  return matches[0] || ''
}
/**
 * Clicks the comment-opening control inside a post, if one can be found.
 *
 * Every <div> under `post` whose data-visualcompletion attribute equals
 * "ignore-dynamic" is treated as a candidate. From each candidate a fixed
 * chain of child indexes is followed; when the chain resolves to a node,
 * that node is clicked.
 *
 * @param {Element} post - Root element of a single post.
 */
function clickOnComments(post) {
  // Child indexes walked, in order, from a candidate div to the click target.
  const CHILD_PATH = [0, 0, 0, 0, 0, 1, 1, 0, 0]

  const divs = post.getElementsByTagName('div')
  for (let idx = 0; idx < divs.length; idx += 1) {
    const candidate = divs[idx]
    if (candidate.getAttribute('data-visualcompletion') !== 'ignore-dynamic') {
      continue
    }
    // Any missing link in the chain yields undefined instead of throwing.
    const opener = CHILD_PATH.reduce(
      (node, childIndex) => node?.children?.[childIndex],
      candidate,
    )
    if (opener) {
      opener.click()
    }
  }
}
/**
 * Collects the trimmed text of every non-blank text node beneath `element`,
 * in document order.
 *
 * Nodes that have children are only descended into; a childless node
 * contributes its trimmed nodeValue when it is a text node with
 * non-whitespace content, and nothing otherwise.
 *
 * @param {Node} element - Node to walk.
 * @returns {string[]} Trimmed text fragments.
 */
function traverseElementsToGetText(element) {
  const children = element.childNodes
  if (children.length > 0) {
    return Array.from(children).flatMap((child) =>
      traverseElementsToGetText(child),
    )
  }

  const isTextNode = element.nodeType === Node.TEXT_NODE
  if (isTextNode && element.nodeValue.trim() !== '') {
    return [element.nodeValue.trim()]
  }
  return []
}
/**
 * Returns the direct children of the feed container that look like posts.
 *
 * A child counts as a post when it contains an <h2> with non-empty text.
 *
 * @returns {Element[]} Matching feed children, in DOM order.
 */
function getAllPosts() {
  const feedChildren = document.querySelectorAll('div[role=feed] > div')
  const hasHeadingText = (candidate) =>
    Boolean(candidate?.querySelector('h2')?.textContent)
  return [...feedChildren].filter((candidate) => hasHeadingText(candidate))
}
// Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed.
const sleep = (ms) =>
  new Promise((wake) => {
    setTimeout(() => wake(), ms)
  })
/**
 * Clicks the first element matching div[aria-label="Close"], if the page has one.
 * Does nothing when no such element is present.
 */
function closeDialog() {
  const dismissButton = document?.querySelector('div[aria-label="Close"]')
  if (dismissButton) {
    dismissButton.click()
  }
}
/**
 * Converts the top-level comment entries that accompany a post payload into
 * flat row objects.
 *
 * Every property access is optional-chained so that an entry with a missing
 * `comment`, `body` or `author` yields a row with empty/undefined fields
 * instead of throwing. A null list is treated like an omitted one.
 *
 * @param {string} postId - Id of the post the comments belong to.
 * @param {Array<Object>|null} [topLevelComments=[]] - Entries of the form
 *   `{ comment: { id, body: { text }, author: { id, name } } }`.
 * @returns {Object[]} One row per entry, in input order.
 */
function formatTopLevelComments(postId, topLevelComments = []) {
  // The default parameter only covers `undefined`; also tolerate explicit null.
  return (topLevelComments ?? []).map((entry) => {
    const comment = entry?.comment
    const text = comment?.body?.text
    const commentId = comment?.id
    const authorName = comment?.author?.name
    const authorId = comment?.author?.id
    return {
      id: commentId,
      commentId,
      postId,
      commentText: text || '',
      commentAuthorName: authorName,
      commentAuthorId: authorId,
      email: getEmailFromText(text),
      // Split on single spaces: first word / second word of the display name.
      firstName: authorName?.split(' ')?.[0],
      lastName: authorName?.split(' ')?.[1],
    }
  })
}
/**
 * Shared extraction logic for the three response-chunk parsers below.
 *
 * Reads the author, message text, post id and accompanying top-level
 * comments from one story node and returns them as row objects. All lookups
 * are optional-chained, so a node lacking any of these parts produces
 * undefined/empty fields rather than an exception.
 *
 * @param {Object|undefined} storyNode - Object carrying `comet_sections`.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseStoryNode(storyNode) {
  const sections = storyNode?.comet_sections
  const contentSections = sections?.content?.story?.comet_sections
  const feedbackStory = sections?.feedback?.story

  const actor =
    contentSections?.context_layout?.story?.comet_sections?.actor_photo?.story
      ?.actors?.[0]
  const authorName = actor?.name
  const postText = contentSections?.message_container?.story?.message?.text
  const postId = feedbackStory?.post_id

  const post = {
    id: postId,
    postId,
    postText: postText || '',
    postAuthor: authorName,
    postAuthorId: actor?.id,
    postAuthorUrl: actor?.url,
    email: getEmailFromText(postText),
    // Split on single spaces: first word / second word of the display name.
    firstName: authorName?.split(' ')?.[0],
    lastName: authorName?.split(' ')?.[1],
  }
  const topLevelComments = formatTopLevelComments(
    postId,
    feedbackStory?.feedback_context?.interesting_top_level_comments,
  )
  return {
    post,
    topLevelComments,
  }
}

/**
 * Parses the first chunk of a feed pagination response, where the story
 * node is the first edge of `data.node.group_feed`.
 *
 * @param {Object} json - Parsed first chunk.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseFirstLevelJson(json) {
  return parseStoryNode(json?.data?.node?.group_feed?.edges?.[0]?.node)
}

/**
 * Parses the second chunk of a feed pagination response, where the story
 * node is `data.node` itself.
 *
 * @param {Object} json - Parsed second chunk.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseSecondLevelJson(json) {
  return parseStoryNode(json?.data?.node)
}

/**
 * Parses the third chunk of a feed pagination response; it is read exactly
 * like the second chunk (story node at `data.node`).
 *
 * @param {Object} json - Parsed third chunk.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseThirdLevelJson(json) {
  return parseStoryNode(json?.data?.node)
}
/**
 * Appends comment rows to the shared `allContent` array, skipping any row
 * whose commentId equals the commentId of an entry already stored there.
 *
 * @param {Object[]} [comments=[]] - Comment rows to add.
 */
function addCommentsToAllContent(comments = []) {
  comments.forEach((incoming) => {
    const alreadyStored = allContent?.find(
      (stored) => stored.commentId === incoming.commentId,
    )
    if (alreadyStored) {
      return
    }
    allContent.push(incoming)
  })
}
/**
 * Patches XMLHttpRequest.prototype.open so that responses to GraphQL
 * requests can be inspected and scraped into `allContent`.
 *
 * For every request whose URL contains "graphql" the patch:
 *   - wraps `send` on that instance to remember the request body, and
 *   - adds a `load` listener that, depending on which query name appears in
 *     the body, parses posts (feed pagination) or comments (focused story).
 * All other requests are passed straight through to the original `open`.
 *
 * Robustness notes:
 *   - `open()` may receive a URL object, and `send()` a non-string body;
 *     both are stringified before calling `.includes`.
 *   - The feed response is read as up to three newline-separated JSON
 *     chunks. Each chunk is parsed on its own, so a missing or malformed
 *     chunk is skipped without losing the others.
 */
function interceptRequests() {
  const originalOpen = window.XMLHttpRequest.prototype.open
  window.XMLHttpRequest.prototype.open = function (method, url, async) {
    if (!String(url).includes('graphql')) {
      return originalOpen.apply(this, arguments)
    }

    // Wrap send on this instance only, to capture the request body.
    let requestBody = null
    const originalSend = this.send
    this.send = function (data) {
      requestBody = data
      return originalSend.apply(this, arguments)
    }

    // Listen for the 'load' event to capture the response
    this.addEventListener('load', function () {
      const bodyText =
        typeof requestBody === 'string'
          ? requestBody
          : String(requestBody ?? '')

      if (bodyText.includes('GroupsCometFeedRegularStoriesPaginationQuery')) {
        console.log('getting posts')
        // we're getting posts....
        const lines = this.responseText.split('\n')
        // Chunk N of the response is handled by parser N.
        const chunkParsers = [
          ['firstPost', parseFirstLevelJson],
          ['secondPost', parseSecondLevelJson],
          ['thirdPost', parseThirdLevelJson],
        ]
        chunkParsers.forEach(([label, parseChunk], index) => {
          const line = lines[index]
          if (!line) {
            return
          }
          let chunk
          try {
            chunk = JSON.parse(line)
          } catch (err) {
            console.warn(`skipping unparseable ${label} chunk`, err)
            return
          }
          const parsed = parseChunk(chunk)
          console.log(label, parsed)
          allContent.push(parsed.post)
          addCommentsToAllContent(parsed.topLevelComments)
        })
      } else if (bodyText.includes('CometFocusedStoryViewUFIQuery')) {
        console.log('getting comments')
        // we're getting comments
        let data = null
        try {
          data = JSON.parse(this.responseText)
        } catch (err) {
          // Best effort: continue with no data so nothing is added below.
          console.warn('could not parse comments response', err)
        }
        const postId = data?.data?.story_card?.post_id
        const edges =
          data?.data?.feedback?.ufi_renderer?.feedback?.comment_list_renderer
            ?.feedback?.comment_rendering_instance_for_feed_location?.comments
            ?.edges
        const comments = edges?.map((edge) => {
          const comment = edge?.node
          const commentId = comment?.id
          const commentText = comment?.body?.text
          const authorName = comment?.author?.name
          // The timestamp action link carries both created_time and the URL.
          const timeStuff = comment?.comment_action_links?.find(
            (f) => f?.__typename === 'XFBCommentTimeStampActionLink',
          )?.comment
          return {
            id: commentId,
            commentId,
            postId,
            commentText,
            commentAuthorName: authorName,
            commentAuthorId: comment?.author?.id,
            commentAuthorUrl: comment?.author?.url,
            timestamp: timeStuff?.created_time,
            commentUrl: timeStuff?.url,
            email: getEmailFromText(commentText),
            firstName: authorName?.split(' ')?.[0],
            lastName: authorName?.split(' ')?.[1],
          }
        })
        addCommentsToAllContent(comments)
        console.log('comments', comments)
      }
    })

    // Call the original open method
    return originalOpen.apply(this, arguments)
  }
}
/**
 * Entry point: installs the request interceptor, walks the posts currently
 * in the feed opening (and then closing) each one's comments, scrolls for
 * more posts while the `scrolls` budget lasts, and finally exports
 * everything collected in `allContent` as a CSV download.
 *
 * @returns {Promise<void>}
 */
async function run() {
  interceptRequests()
  console.log('starting...')

  let posts = getAllPosts()
  console.log('posts.length', posts.length)

  // `posts` may be replaced mid-loop after a scroll; the index keeps counting
  // and the bound is re-read from the refreshed list on every pass.
  for (let index = 0; index < posts.length; index++) {
    console.log(
      `while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
    )
    clickOnComments(posts[index])
    await sleep(1000)
    closeDialog()

    if (!(scrolls > 0)) {
      continue
    }

    await scrollDown()
    scrolls--
    console.log('scrolls left', scrolls)
    console.log('old posts', posts.length)
    const refreshedPosts = getAllPosts()
    console.log('currentPosts', refreshedPosts.length)
    posts = refreshedPosts
  }

  createCSV(allContent, 'facebookGroupPostsAndComments.csv')
  console.log('allContent', allContent)
  console.log('done!')
  console.log(
    `Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: adrian@thewebscrapingguy.com`,
  )
}
// Scroll budget: run() scrolls once per processed post while this is above
// zero and decrements it after each scroll.
let scrolls = 50
// NOTE: Only gets the first level comments
// Start the scrape. Top-level await: the script is expected to be executed
// where that is permitted (presumably pasted into the browser devtools
// console on the group page — confirm for other environments).
await run()
@timstew
Copy link

timstew commented Oct 24, 2024

This sort of works, but doesn't capture any of the post text that i can see... I don't see any code to click the 'see more' to expand the content, however I'm not even getting the unexpanded content

@natereprogle
Copy link

natereprogle commented Nov 2, 2024

This sort of works, but doesn't capture any of the post text that i can see... I don't see any code to click the 'see more' to expand the content, however I'm not even getting the unexpanded content

I was able to fix it by changing line 139 from const posterName = post?.querySelector("h3")?.textContent to const posterName = post?.querySelector("h2")?.textContent || post?.querySelector("h3")?.textContent;

It appears Facebook groups now use h2 tags as the header. You'll also have to adjust the "parseFirstLevelJson" function, since the APIs now return data in a different format (at least, they do for me in a private group). Here are my changes to that function:

const actor =
    json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections
      ?.context_layout?.story?.comet_sections?.actor_photo?.story?.actors?.[0];

  const postText =
    json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections?.content
      ?.story?.comet_sections?.message_container?.story?.message?.text;

  const postId =
    json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections?.feedback
      ?.story?.id;

@adrianhorning08
Copy link
Author

@natereprogle Thanks! Just updated the gist!

@jaycruz17
Copy link

jaycruz17 commented Nov 26, 2024

@adrianhorning08 your code is working, but I need to tweak it a little bit to get specific columns. Is there any way to get the comments to the nth level?

@adrianhorning08
Copy link
Author

@jaycruz17 not sure

@RevazRevazashvili
Copy link

Hi, thank you for the code. I am wondering whether it scrapes replies to the comments or not?

@jaycruz17
Copy link

@RevazRevazashvili It only scrapes first-level comments. It cannot scrape sub-level comments.

@RevazRevazashvili
Copy link

@RevazRevazashvili It only scrapes first-level comments. It cannot scrape sub-level comments.

Is it not possible to add that feature?

@jaycruz17
Copy link

@RevazRevazashvili I was able to improve this code to get the comments, including the sub-level comments.

@RevazRevazashvili
Copy link

@RevazRevazashvili I was able to improve this code to get the comments, including the sub-level comments.

Could you send me the code? Also, does it have any limitations? I mean, if I run it on a large group, will Facebook block me?

@timstew
Copy link

timstew commented Dec 18, 2024 via email

@bolognaa
Copy link

it's partly working, is there a chance for a fix? :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment