-
-
Save adrianhorning08/42d4e4c0826c2a11a084cd1513b130b0 to your computer and use it in GitHub Desktop.
// Shared accumulator: every scraped post row and comment row is pushed here,
// and the whole array is handed to createCSV at the end of run().
const allContent = []
/**
 * Serializes rows to CSV text and triggers a browser download of it.
 *
 * Only the fixed column list below is exported; other properties on a row are
 * ignored. Cell encoding:
 *   - strings are always wrapped in double quotes, with embedded quotes doubled
 *   - `null` is written as the literal text `null`
 *   - `undefined` (missing column) becomes an empty cell
 *   - anything else (e.g. numbers) is written unquoted
 *
 * @param {Array<Object>} data - Row objects keyed by column name.
 * @param {string} [fileName] - Download name; `data.csv` is used when omitted or empty.
 * @returns {void}
 */
function createCSV(data, fileName) {
  const headers = [
    'id',
    'email',
    'firstName',
    'lastName',
    'postId',
    'postText',
    'postAuthor',
    'postAuthorId',
    'postAuthorUrl',
    'commentId',
    'commentText',
    'commentAuthorName',
    'commentAuthorId',
    'commentAuthorUrl',
    'timestamp',
    'commentUrl',
  ]

  const toCell = (value) => {
    if (value === null) return 'null'
    if (typeof value === 'string') {
      // Quote every string, even ones without commas, so embedded commas,
      // quotes and newlines cannot break the row.
      return `"${value.replace(/"/g, '""')}"`
    }
    return value
  }

  const rows = data.map((row) =>
    headers.map((header) => toCell(row[header])).join(','),
  )
  const csvContent = [headers.join(','), ...rows].join('\n')
  const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' })

  // Resolve the fallback once so both download paths use the same name.
  const downloadName = fileName || 'data.csv'

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, downloadName)
    return
  }

  // Other browsers: click a temporary <a download> pointing at an object URL.
  const url = URL.createObjectURL(blob)
  const link = document.createElement('a')
  link.setAttribute('href', url)
  link.setAttribute('download', downloadName)
  document.body.appendChild(link)
  link.click()
  document.body.removeChild(link)
  URL.revokeObjectURL(url)
}
/**
 * Scrolls the window down by one fixed step, with a pause before and after.
 *
 * Sequence: wait 400 ms, scroll the window down 800 px, wait 1000 ms
 * (presumably to give the page time to load more content — the code itself
 * does not check that anything loaded).
 *
 * @returns {Promise<void>} Resolves after the final pause.
 */
async function scrollDown() {
  const distance = 800
  // The target is the window itself; swap in a container element here if the
  // scrollable area is not the page.
  const wrapper = window

  await new Promise((resolve) => setTimeout(resolve, 400))
  wrapper.scrollBy(0, distance)
  await new Promise((resolve) => setTimeout(resolve, 1000))
}
/**
 * Extracts the first email-looking substring from a piece of text.
 *
 * @param {*} text - Text to search; any non-string value is treated as "no text".
 * @returns {string} The first match, or '' when there is none.
 */
function getEmailFromText(text) {
  // Guard against numbers/objects, which have no .match method.
  if (typeof text !== 'string') return ''
  const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/
  return text.match(emailPattern)?.[0] ?? ''
}
/**
 * Clicks the nested element expected to open a post's comments.
 *
 * For every descendant <div data-visualcompletion="ignore-dynamic"> of `post`,
 * a fixed chain of child indexes is followed; if an element exists at the end
 * of the chain it is clicked. Containers whose structure does not match are
 * skipped silently.
 *
 * @param {Element} post - Root element of a single post.
 * @returns {void}
 */
function clickOnComments(post) {
  // Child index to follow at each level, starting from the matching div.
  // This mirrors the page's current markup and will need updating if it changes.
  const commentTriggerPath = [0, 0, 0, 0, 0, 1, 1, 0, 0]

  // querySelectorAll returns a static list, so DOM changes caused by a click
  // cannot shift the elements still waiting to be visited.
  const containers = post.querySelectorAll(
    'div[data-visualcompletion="ignore-dynamic"]',
  )

  for (const container of containers) {
    const trigger = commentTriggerPath.reduce(
      (node, index) => node?.children?.[index],
      container,
    )
    trigger?.click()
  }
}
/**
 * Recursively collects the trimmed, non-empty text of every text node under
 * `element`, in document order.
 *
 * @param {Node} element - Node to walk (it may itself be a text node).
 * @returns {string[]} Trimmed text fragments; empty when there is no text.
 */
function traverseElementsToGetText(element) {
  // Numeric value of Node.TEXT_NODE, written as a literal so the function
  // does not depend on the global `Node` constructor being present.
  const TEXT_NODE = 3

  if (element.childNodes.length > 0) {
    return Array.from(element.childNodes).flatMap((child) =>
      traverseElementsToGetText(child),
    )
  }

  // Leaf node: only text nodes with non-whitespace content contribute.
  if (element.nodeType !== TEXT_NODE) return []
  const text = element.nodeValue.trim()
  return text === '' ? [] : [text]
}
/**
 * Returns the feed items currently in the DOM that look like posts.
 *
 * A direct child <div> of the `role=feed` container counts as a post when it
 * contains an <h2> with non-empty text (used here as the poster's name).
 *
 * @returns {Element[]} Matching feed items, in DOM order.
 */
function getAllPosts() {
  const feedItems = document.querySelectorAll('div[role=feed] > div')
  return Array.from(feedItems).filter((item) =>
    Boolean(item?.querySelector('h2')?.textContent),
  )
}
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
/**
 * Clicks the first `div[aria-label="Close"]` in the document, if one exists.
 * Does nothing when no such element is present.
 *
 * @returns {void}
 */
function closeDialog() {
  document.querySelector('div[aria-label="Close"]')?.click()
}
/**
 * Converts the top-level comment entries attached to a post into flat rows.
 *
 * Each entry is read as `{ comment: { id, body: { text }, author: { name, id } } }`;
 * any missing level yields `undefined` for the affected fields instead of
 * throwing (for example, an entry whose `body` is absent).
 *
 * @param {string} postId - ID of the post the comments belong to.
 * @param {Array<Object>|null} [topLevelComments] - Raw entries; `null` and
 *   `undefined` are both treated as an empty list.
 * @returns {Array<Object>} One row per entry. firstName/lastName are the first
 *   and second space-separated tokens of the author name.
 */
function formatTopLevelComments(postId, topLevelComments = []) {
  // The default parameter only covers `undefined`; an explicit `null` needs
  // its own fallback.
  return (topLevelComments ?? []).map((entry) => {
    const comment = entry?.comment
    const text = comment?.body?.text
    const commentId = comment?.id
    const authorName = comment?.author?.name
    const authorId = comment?.author?.id
    const [firstName, lastName] = authorName?.split(' ') ?? []

    return {
      id: commentId,
      commentId,
      postId,
      commentText: text || '',
      commentAuthorName: authorName,
      commentAuthorId: authorId,
      email: getEmailFromText(text),
      firstName,
      lastName,
    }
  })
}
/**
 * Shared extraction logic for one story node of a feed response.
 *
 * Reads the actor, message text and post ID from the node's `comet_sections`
 * and formats its `interesting_top_level_comments`. Every lookup is optional,
 * so a node with an unexpected shape yields a post whose fields are
 * `undefined` (and `postText` of '') rather than an exception.
 *
 * @param {Object|undefined} storyNode - Object holding `comet_sections`.
 * @returns {{post: Object, topLevelComments: Array<Object>}}
 */
function parseStoryNode(storyNode) {
  const sections = storyNode?.comet_sections
  const contentSections = sections?.content?.story?.comet_sections
  const feedbackStory = sections?.feedback?.story

  const actor =
    contentSections?.context_layout?.story?.comet_sections?.actor_photo?.story
      ?.actors?.[0]
  const postText = contentSections?.message_container?.story?.message?.text
  const postId = feedbackStory?.post_id
  const [firstName, lastName] = actor?.name?.split(' ') ?? []

  const post = {
    id: postId,
    postId,
    postText: postText || '',
    postAuthor: actor?.name,
    postAuthorId: actor?.id,
    postAuthorUrl: actor?.url,
    email: getEmailFromText(postText),
    firstName,
    lastName,
  }

  const topLevelComments = formatTopLevelComments(
    postId,
    // Normalise a missing or null list so the formatter always gets an array.
    feedbackStory?.feedback_context?.interesting_top_level_comments ?? [],
  )

  return { post, topLevelComments }
}

/**
 * Parses the first line of a feed response, where the story node sits at
 * `data.node.group_feed.edges[0].node`.
 *
 * @param {Object} json - Parsed JSON of the first response line.
 * @returns {{post: Object, topLevelComments: Array<Object>}}
 */
function parseFirstLevelJson(json) {
  return parseStoryNode(json?.data?.node?.group_feed?.edges?.[0]?.node)
}

/**
 * Parses the second line of a feed response, where the story node is
 * `data.node` itself.
 *
 * @param {Object} json - Parsed JSON of the second response line.
 * @returns {{post: Object, topLevelComments: Array<Object>}}
 */
function parseSecondLevelJson(json) {
  return parseStoryNode(json?.data?.node)
}

/**
 * Parses the third line of a feed response; same layout as the second line.
 *
 * @param {Object} json - Parsed JSON of the third response line.
 * @returns {{post: Object, topLevelComments: Array<Object>}}
 */
function parseThirdLevelJson(json) {
  return parseStoryNode(json?.data?.node)
}
/**
 * Appends comments to `allContent`, skipping any whose `commentId` is already
 * present (either from an earlier call or earlier in the same batch).
 *
 * @param {Array<Object>|null} [comments] - Comment rows to add; `null` and
 *   `undefined` are treated as an empty list.
 * @returns {void}
 */
function addCommentsToAllContent(comments = []) {
  // Index existing IDs once instead of rescanning allContent per comment.
  const knownCommentIds = new Set(allContent.map((entry) => entry.commentId))

  for (const comment of comments ?? []) {
    if (knownCommentIds.has(comment.commentId)) continue
    knownCommentIds.add(comment.commentId)
    allContent.push(comment)
  }
}
/**
 * Parses JSON without throwing.
 *
 * @param {string} text - Candidate JSON text.
 * @param {string} label - Short description used in the warning message.
 * @returns {*} The parsed value, or `null` when `text` is not valid JSON.
 */
function safeParseJson(text, label) {
  try {
    return JSON.parse(text)
  } catch (error) {
    console.warn(`could not parse ${label} response as JSON`, error)
    return null
  }
}

/**
 * Handles a posts-pagination response: the first three newline-separated
 * lines are parsed as JSON and passed to the first/second/third-level parser
 * respectively. Each resulting post and its top-level comments are added to
 * `allContent`. Missing, blank or unparsable lines are skipped individually,
 * so one bad line no longer discards the others.
 *
 * @param {string} responseText - Raw response body.
 * @returns {void}
 */
function collectPostsFromResponse(responseText) {
  const lineParsers = [
    ['firstPost', parseFirstLevelJson],
    ['secondPost', parseSecondLevelJson],
    ['thirdPost', parseThirdLevelJson],
  ]
  const lines = responseText.split('\n')

  lineParsers.forEach(([label, parse], index) => {
    const line = lines[index]
    if (!line?.trim()) return
    const json = safeParseJson(line, label)
    if (json === null) return

    const parsed = parse(json)
    console.log(label, parsed)
    allContent.push(parsed.post)
    addCommentsToAllContent(parsed.topLevelComments)
  })
}

/**
 * Handles a focused-story comments response: maps every comment edge to a
 * flat row and adds the rows to `allContent` (de-duplicated by the callee).
 *
 * @param {string} responseText - Raw response body (a single JSON document).
 * @returns {void}
 */
function collectCommentsFromResponse(responseText) {
  const data = safeParseJson(responseText, 'comments')
  const postId = data?.data?.story_card?.post_id
  const edges =
    data?.data?.feedback?.ufi_renderer?.feedback?.comment_list_renderer
      ?.feedback?.comment_rendering_instance_for_feed_location?.comments?.edges

  const comments = edges?.map((edge) => {
    const comment = edge?.node
    const commentId = comment?.id
    const commentText = comment?.body?.text
    const authorName = comment?.author?.name
    // Creation time and permalink come from the timestamp action link.
    const timeInfo = comment?.comment_action_links?.find(
      (link) => link?.__typename === 'XFBCommentTimeStampActionLink',
    )?.comment
    const [firstName, lastName] = authorName?.split(' ') ?? []

    return {
      id: commentId,
      commentId,
      postId,
      commentText,
      commentAuthorName: authorName,
      commentAuthorId: comment?.author?.id,
      commentAuthorUrl: comment?.author?.url,
      timestamp: timeInfo?.created_time,
      commentUrl: timeInfo?.url,
      email: getEmailFromText(commentText),
      firstName,
      lastName,
    }
  })

  addCommentsToAllContent(comments)
  console.log('comments', comments)
}

/**
 * Monkey-patches `XMLHttpRequest.prototype.open` so that responses to GraphQL
 * requests can be harvested.
 *
 * Requests whose URL does not contain "graphql" pass through untouched. For
 * the rest, `send` is wrapped on that instance to remember the request body,
 * and a `load` listener dispatches on the query name found in the body:
 *   - GroupsCometFeedRegularStoriesPaginationQuery -> posts
 *   - CometFocusedStoryViewUFIQuery               -> comments
 * Other queries are ignored. The patch is not undone by this function.
 *
 * @returns {void}
 */
function interceptRequests() {
  const originalOpen = window.XMLHttpRequest.prototype.open

  window.XMLHttpRequest.prototype.open = function (...openArgs) {
    // open() accepts URL objects as well as strings, so normalise first.
    const url = String(openArgs[1])
    if (!url.includes('graphql')) {
      return originalOpen.apply(this, openArgs)
    }

    let requestBody = null
    const originalSend = this.send
    this.send = function (...sendArgs) {
      requestBody = sendArgs[0]
      return originalSend.apply(this, sendArgs)
    }

    this.addEventListener('load', function () {
      // The body may be a non-string (URLSearchParams, FormData, ...);
      // stringify it so the query-name checks cannot throw.
      const bodyText =
        typeof requestBody === 'string'
          ? requestBody
          : String(requestBody ?? '')

      if (bodyText.includes('GroupsCometFeedRegularStoriesPaginationQuery')) {
        console.log('getting posts')
        collectPostsFromResponse(this.responseText)
      } else if (bodyText.includes('CometFocusedStoryViewUFIQuery')) {
        console.log('getting comments')
        collectCommentsFromResponse(this.responseText)
      }
    })

    // Call the original open method
    return originalOpen.apply(this, openArgs)
  }
}
/**
 * Drives the whole scrape.
 *
 * Installs the request interceptor, then for each post currently in the feed:
 * clicks its comment trigger, waits one second, closes any dialog, and — while
 * the module-level `scrolls` budget is above zero — scrolls down and re-reads
 * the post list so newly loaded posts are visited too. Afterwards everything
 * collected in `allContent` is exported as a CSV download.
 *
 * The export happens in a `finally` block, so rows gathered before an
 * unexpected error are still saved; the error itself continues to propagate.
 *
 * @returns {Promise<void>}
 */
async function run() {
  interceptRequests()
  console.log('starting...')
  let posts = getAllPosts()
  console.log('posts.length', posts.length)

  try {
    // `posts` is reassigned after each scroll, so the bound is re-read on
    // every iteration.
    for (let i = 0; i < posts.length; i++) {
      console.log(
        `while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
      )
      clickOnComments(posts[i])
      await sleep(1000)
      closeDialog()

      if (scrolls > 0) {
        await scrollDown()
        scrolls--
        console.log('scrolls left', scrolls)
        console.log('old posts', posts.length)
        const currentPosts = getAllPosts()
        console.log('currentPosts', currentPosts.length)
        posts = currentPosts
      }
    }
  } finally {
    createCSV(allContent, 'facebookGroupPostsAndComments.csv')
    console.log('allContent', allContent)
  }

  console.log('done!')
  console.log(
    `Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: adrian@thewebscrapingguy.com`,
  )
}
// Scroll budget: run() scrolls down once per visited post while this is above
// zero, decrementing it each time. Raise it to load more posts.
let scrolls = 50
// NOTE: Only gets the first level comments
// Top-level await: run this as a module or paste it into the browser console.
await run()
This sort of works, but it doesn't capture any of the post text that I can see. I don't see any code that clicks 'See more' to expand the content; however, I'm not even getting the unexpanded content.
I was able to fix it by changing line 139 from `const posterName = post?.querySelector("h3")?.textContent`
to `const posterName = post?.querySelector("h2")?.textContent || post?.querySelector("h3")?.textContent;`
It appears Facebook groups now use h2 tags for the post header. You'll also have to adjust the "parseTopLevelJson" function (named `parseFirstLevelJson` in the code above), since the APIs now return data in a different format (at least, they do for me in a private group). Here are my changes to that function:
const actor =
json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections
?.context_layout?.story?.comet_sections?.actor_photo?.story?.actors?.[0];
const postText =
json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections?.content
?.story?.comet_sections?.message_container?.story?.message?.text;
const postId =
json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections?.feedback
?.story?.id;
@natereprogle Thanks! Just updated the gist!
@adrianhorning08 your code is working, but I need to tweak it a little bit to produce specific columns. Is there any way to get the comments down to the nth level?
@jaycruz17 not sure
Hi, thank you for the code. I am wondering whether it scrapes replies to the comments or not?
@RevazRevazashvili It only scrapes first-level comments. It cannot scrape sub-level comments (replies).
@RevazRevazashvili It only scrapes first-level comments. It cannot scrape sub-level comments (replies).
Is it not possible to add that feature?
@RevazRevazashvili I was able to improve this code so that it gets the comments, including the sub-level comments.
@RevazRevazashvili I was able to improve this code so that it gets the comments, including the sub-level comments.
Could you send me the code? Also, does it have any limitations? I mean, if I run it on a large group, will Facebook block me?
It's partly working. Is there a chance of a fix? :)
This sort of works, but it doesn't capture any of the post text that I can see. I don't see any code that clicks 'See more' to expand the content; however, I'm not even getting the unexpanded content.