Skip to content

Instantly share code, notes, and snippets.

@neopunisher
Created January 23, 2024 20:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neopunisher/e3a32b48e8959b3d3c099901f0a0047a to your computer and use it in GitHub Desktop.
Save neopunisher/e3a32b48e8959b3d3c099901f0a0047a to your computer and use it in GitHub Desktop.
/**
 * Serialises `data` to CSV text and triggers a browser download of it.
 *
 * The column set is fixed (see `headers`); each row contributes one line whose
 * fields are read from `row[header]`:
 *   - `null` is written as the bare text `null`
 *   - strings are always wrapped in double quotes, with embedded `"` doubled
 *   - `undefined` (missing key) becomes an empty field, because
 *     `Array.prototype.join` renders `undefined` as ''
 *   - anything else (booleans, numbers) uses its default string conversion
 *
 * @param {Array<Object>} data - Rows to export.
 * @param {string} [fileName] - Download name; falls back to 'data.csv' when
 *   empty or omitted (on both the legacy `msSaveBlob` path and the anchor path).
 * @returns {void}
 */
function createCSV(data, fileName) {
  const headers = [
    'id',
    'author_name',
    'post',
    'is_post',
    'comment',
    'is_comment',
    'first_name',
    'last_name',
    'email',
  ]
  const downloadName = fileName || 'data.csv'

  const formatField = (value) => {
    if (value === null) return 'null'
    if (typeof value === 'string') {
      // Quote every string field, even ones without commas, and escape quotes
      return `"${value.replace(/"/g, '""')}"`
    }
    return value
  }

  const rowLines = data.map((row) =>
    headers.map((header) => formatField(row[header])).join(','),
  )
  const csvContent = [headers.join(','), ...rowLines].join('\n')
  const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' })

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, downloadName)
    return
  }

  // Everything else: click a temporary <a download> pointing at an object URL
  const url = URL.createObjectURL(blob)
  const link = document.createElement('a')
  link.setAttribute('href', url)
  link.setAttribute('download', downloadName)
  document.body.appendChild(link)
  try {
    link.click()
  } finally {
    // Always detach the anchor and release the object URL, even if click() throws
    document.body.removeChild(link)
    URL.revokeObjectURL(url)
  }
}
/**
 * Scrolls the window down one step and then pauses.
 *
 * Sequence: wait 400 ms, call `window.scrollBy(0, 2000)`, wait a further
 * 1000 ms before resolving.
 *
 * @returns {Promise<void>}
 */
async function scrollDown() {
  const scrollDistancePx = 2000
  const delayBeforeScrollMs = 400
  const delayAfterScrollMs = 1000
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

  await wait(delayBeforeScrollMs)
  window.scrollBy(0, scrollDistancePx)
  await wait(delayAfterScrollMs)
}
/**
 * Returns the first email-looking substring of `text`.
 *
 * @param {string|null|undefined} text - Text to search.
 * @returns {string} The first match, or '' when `text` is nullish or has none.
 */
function getEmailFromText(text) {
  const matches = text?.match(
    /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
  )
  if (!matches) {
    return ''
  }
  const [firstEmail] = matches
  return firstEmail || ''
}
/**
 * Clicks the control nested inside each
 * `div[data-visualcompletion="ignore-dynamic"]` of `post`.
 *
 * The control is located by a fixed child-index path below the matching div;
 * when any step of that path is missing, the div is skipped silently.
 *
 * @param {Element} post - Post container to search.
 * @returns {void}
 */
function clickOnComments(post) {
  // Child indices walked from a matching div down to the clickable element
  const togglePath = [0, 0, 0, 0, 0, 1, 1, 0, 0]

  // getElementsByTagName returns a live collection; snapshot it so DOM changes
  // caused by a click cannot shift indices and skip or repeat an element.
  const divs = Array.from(post.getElementsByTagName('div'))

  for (const div of divs) {
    if (div.getAttribute('data-visualcompletion') !== 'ignore-dynamic') {
      continue
    }
    let node = div
    for (const index of togglePath) {
      node = node?.children?.[index]
    }
    if (node) {
      node.click()
    }
  }
}
/**
 * Collects the trimmed text of every non-blank text node under `element`,
 * depth-first in document order.
 *
 * @param {Node} element - Root node to walk.
 * @returns {string[]} Trimmed text fragments; empty when there are none.
 */
function traverseElements(element) {
  const children = element.childNodes

  // Leaf: contributes its own text only if it is a non-blank text node
  if (children.length === 0) {
    const isTextNode = element.nodeType === Node.TEXT_NODE
    if (isTextNode && element.nodeValue.trim() !== '') {
      return [element.nodeValue.trim()]
    }
    return []
  }

  // Branch: concatenate whatever each child yields
  return Array.from(children).flatMap((child) => traverseElements(child))
}
/**
 * Joins the text fragments of one comment into a single string.
 *
 * Fragments equal to 'Reply', 'Like', 'Top Contributor', 'Follow' or '·', and
 * fragments consisting only of digits, are dropped. From what remains, the
 * first fragment is skipped and the rest is cut at index
 * `textArray.length - 3`. Note that this end index is computed from the
 * ORIGINAL (unfiltered) array length.
 *
 * @param {string[]|null|undefined} textArray - Fragments of one comment.
 * @returns {string|undefined} Space-joined text, or `undefined` when
 *   `textArray` is nullish.
 */
function getTextFromComment(textArray) {
  if (textArray == null) {
    return undefined
  }

  const uiLabels = ['Reply', 'Like', 'Top Contributor', 'Follow', '·']
  const isNoise = (section) =>
    uiLabels.includes(section) || Boolean(section?.match(/^\d+$/))

  const kept = textArray.filter((section) => !isNoise(section))
  const endIndex = textArray.length - 3
  return kept.slice(1, endIndex).join(' ')
}
/**
 * Extracts one record per comment found under `post`, or, when no post is
 * given, under the first `div[role=dialog]` in the document.
 *
 * A comment is any descendant `div` whose `aria-label` starts with
 * "Comment by".
 *
 * @param {Element} [post] - Container to search; omit to search the dialog.
 * @returns {Array<{id: string, author_name: string, comment: string, email: string}>}
 *   Comment records; empty when there is no container or no comments.
 */
function extractComments(post = undefined) {
  const parent = post ? post : document?.querySelector('div[role=dialog]')
  if (!parent) {
    return []
  }

  const commentDivs = Array.from(parent.getElementsByTagName('div')).filter(
    (div) => {
      const label = div.getAttribute('aria-label')
      return Boolean(label) && label.startsWith('Comment by')
    },
  )

  return commentDivs.map((div) => {
    const fragments = traverseElements(div)
    const commentText = getTextFromComment(fragments)
    return {
      // All fragments, hyphen-joined and lower-cased, act as the record id
      id: fragments?.join('-')?.toLowerCase(),
      // The first fragment is taken as the commenter's name
      author_name: fragments?.[0],
      comment: commentText,
      email: getEmailFromText(commentText),
    }
  })
}
/**
 * Lists the feed entries that look like posts.
 *
 * Takes the direct `div` children of `div[role=feed]` and keeps those that
 * contain an `h3` with non-empty text (read elsewhere as the poster name).
 *
 * @returns {Element[]} Matching feed entries, in document order.
 */
function getAllPosts() {
  const feedEntries = Array.from(
    document.querySelectorAll('div[role=feed] > div'),
  )
  return feedEntries.filter((entry) =>
    Boolean(entry?.querySelector('h3')?.textContent),
  )
}
/**
 * Returns a promise that resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms)
  })
/**
 * Clicks the first `div[aria-label="Close"]` in the document, if one exists.
 *
 * @returns {void}
 */
function closeDialog() {
  const closeControl = document?.querySelector('div[aria-label="Close"]')
  if (closeControl) {
    closeControl.click()
  }
}
/**
 * Returns the body text of a post as one space-joined string.
 *
 * The body element is located with a fixed positional selector. (A previous
 * attempt with `div[data-ad-preview="message"]` was noted as not matching
 * every post.)
 *
 * @param {Element|null|undefined} post - Post container.
 * @returns {string|undefined} The post text, or `undefined` when `post` is
 *   nullish or the body element is not found.
 */
function getPostText(post) {
  const bodySelector =
    'div > div > div > div > div > div > div > div > div > div:nth-child(8) > div > div > div:nth-child(3) > div'
  const bodyElement = post?.querySelector(bodySelector)
  return bodyElement ? traverseElements(bodyElement)?.join(' ') : undefined
}
/**
 * Clicks every `div[role="button"]` inside `post` whose trimmed text is
 * exactly "See more".
 *
 * @param {Element} post - Post container to search.
 * @returns {void}
 */
function clickSeeMoreIfItsThere(post) {
  for (const button of post.querySelectorAll('div[role="button"]')) {
    const label = button.textContent.trim()
    if (label !== 'See more') {
      continue
    }
    button.click()
  }
}
/**
 * Tags each comment record with the text of the post it belongs to.
 *
 * Every result starts from `{ post: post?.post, is_comment: true }` and is then
 * overlaid with the comment's own fields, so a comment's keys win on conflict.
 *
 * @param {{post?: string}|null|undefined} post - Post record.
 * @param {Array<Object>} comments - Comment records.
 * @returns {Array<Object>} New objects; the inputs are not modified.
 */
function getAllCommentsAndFormat(post, comments) {
  const tagWithPost = (comment) => {
    const base = { post: post?.post, is_comment: true }
    return Object.assign(base, comment)
  }
  return comments.map(tagWithPost)
}
/**
 * Builds an id for a post from its author name and text.
 *
 * Spaces in each part become hyphens, the two parts are joined with a hyphen,
 * and the result is lower-cased. A nullish part is rendered as the text
 * "undefined".
 *
 * @param {string|null|undefined} posterName - Author name.
 * @param {string|null|undefined} postText - Post text.
 * @returns {string} The id.
 */
function getPostId(posterName, postText) {
  const hyphenate = (value) => value?.split(' ')?.join('-')
  const namePart = hyphenate(posterName)
  const textPart = hyphenate(postText)
  return `${namePart}-${textPart}`?.toLowerCase()
}
/**
 * Entry point: walks the posts currently in the feed, collects each post and
 * its comments, de-duplicates the records by `id`, and downloads them as
 * `facebookGroupPostAndComments.csv`.
 *
 * For each post it: expands "See more", reads the post text, collects the
 * comments already present in the post, clicks the comments control, collects
 * the comments from the dialog, and closes the dialog. While the module-level
 * `scrolls` counter is positive it then scrolls down, decrements the counter,
 * and re-reads the post list.
 *
 * @returns {Promise<void>}
 */
async function run() {
  console.log('starting...')
  const allContent = []
  let posts = getAllPosts()
  console.log('posts.length', posts.length)

  // `posts` can be replaced inside the loop, so the bound is re-read each pass
  for (let i = 0; i < posts.length; i++) {
    const post = posts[i]
    console.log(
      `while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
    )
    const posterName = post?.querySelector('h3')?.textContent
    console.log('posterName', posterName)

    clickSeeMoreIfItsThere(post)
    await sleep(1000)
    const postText = getPostText(post)

    // Comments can appear both inline in the post and in the dialog opened by
    // the click; both sets are collected and de-duplicated by id afterwards.
    const commentsDisplayedWithoutClicking = extractComments(post)
    clickOnComments(post)
    await sleep(1000)
    const commentsAfterClickingModal = extractComments()
    closeDialog()

    const [firstName, lastName] = posterName?.split(' ') ?? []
    const content = {
      id: getPostId(posterName, postText),
      is_post: true,
      author_name: posterName,
      first_name: firstName,
      last_name: lastName,
      post: postText,
      email: getEmailFromText(postText),
    }
    const comments = getAllCommentsAndFormat(content, [
      ...commentsDisplayedWithoutClicking,
      ...commentsAfterClickingModal,
    ])
    allContent.push(content, ...comments)

    if (scrolls > 0) {
      await scrollDown()
      scrolls--
      const currentPosts = getAllPosts()
      console.log('currentPosts', currentPosts.length)
      posts = currentPosts
    }
  }

  // Keep the first record seen for each id; a Set makes each lookup O(1)
  const seenIds = new Set()
  const unique = allContent.filter((content) => {
    if (seenIds.has(content.id)) {
      return false
    }
    seenIds.add(content.id)
    return true
  })

  console.log('done!')
  console.log('allContent', unique)
  createCSV(unique, 'facebookGroupPostAndComments.csv')
  console.log(
    `Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: adrian@thewebscrapingguy.com`,
  )
}
// NOTE: to increase the number of posts, increase the "scrolls" variable below
// `run()` scrolls the page once per processed post while this counter is
// positive, decrementing it each time.
let scrolls = 5
// Top-level await: this line only parses where that is supported (an ES module
// or, presumably, the browser DevTools console this script is pasted into).
await run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment