Last active
November 4, 2022 09:53
-
-
Save realamirhe/7d3310ca4057afb2e56667b5913c52aa to your computer and use it in GitHub Desktop.
tweet client crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Shared store for every tweet collected so far, keyed by tweet id.
// It lives on `window` so it can be inspected or copied from the console.
Object.assign(window, { tweetsRef: {} });
/**
 * Wraps `callback` so that it only runs after calls have stopped for `time` ms.
 *
 * Every call to the returned function cancels the pending timer and starts a
 * new one; when the timer finally fires, `callback` receives the arguments of
 * the most recent call.
 *
 * @param {Function} callback - function to invoke once the calls settle
 * @param {number} time - quiet period in milliseconds
 * @returns {Function} debounced wrapper around `callback`
 */
function debounceEvent(callback, time) {
  let timerId;
  // Arrow functions have no `arguments` of their own, so the call arguments
  // are captured with a rest parameter and spread into the callback.
  return (...args) => {
    clearTimeout(timerId);
    timerId = setTimeout(() => {
      timerId = null;
      callback(...args);
    }, time);
  };
}
/**
 * Builds a handler that scrapes the tweets currently present in the document
 * and merges them into `window.tweetsRef`.
 *
 * Each tweet text node (`div[id^='id__']` with the requested `lang`) is matched
 * to the `article` whose `aria-labelledby` contains its id. The stored entry is
 * `{ text, meta }`, keyed by the id without its `id__` prefix, where `meta`
 * holds `hasVideo`, `hasPhotos`, `username`, `link`, `profile`, `dateTime` and
 * one `image-N` key per attached image (numbered from 1).
 *
 * Tweets without an enclosing article are skipped, and any missing
 * link/username/profile/time element yields `undefined` for that field, so a
 * single odd node no longer aborts the whole pass.
 *
 * @param {string} lang - value of the `lang` attribute to collect, e.g. "fa"
 * @returns {Function} zero-argument handler that performs one crawl pass
 */
const crawlTweets = (lang) => () => {
  // Keep accumulating into the existing store; create one if it is missing.
  const collected = window.tweetsRef ?? {};
  const tweetNodes = document.querySelectorAll(`div[id^='id__'][lang='${lang}']`);

  for (const tw of tweetNodes) {
    const tweetId = tw.getAttribute("id");
    const parent = document.querySelector(`article[aria-labelledby*='${tweetId}']`);
    // Without the enclosing article there is no metadata to read.
    if (!parent) continue;

    const metaTags = parent.querySelector("div[id][aria-labelledby]")?.querySelectorAll("[src]") ?? [];
    const link = parent.querySelector("a[dir='auto'][aria-label][role='link']")?.href;
    const username = parent.querySelector(`[data-testid="User-Names"] a[role="link"]`)?.innerText;
    const profile = parent.querySelector(`img[src^='https://pbs.twimg.com/profile_images']`)?.src;
    const dateTime = parent.querySelector("time")?.dateTime;

    const meta = { hasVideo: false, hasPhotos: false, username, link, profile, dateTime };
    // Counted separately from the loop position so image keys stay contiguous
    // even when videos or ignored elements sit between the images.
    let imageCount = 0;

    for (const tag of metaTags) {
      // video assets
      if (tag.tagName === "VIDEO") {
        meta.hasVideo = true;
        continue;
      }
      // image assets: only draggable images are treated as tweet media
      if (tag.tagName !== "IMG" || tag.getAttribute("draggable") !== "true") continue;
      const src = tag.getAttribute("src");
      // avatars are exposed through `profile`, not as attached photos
      if (src.startsWith("https://pbs.twimg.com/profile")) continue;
      imageCount += 1;
      meta[`image-${imageCount}`] = src;
      meta.hasPhotos = true;
    }

    collected[tweetId.replace("id__", "")] = { text: tw.innerText, meta };
  }

  window.tweetsRef = collected;
  console.log(`${Object.keys(window.tweetsRef).length} '${lang}' tweet has been collected`);
};
// Language code matched against each tweet's `lang` attribute.
const lang = "fa";
// Crawl at most once per 200 ms pause in scrolling.
const crawler = debounceEvent(crawlTweets(lang), 200);
// removeEventListener only detaches the exact function object that was
// registered, so the active handler is kept on `window`; re-running this
// snippet then replaces the previous listener instead of stacking a new one.
if (window.tweetsCrawler) {
  window.removeEventListener("scroll", window.tweetsCrawler);
}
window.tweetsCrawler = crawler;
window.addEventListener("scroll", crawler);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Use: https://www.convertcsv.com/json-to-csv.htm