Skip to content

Instantly share code, notes, and snippets.

@realamirhe
Last active November 4, 2022 09:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save realamirhe/7d3310ca4057afb2e56667b5913c52aa to your computer and use it in GitHub Desktop.
Save realamirhe/7d3310ca4057afb2e56667b5913c52aa to your computer and use it in GitHub Desktop.
tweet client crawler
// Global store for the collected tweets. crawlTweets adds one entry per tweet,
// keyed by the tweet text element's id with its "id__" prefix removed.
// Starts out empty every time this script is run.
window.tweetsRef = {};
/**
 * Wraps `callback` so that it only runs once calls have stopped for `time` ms.
 *
 * Every call to the returned function cancels the pending timer and starts a
 * new one. When the timer finally fires, `callback` is invoked with the
 * arguments of the most recent call.
 *
 * @param {Function} callback - Function to invoke once the calls settle.
 * @param {number} time - Quiet period, in milliseconds.
 * @returns {Function} Debounced wrapper around `callback`.
 */
function debounceEvent(callback, time) {
  let timer = null;
  return (...args) => {
    clearTimeout(timer);
    timer = setTimeout(() => {
      timer = null;
      // Forward the caller's own arguments (e.g. the scroll event). An arrow
      // function has no `arguments` binding of its own, so reading `arguments`
      // here would yield debounceEvent's (callback, time) pair instead.
      callback(...args);
    }, time);
  };
}
/**
 * Creates a handler that scrapes the tweets currently present in the DOM.
 *
 * Each run looks up every tweet text element whose `lang` attribute equals
 * `lang`, reads details from its enclosing `article`, and merges the result
 * into `window.tweetsRef`. Entries are keyed by the text element's id with the
 * "id__" prefix removed, so a tweet seen again simply overwrites its entry.
 *
 * Stored entry shape:
 *   { text, meta: { hasVideo, hasPhotos, username, link, profile, dateTime,
 *                   "image-1": src, "image-2": src, ... } }
 *
 * @param {string} lang - Value of the `lang` attribute to collect (e.g. "fa").
 * @returns {Function} Zero-argument handler performing one crawl pass.
 */
const crawlTweets = (lang) => () => {
  // Keep accumulating into the existing store; fall back to a fresh object so
  // the handler also works when the store has not been initialised.
  const store = window.tweetsRef ?? {};
  const tweetNodes = document.querySelectorAll(`div[id^='id__'][lang='${lang}']`);

  for (const tw of tweetNodes) {
    const tweetId = tw.getAttribute("id");
    const parent = document.querySelector(`article[aria-labelledby*='${tweetId}']`);
    // Without the enclosing article there is nothing to read; skip this tweet
    // instead of throwing and losing the rest of the pass.
    if (!parent) continue;

    const metaTags = parent.querySelector("div[id][aria-labelledby]")?.querySelectorAll("[src]") ?? [];
    // Any of these elements may be absent; a missing one yields `undefined`
    // for that field rather than aborting the crawl.
    const link = parent.querySelector("a[dir='auto'][aria-label][role='link']")?.href;
    const username = parent.querySelector(`[data-testid="User-Names"] a[role="link"]`)?.innerText;
    const profile = parent.querySelector(`img[src^='https://pbs.twimg.com/profile_images']`)?.src;
    const dateTime = parent.querySelector("time")?.dateTime;

    const meta = { hasVideo: false, hasPhotos: false, username, link, profile, dateTime };
    // Counted separately from the loop position so image keys are always
    // contiguous (image-1, image-2, ...) even when videos are mixed in.
    let imageCount = 0;
    for (const tag of metaTags) {
      // video assets
      if (tag.tagName === "VIDEO") {
        meta.hasVideo = true;
        continue;
      }
      // image assets: only draggable <img> elements are treated as media
      if (tag.tagName !== "IMG" || tag.getAttribute("draggable") !== "true") continue;
      const src = tag.getAttribute("src");
      // Avatars are reported through `profile`, not as tweet photos.
      if (src.startsWith("https://pbs.twimg.com/profile")) continue;
      imageCount += 1;
      meta[`image-${imageCount}`] = src;
      meta.hasPhotos = true;
    }

    store[tweetId.replace("id__", "")] = { text: tw.innerText, meta };
  }

  window.tweetsRef = store;
  console.log(`${Object.keys(window.tweetsRef).length} '${lang}' tweet has been collected`);
};
// Language of the tweets to collect (matched against the `lang` attribute).
const lang = "fa";
// Debounced so a burst of scroll events triggers a single crawl pass 200 ms
// after scrolling settles.
const crawler = debounceEvent(crawlTweets(lang), 200);
// removeEventListener only detaches the exact function that was registered,
// so passing the freshly created `crawler` would remove nothing. Keep the
// active handler on `window` so that running this script again replaces the
// previous listener instead of stacking another one.
if (window.tweetsCrawler) {
  window.removeEventListener("scroll", window.tweetsCrawler);
}
window.tweetsCrawler = crawler;
window.addEventListener("scroll", crawler);
@realamirhe
Copy link
Author

  1. Add the code above to a Chrome DevTools snippet
  2. Run the snippet
  3. Scroll until the desired data has been collected
  4. Convert the collected data to JSON:
console.log(JSON.stringify(window.tweetsRef, null, 2))

  5. Copy the printed data

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment