Skip to content

Instantly share code, notes, and snippets.

@m-esm
Last active October 1, 2023 01:57
Show Gist options
  • Save m-esm/79d59b2f6a84f30a5ada23ad4abaf336 to your computer and use it in GitHub Desktop.
Save m-esm/79d59b2f6a84f30a5ada23ad4abaf336 to your computer and use it in GitHub Desktop.
Extract tweets from browser
// Shared result store (tweets keyed by their permalink). An existing store is
// reused so that running the snippet again keeps what was already collected.
window.tweets = window.tweets || {};
/**
 * Builds a plain-data description of a tweet from its rendered DOM element.
 *
 * Side effect: when the tweet embeds a quoted tweet, the quoted tweet's
 * element is parsed recursively and then removed from the DOM (presumably so
 * the lookups below only see the outer tweet's own content).
 *
 * @param {HTMLElement} tweetElem - Root element of a single tweet.
 * @returns {{
 *   username: (string|undefined),
 *   text: (string|undefined),
 *   avatar: (string|null|undefined),
 *   time: (string|null|undefined),
 *   link: string,
 *   isRetweet: boolean,
 *   retweetedBy: (string|undefined),
 *   isQuote: boolean,
 *   quotedTweet: (Object|undefined),
 *   images: string[]
 * }} Extracted fields; `link` is "" when no permalink could be found.
 */
window.parseTweetElement = (tweetElem) => {
  // A span whose text is exactly "Quote" marks an embedded quoted tweet.
  const quoteLabel = Array.from(tweetElem.querySelectorAll("span")).find(
    (span) => span.textContent === "Quote"
  );
  const isQuote = Boolean(quoteLabel);

  let quotedTweet;
  if (quoteLabel) {
    // Declared locally (the original leaked an implicit global here).
    // nextElementSibling skips text/comment nodes, which cannot be parsed.
    const quotedTweetElem = quoteLabel.parentElement?.nextElementSibling;
    if (quotedTweetElem) {
      quotedTweet = window.parseTweetElement(quotedTweetElem);
      quotedTweetElem.remove();
    }
  }

  // The handle is the first span whose text starts with "@".
  const username = Array.from(tweetElem.querySelectorAll("span"))
    .map((span) => span?.textContent)
    .find((content) => content?.startsWith("@"));

  const avatar = tweetElem
    .querySelector('[data-testid="Tweet-User-Avatar"] img')
    ?.getAttribute("src");

  const text = tweetElem.querySelector('[data-testid="tweetText"]')?.textContent;

  // The <time> element carries the timestamp; its parent's href is read as
  // the tweet's relative permalink.
  const timeElem = tweetElem.querySelector("time");
  const time = timeElem?.getAttribute("datetime");
  const link = timeElem?.parentElement?.getAttribute("href");

  // A non-empty "socialContext" banner is treated as a retweet marker.
  const isRetweet = !!tweetElem.querySelector('[data-testid="socialContext"]')
    ?.textContent;
  const retweetedBy = tweetElem.querySelector(
    '[data-testid="socialContext"] span span'
  )?.textContent;

  const images = Array.from(
    tweetElem.querySelectorAll('[data-testid="tweetPhoto"] img')
  )
    .map((img) => img?.getAttribute("src"))
    .filter((src) => src);

  return {
    username,
    text,
    avatar,
    time,
    link: link ? `https://twitter.com${link}` : "",
    isRetweet,
    retweetedBy,
    isQuote,
    quotedTweet,
    images,
  };
};
/**
 * Parses every tweet currently rendered in the page and stores the ones not
 * seen before in `window.tweets`, keyed by permalink. Logs the running total.
 */
window.fetchTweets = () => {
  if (!window.tweets) window.tweets = {};
  const store = window.tweets;

  document.querySelectorAll('[data-testid="tweet"]').forEach((tweetElem) => {
    const tweet = window.parseTweetElement(tweetElem);
    // The permalink is the dedupe key, so an entry without one cannot be
    // stored meaningfully. (The previous guard, `Object.values(tweet).find(
    // (p) => !p)`, could never be truthy because `find` returns the falsy
    // element it matched.)
    if (!tweet.link) return;
    if (!store[tweet.link]) store[tweet.link] = tweet;
  });

  console.log(`Total tweets extracted: ${Object.keys(store).length}`);
};
/**
 * Repeatedly extracts the visible tweets and scrolls the page, then downloads
 * everything collected in `window.tweets` as a pretty-printed JSON file named
 * `tweets_<timestamp>.json`.
 *
 * @param {number} scrollHeight - Vertical distance passed to `window.scrollBy` each round.
 * @param {number} iterations - Number of extract-then-scroll rounds.
 * @param {number} [delayMs=3000] - Wait after each scroll before the next round.
 * @returns {Promise<void>} Resolves once the download has been triggered.
 */
window.scrollAndExtract = async (scrollHeight, iterations, delayMs = 3000) => {
  for (let i = 0; i < iterations; i++) {
    window.fetchTweets();
    window.scrollBy(0, scrollHeight);
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  // Final pass: without it, whatever the last scroll brought into view would
  // never be extracted.
  window.fetchTweets();

  const jsonString = JSON.stringify(Object.values(window.tweets ?? {}), null, 2);
  const blob = new Blob([jsonString], {
    type: "application/json",
  });
  const url = URL.createObjectURL(blob);

  // Trigger the download through a temporary anchor element.
  const a = document.createElement("a");
  a.href = url;
  a.target = "_blank";
  a.download = `tweets_${Date.now()}.json`;
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Always detach the anchor and release the object URL, even if click throws.
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
  }
};
// Run immediately: three extract-and-scroll rounds, scrolling by 10000 each
// time, then download the collected tweets as JSON. Uses top-level await, so
// this must run somewhere that supports it (presumably the browser console).
await scrollAndExtract(10000, 3);
@m-esm
Copy link
Author

m-esm commented Sep 27, 2023

Example extracted tweet:

 {
    "username": "@elonmusk",
    "text": "Interesting …",
    "avatar": "https://pbs.twimg.com/profile_images/1683325380441128960/yRsRRjGO_x96.jpg",
    "time": "2023-09-26T23:56:25.000Z",
    "link": "https://twitter.com/elonmusk/status/1706820047421759871",
    "isRetweet": false,
    "isQuote": true,
    "quotedTweet": {
      "username": "@GlobalAffairs",
      "text": "In response to today's discussion about disinformation in Europe, we reiterate that X is committed to complying with the DSA. \n\nThe EU’s own data shows other services saw greater changes in subscriber growth.",
      "avatar": "https://pbs.twimg.com/profile_images/1683512189213200385/i554EDOS_normal.jpg",
      "time": "2023-09-26T21:49:49.000Z",
      "link": null,
      "isRetweet": false,
      "isQuote": false,
      "images": [
        "https://pbs.twimg.com/media/F6-5e3nXcAAz-J9?format=jpg&name=medium"
      ]
    },
    "images": [
      "https://pbs.twimg.com/media/F6-5e3nXcAAz-J9?format=jpg&name=medium"
    ]
  }

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment