Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Last active March 23, 2024 16:21
Show Gist options
  • Star 9 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save lobstrio/fc98c162451541ad7692dd2cf7ceb324 to your computer and use it in GitHub Desktop.
Save lobstrio/fc98c162451541ad7692dd2cf7ceb324 to your computer and use it in GitHub Desktop.
Collect 100 first tweets from any user at any time — and export to a .csv file 🐦
# =============================================================================
# Title: Twitter Users Tweets Scraper
# Language: Python
# Description: This script scrapes the first 100 tweets
# of any Twitter user.
# Author: Sasha Bouloudnine
# Date: 2023-08-08
#
# Usage:
# - Make sure you have the required libraries installed by running:
# `pip install requests`
# - Run the script using `python twitter_scraper.py`.
# - Use the command-line options:
# - `--username` to specify the Twitter username from which to scrape tweets.
# - `--limit` to set the maximum number of tweets to scrape.
#
# Notes:
# - As of July 1st, 2023, Twitter removed public access to user tweets.
# - Starting from August 1st, 2023, the script is no longer constrained by the limit
# but can collect a maximum of 100 tweets per user.
#
# =============================================================================
import csv
import json
import requests
import argparse
import datetime
import time
import re
# ---------------------------------------------------------------------------
# Static request parameters. Every value below is a constant that was
# copy-pasted from the Twitter website.
# ---------------------------------------------------------------------------

# Feature flags sent as the `features` parameter of the UserByScreenName
# query. Kept as a ready-to-send JSON string.
FEATURES_USER = '{"hidden_profile_likes_enabled":false,"hidden_profile_subscriptions_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":false,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}'

# Feature flags sent as the `features` parameter of the UserTweets query.
FEATURES_TWEETS = '{"rweb_lists_timeline_redesign_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":false,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_media_download_video_enabled":false,"responsive_web_enhance_cards_enabled":false}'

# Fixed Bearer value, copy-pasted from the website.
AUTHORIZATION_TOKEN = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'

# Base request headers. The 'x-guest-token' entry is deliberately absent
# here: it is only known at runtime, once a TwitterScraper is created.
HEADERS = {
    'authorization': f'Bearer {AUTHORIZATION_TOKEN}',
}

# GraphQL endpoints: user lookup by screen name, and a user's tweets.
GET_USER_URL = 'https://twitter.com/i/api/graphql/SAMkL5y_N9pmahSw8yy6gw/UserByScreenName'
GET_TWEETS_URL = 'https://twitter.com/i/api/graphql/XicnWRbyQ3WgVY__VataBQ/UserTweets'

# Column order of the exported file; these match the keys of each tweet row.
FIELDNAMES = [
    'id',
    'tweet_url',
    'name',
    'user_id',
    'username',
    'published_at',
    'content',
    'views_count',
    'retweet_count',
    'likes',
    'quote_count',
    'reply_count',
    'bookmarks_count',
    'medias',
]
class TwitterScraper:
    """Scrape the tweets of a single Twitter user with a guest token.

    Typical use: build the scraper with a username, call `iter_tweets` to
    collect the tweet rows, then pass them to `generate_csv`.
    """

    # Seconds to wait for each HTTP response. Without a timeout `requests`
    # may block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def __init__(self, username):
        """Fetch a guest token from the home page and prepare request headers.

        Args:
            username: screen name of the account to scrape.

        Raises:
            ValueError: if `username` is empty (checked before any request).
            RuntimeError: if no guest token is found in the home page
                cookies or HTML.
        """
        if not username:
            raise ValueError("a Twitter username is required")
        self.username = username
        # The guest token comes either as the `gt` cookie or embedded in the
        # page text as `"gt=<value>;`.
        resp = requests.get("https://twitter.com/", timeout=self.REQUEST_TIMEOUT)
        self.gt = resp.cookies.get_dict().get("gt") or self._find_guest_token(resp.text)
        if not self.gt:
            raise RuntimeError(
                "no guest token ('gt') found in the https://twitter.com/ response"
            )
        # Work on a copy so the module-level HEADERS stays untouched and
        # several scrapers never share (or overwrite) each other's token.
        self.HEADERS = {**HEADERS, 'x-guest-token': self.gt}

    @staticmethod
    def _find_guest_token(html):
        """Return the first guest token embedded in `html`, or "" if none."""
        match = re.search(r'(?<=\"gt\=)[^;]+', html or "")
        return match.group(0) if match else ""

    @staticmethod
    def _dig(mapping, *keys, default=None):
        """Walk nested dicts by `keys`; return `default` if a level is missing or null."""
        node = mapping
        for key in keys:
            if not isinstance(node, dict):
                return default
            node = node.get(key)
        return default if node is None else node

    def _get_json(self, url, params):
        """GET `url` with the scraper's headers and return the decoded JSON body.

        On a non-JSON body the status code and raw text are printed before
        the decoding error is re-raised, to make the failure diagnosable.
        """
        response = requests.get(
            url,
            params=params,
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        try:
            return response.json()
        except requests.exceptions.JSONDecodeError:
            print(response.status_code)
            print(response.text)
            raise

    def get_user(self):
        """Look up the user and return a dict with `id`, `username`, `full_name`.

        `id` and `full_name` are None when the response does not contain them.
        """
        arg = {"screen_name": self.username, "withSafetyModeUserFields": True}
        params = {
            'variables': json.dumps(arg),
            'features': FEATURES_USER,
        }
        json_response = self._get_json(GET_USER_URL, params)
        result = self._dig(json_response, "data", "user", "result", default={})
        legacy = self._dig(result, "legacy", default={})
        return {
            "id": result.get("rest_id"),
            "username": self.username,
            "full_name": legacy.get("name")
        }

    def tweet_parser(
            self,
            user_id,
            full_name,
            tweet_id,
            item_result,
            legacy
    ):
        """Flatten one timeline tweet into a row keyed like FIELDNAMES.

        Args:
            user_id: id of the scraped user.
            full_name: display name of the scraped user.
            tweet_id: identifier used for the `id` column and the tweet URL.
            item_result: the tweet `result` object (read for the view count).
            legacy: the tweet's `legacy` object (text, counters, entities).
                May be None or partial; missing values become None.

        Returns:
            dict with one entry per exported column.
        """
        item_result = item_result or {}
        legacy = legacy or {}
        # Missing `entities` or `media` simply means "no media".
        media_items = self._dig(legacy, "entities", "media")
        if media_items:
            medias = ", ".join(
                "%s (%s)" % (d.get("media_url_https"), d.get('type'))
                for d in media_items
            )
        else:
            medias = None
        return {
            "id": tweet_id,
            "tweet_url": f"https://twitter.com/{self.username}/status/{tweet_id}",
            "name": full_name,
            "user_id": user_id,
            "username": self.username,
            "published_at": legacy.get("created_at"),
            "content": legacy.get("full_text"),
            "views_count": self._dig(item_result, "views", "count"),
            "retweet_count": legacy.get("retweet_count"),
            "likes": legacy.get("favorite_count"),
            "quote_count": legacy.get("quote_count"),
            "reply_count": legacy.get("reply_count"),
            "bookmarks_count": legacy.get("bookmark_count"),
            "medias": medias
        }

    def iter_tweets(self, limit=120):
        """Collect up to `limit` tweets of the user, page by page.

        Args:
            limit: maximum number of tweets to collect.

        Returns:
            list of tweet rows as produced by `tweet_parser`.

        Raises:
            NotImplementedError: if no user id could be resolved.
        """
        print(f"[+] scraping: {self.username}")
        _user = self.get_user()
        full_name = _user.get("full_name")
        user_id = _user.get("id")
        if not user_id:
            print("/!\\ error: no user id found")
            raise NotImplementedError("no user id found for %r" % self.username)
        cursor = None
        _tweets = []
        while True:
            var = {
                "userId": user_id,
                "count": 100,
                "cursor": cursor,
                "includePromotedContent": True,
                "withQuickPromoteEligibilityTweetFields": True,
                "withVoice": True,
                "withV2Timeline": True
            }
            params = {
                'variables': json.dumps(var),
                'features': FEATURES_TWEETS,
            }
            json_response = self._get_json(GET_TWEETS_URL, params)
            instructions = self._dig(
                json_response,
                "data", "user", "result", "timeline_v2", "timeline", "instructions",
                default=[],
            )
            # Only the first "TimelineAddEntries" instruction carries the page.
            entries = []
            for instruction in instructions:
                if instruction.get("type") == "TimelineAddEntries":
                    entries = instruction.get("entries") or []
                    break
            next_cursor = None
            for entry in entries:
                content = entry.get("content") or {}
                entry_type = content.get("entryType")
                if entry_type == "TimelineTimelineItem":
                    item_result = self._dig(
                        content, "itemContent", "tweet_results", "result", default={}
                    )
                    legacy = item_result.get("legacy")
                    if not legacy:
                        # e.g. tombstones (age-restricted content) have no
                        # `legacy` payload: skip them instead of crashing.
                        continue
                    _tweets.append(
                        self.tweet_parser(
                            user_id, full_name, entry.get("sortIndex"), item_result, legacy
                        )
                    )
                elif entry_type == "TimelineTimelineCursor" and content.get("cursorType") == "Bottom":
                    # NB: after 07/01 lock and unlock, no more cursor is available
                    # without login, i.e. max. 100 tweets per username.
                    next_cursor = content.get("value")
                if len(_tweets) >= limit:
                    # Stop once the limit provided by the user is reached.
                    break
            print(f"[#] tweets scraped: {len(_tweets)}")
            if len(_tweets) >= limit or len(entries) == 2:
                break
            if next_cursor is None or next_cursor == cursor:
                # No fresh cursor on this page: requesting again would replay
                # the same page forever.
                break
            cursor = next_cursor
        return _tweets

    def generate_csv(self, tweets=None):
        """Write `tweets` to a tab-delimited `<username>_<timestamp>.csv` file.

        Args:
            tweets: iterable of rows as produced by `tweet_parser`;
                None is treated as no rows.

        Returns:
            The name of the file that was written.
        """
        if tweets is None:
            tweets = []
        timestamp = int(datetime.datetime.now().timestamp())
        filename = '%s_%s.csv' % (self.username, timestamp)
        print('[+] writing %s' % filename)
        # newline='' lets the csv module control line endings (otherwise blank
        # rows appear on Windows); UTF-8 keeps emoji and non-ASCII text intact
        # regardless of the platform default encoding.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES, delimiter='\t')
            writer.writeheader()
            for tweet in tweets:
                print(tweet['id'], tweet['published_at'])
                writer.writerow(tweet)
        return filename
def main():
    """Command-line entry point: scrape a user's tweets and export them.

    Reads `--username` / `-u` (default 'elonmusk') and `--limit` / `-l`
    (default 100), scrapes with TwitterScraper, writes the result with
    `generate_csv`, then prints the elapsed time and a banner.

    Exits with a usage error on an empty username or a limit below 1, and
    with an error message when no tweet could be scraped.
    """
    print('start')
    s = time.perf_counter()
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--username', '-u', type=str, required=False, help='user to scrape tweets from', default='elonmusk')
    argparser.add_argument('--limit', '-l', type=int, required=False, help='max tweets to scrape', default=100)
    args = argparser.parse_args()
    username = args.username
    limit = args.limit
    # Explicit checks instead of `assert`: asserts are stripped under
    # `python -O` and give the user no explanation when they fail.
    if not username:
        argparser.error('--username must not be empty')
    if limit < 1:
        argparser.error('--limit must be a positive integer')
    twitter_scraper = TwitterScraper(username)
    tweets = twitter_scraper.iter_tweets(limit=limit)
    if not tweets:
        raise SystemExit('[!] no tweets could be scraped for %s' % username)
    twitter_scraper.generate_csv(tweets)
    print('elapsed %s' % (time.perf_counter()-s))
    # Raw string: the banner is full of backslashes that are not valid escape
    # sequences; the printed text is unchanged.
    print(r'''~~ success
_ _ _
| | | | | |
| | ___ | |__ ___| |_ __ __
| |/ _ \| '_ \/ __| __/| '__|
| | (_) | |_) \__ \ |_ | |
|_|\___/|_.__/|___/\__||_|
''')


if __name__ == '__main__':
    main()
@cr0un
Copy link

cr0un commented Jun 2, 2023

Hi! Thanks for the great implementation of getting tweets. I noticed one problem: when a tweet is age-restricted (18+), legacy is None and we get the item_result shown below. I suggest skipping these tweets so the parser doesn't stop with an error. I have implemented this for myself so that such tweets are simply skipped.

item_result:
{'__typename': 'TweetTombstone', 'tombstone': {'__typename': 'TextTombstone', 'text': {'rtl': False, 'text': 'Age-restricted adult content. This content might not be appropriate for people under 18 years old. To view this media, you’ll need to log in to Twitter. Learn more', 'entities': [{'fromIndex': 134, 'toIndex': 140, 'ref': {'type': 'TimelineUrl', 'url': 'https://twitter.com', 'urlType': 'ExternalUrl'}}, {'fromIndex': 153, 'toIndex': 163, 'ref': {'type': 'TimelineUrl', 'url': 'https://help.twitter.com/rules-and-policies/notices-on-twitter', 'urlType': 'ExternalUrl'}}]}}}

@jorditg
Copy link

jorditg commented Jun 23, 2023

Thanks for the contribution. Unfortunately, not working anymore.

@Fa5g
Copy link

Fa5g commented Jul 16, 2023

Change the line 146 for this one (for example):

if legacy.get('full_text')

and try again.

@pascal182
Copy link

Does anyone know how to get the last 20 tweets? The server returns tweets with random dates.

@cr0un
Copy link

cr0un commented Jul 28, 2023

Does anyone know how to get the last 20 tweets? The server brings a random dates

That's how I changed this parameter, the default is 10 I think:
argparser.add_argument('--limit', '-l', type=int, required=False, help='max tweets to scrape', default=10)
args = argparser.parse_args()
limit = args.limit

@pascal182
Copy link

Sorry if I explained wrong. The point is that regardless of the amount of data I request, the server sends a random sample of tweets. If there is any parameter or data that, when added to the request, the server sends the last tweets of the user

@cr0un
Copy link

cr0un commented Jul 28, 2023

Sorry if I explained wrong. The point is that regardless of the amount of data I request, the server sends a random sample of tweets. If there is any parameter or data that, when added to the request, the server sends the last tweets of the user

I'm not ready to give a definitive answer here. I use this library to check word-filtered tweets from several accounts once an hour. If a tweet contains a word or phrase, I process the tweet for further action.

@ebrown2017
Copy link

ebrown2017 commented Jul 29, 2023

Script still working?

python3 twitter_scraper.py --username elonmusk --limit 10 start [+] scraping: elonmusk [#] tweets scraped: 0 Traceback (most recent call last): File "twitter_scraper.py", line 207, in <module> main() File "twitter_scraper.py", line 193, in main assert tweets AssertionError

Seems like it is unable to parse tweets... anyone else facing this?

@cr0un
Copy link

cr0un commented Jul 29, 2023

Script still working?

python3 twitter_scraper.py --username elonmusk --limit 10 start [+] scraping: elonmusk [#] tweets scraped: 0 Traceback (most recent call last): File "twitter_scraper.py", line 207, in <module> main() File "twitter_scraper.py", line 193, in main assert tweets AssertionError

Seems like it is unable to parse tweets... anyone else facing this?

[2023-07-29 10:05:55.110288] Checking for new tweets...
[+] scraping: binance
[#] tweets scraped: 10
[2023-07-29 10:05:57.545959] Found 10 new tweets:

--

[2023-07-29 10:10:02.888401] Checking for new tweets...
[+] scraping: elonmusk
[#] tweets scraped: 10
[2023-07-29 10:10:05.289430] Found 8 new tweets:

--
I don't think you can collect tweets from every user because of their privacy settings, probably or other status of their profile, I won't say for sure.

@ebrown2017
Copy link

I’m confused…

I tried to scrape Elon’s account and it failed for me, but it works for you?

I will try rerunning it later. Perhaps there was some twitter side issue last night.

@MrDebugger
Copy link

@ebrown2017
Twitter made the tweets login based some days ago but now it is back to publicly accessible. You can check now, it should work.

@yan38000
Copy link

hello, I'd like to know how to retrieve the most recent tweets instead. Is it necessary to modify the Header, GET_USER_URL or GET_TWEETS_URL? Thanks for your help.

@Allen-Taylor
Copy link

If you are using a guest token it returns the "ProfileHighlights". You have to log in via the flow token process to get chronologically ordered tweets.

@AshleyMAIN
Copy link

Hello, I tried this code today but it doesn't work. Is this code still valid ?

@ahenaor
Copy link

ahenaor commented Nov 11, 2023

If you are using a guest token it returns the "ProfileHighlights". You have to log in via the flow token process to get chronologically ordered tweets.

Is there any code to use this code with login?

@gitgab22
Copy link

I am not really code with python but could someone help me ?
I have : assert tweets
AssertionError

It seems that the list tweets is empty after calling iter_tweets.

@Allen-Taylor
Copy link

I am not really code with python but could someone help me ? I have : assert tweets AssertionError

it seems that the list tweets is empty afer calling iter_tweets.

I can fix the code for you: allenmanbear@gmail.com

@pclavell
Copy link

I get the following error when trying to run it. Does anyone know if the code is still working?

Traceback (most recent call last):
  File "twitter_scraper.py", line 233, in <module>
    main()
  File "twitter_scraper.py", line 217, in main
    twitter_scraper = TwitterScraper(username)
  File "twitter_scraper.py", line 54, in __init__
    assert self.gt
AssertionError

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment