|
#!/usr/bin/env python3 |
|
import os |
|
import logging |
|
import time |
|
from twikit import Client |
|
import pandas as pd |
|
|
|
# Enable or disable debug mode (controls verbosity of both log handlers below)
debug = True

# Load credentials from environment variables.
username = os.getenv('XUSERNAME')
password = os.getenv('XPASSWORD')

# Fail fast with a clear message: without this, missing env vars surface later
# as a confusing twikit login error and an output file named 'None_tweets.csv'.
if not username or not password:
    raise SystemExit('XUSERNAME and XPASSWORD environment variables must be set.')

# Initialize parameters for fetching tweets
batch_size = 60     # Number of tweets per batch (NOTE(review): currently unused below — get_tweets paginates on its own)
delay_seconds = 60  # Delay in seconds between batches, to stay under the rate limit
output_file = f'{username}_tweets.csv'
|
|
|
# Setup logging: the root logger feeds a file handler (timestamped records in
# dlx2.log) and a console handler (bare messages). Level follows the debug flag.
log_level = logging.DEBUG if debug else logging.INFO

logger = logging.getLogger()
logger.setLevel(log_level)

# Create file handler — persistent, timestamped record of the run
file_handler = logging.FileHandler('dlx2.log')
file_handler.setLevel(log_level)
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)

# Create console handler — same records, without the timestamp noise
console_handler = logging.StreamHandler()
console_handler.setLevel(log_level)
console_formatter = logging.Formatter('%(message)s')
console_handler.setFormatter(console_formatter)
logger.addHandler(console_handler)

# SECURITY: never log the password itself — a plaintext credential in dlx2.log
# (or on screen) is a leak. Record only whether one was provided.
logger.debug(
    'Environment variables loaded - XUSERNAME: %s, XPASSWORD: %s',
    username,
    'set' if password else 'missing',
)
|
|
|
# Set up the twikit client and authenticate: reuse a saved session from
# cookies.json when one exists, otherwise log in with credentials and save
# the session for subsequent runs.
client = Client('en-US')
logger.debug('Twikit client initialized')

cookies_file = 'cookies.json'
if not os.path.exists(cookies_file):
    # First run (or cookies were deleted): perform a fresh credential login.
    logger.debug(f'Cookies file {cookies_file} does not exist. Logging in.')
    client.login(
        auth_info_1=username,
        password=password,
    )
    # Persist the session so future runs skip the login step.
    client.save_cookies(cookies_file)
    logger.debug(f'Cookies saved to {cookies_file}')
else:
    # Resume the previous session instead of logging in again.
    logger.debug(f'Cookies file {cookies_file} exists. Loading cookies.')
    client.load_cookies(path=cookies_file)

# Resolve our own account to a user object we can pull a timeline from.
user = client.get_user_by_screen_name(username)
logger.debug(f'User object retrieved for username: {username}')
|
|
|
# Seed the output CSV with a header row on first run. Later batches are
# appended with header=False, so the header must exist before the loop starts.
csv_columns = [
    'username', 'user_id', 'tweet_id', 'tweet_date',
    'tweet_likes', 'tweet_views', 'tweet_comments', 'tweet_text',
]
if not os.path.exists(output_file):
    pd.DataFrame(columns=csv_columns).to_csv(output_file, index=False)
    logger.debug(f'Created new CSV file: {output_file}')

# Pull the first page of the account's own timeline.
tweets = user.get_tweets('Tweets')
logger.debug('Initial batch of tweets fetched')
|
|
|
# Page through the timeline, appending each batch to the CSV as it arrives so
# an interruption loses at most the current (unwritten) batch.
while tweets:
    logger.debug(f'Processing batch of {len(tweets)} tweets')
    batch_data = []
    for tweet in tweets:
        # Flatten the tweet object into the flat CSV schema created above.
        tweet_data = {
            'username': tweet.user.name,
            'user_id': tweet.user.id,
            'tweet_id': tweet.id,
            'tweet_date': tweet.created_at,
            'tweet_likes': tweet.favorite_count,
            'tweet_views': tweet.view_count,
            'tweet_comments': tweet.reply_count,
            'tweet_text': tweet.full_text,
        }
        logger.debug(f'Processed tweet data: {tweet_data}')
        batch_data.append(tweet_data)

    # Append without a header row — the header was written at file creation.
    df_batch = pd.DataFrame(batch_data)
    df_batch.to_csv(output_file, mode='a', header=False, index=False)
    logger.debug(f'Appended batch of {len(batch_data)} tweets to {output_file}')

    try:
        tweets = tweets.next()
        logger.debug('Fetched next batch of tweets')

        # Introduce a delay to avoid hitting the rate limit
        logger.debug(f'Waiting for {delay_seconds} seconds before fetching the next batch...')
        time.sleep(delay_seconds)

    except Exception as e:
        # Deliberate best-effort stop: pagination exhaustion surfaces here.
        # NOTE(review): a genuine network/auth failure also lands here and is
        # only visible at DEBUG level — consider logger.warning if that matters.
        logger.debug(f"No more tweets to fetch: {e}")
        break

# The console StreamHandler already echoes logger.info to the terminal, so a
# separate print() here showed the final message twice; log it exactly once
# (it still reaches both the console and dlx2.log).
logger.info(f'Successfully scraped tweets for {username}. Check the CSV file for results.')