Skip to content

Instantly share code, notes, and snippets.

@vool
Last active January 18, 2020 20:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vool/b59ee265354624d667572de42df3b821 to your computer and use it in GitHub Desktop.
Save vool/b59ee265354624d667572de42df3b821 to your computer and use it in GitHub Desktop.
Script to fetch historic tweets containing a given hashtag and insert them into a database

Get old tweets into a db

Script to fetch historic tweets containing a given hashtag and insert them into a database

Setup

Edit settings in config.json

{
"db_host" : "",
"db_user" : "",
"db_passwd" : "",
"db_name" : "",
"db_table" : "",
"hashtag" : ""
}
#!/usr/bin/env python
"""
Fetch historic tweets containing a given hashtag and insert them into a database
Settings in config.json
"""
from twitterscraper import query_tweets
import mysql.connector as mysql
from datetime import datetime
import json
# load credentials
try:
with open('config.json') as f:
config = json.load(f)
except Exception as e:
print(e)
print('####################################################\n#')
print('# Fetch historic tweets for #{} !\n#'.format(config['hashtag']))
print('####################################################')
# create db connection
try:
db = mysql.connect(
host=config['db_host'],
user=config['db_user'],
passwd=config['db_passwd'],
database=config['db_name']
)
except mysql.Error:
print("Error connecting to Database")
quit()
# create cursor instance
cursor = db.cursor()
# create table if it dosen't exist
cursor.execute(f"""CREATE TABLE IF NOT EXISTS `{config['db_table']}` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`user_id` bigint(20) NOT NULL,
`tweet_id` bigint(20) NOT NULL,
`tweet` mediumtext NOT NULL,
`timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1 AUTO_INCREMENT=3 ;
""")
db.commit()
# Get tweets
tweets = query_tweets(config['hashtag'], 100000)
# todo move limit to config
print("Scraped {} tweets containing #{}\n".format(
len(tweets), config['hashtag']))
# define the query
query = 'INSERT INTO {} (id, user_id, tweet_id, tweet, timestamp) VALUES (NULL, %s, %s, %s, %s)'.format(
config['db_table'])
for t in tweets:
# only save if the string is a hashtag not just a text string
if('#{}'.format(config['hashtag']) in t.text.lower()):
print('Inserting tweet {} from {} ( @{} )'.format(
t.user_id, t.tweet_id, t.username, t.screen_name))
# convert the timestamp in order to allow it to be JSON encoded
t.timestamp = datetime.strftime(t.timestamp, '%Y-%m-%d %H:%M:%S')
values = (t.user_id, t.tweet_id, json.dumps(t.__dict__), t.timestamp)
# execute the query
cursor.execute(query, values)
# commit to db
db.commit()
print(cursor.rowcount, "record inserted")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment