Skip to content

Instantly share code, notes, and snippets.

@zPrototype
Last active September 10, 2022 21:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zPrototype/29add14089b54f72e9f5063bdda4d2ec to your computer and use it in GitHub Desktop.
Save zPrototype/29add14089b54f72e9f5063bdda4d2ec to your computer and use it in GitHub Desktop.

Build with: docker build -t twitter_scraper . Then run with: docker run --rm -it -v $(pwd):/data/ twitter_scraper -ck <consumer-key> -cs <consumer-secret> -ak <access-token-key> -as <access-secret> -s NAFO -n 200 -o test.json

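-n is the number of tweets you want to scrape (tweets are requested in batches of up to 100, so the number is rounded up to the next multiple of 100)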

-s is the search term

-o is optional; use it to also write the results to a JSON output file

# Base image: the python:3.9.9-slim-bullseye tag.
FROM python:3.9.9-slim-bullseye
# Single layer that refreshes and upgrades the system packages, installs git,
# clones the gist containing twitter_scraper.py into /opt/, and installs the
# Python packages the script imports.
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y && \
git -C /opt/ clone https://gist.github.com/29add14089b54f72e9f5063bdda4d2ec.git && \
pip3 install python-twitter python-dateutil typing
# The script runs from /data, so a relative -o path is written there; mount a
# host directory at /data (e.g. -v $(pwd):/data/) to keep the output file.
WORKDIR /data
# Exec-form entrypoint: arguments passed to `docker run <image> ...` are
# appended to this command and reach the script's argument parser.
ENTRYPOINT ["python3", "/opt/29add14089b54f72e9f5063bdda4d2ec/twitter_scraper.py"]
import math
import twitter
import argparse
import json
from dateutil.parser import parse
from typing import Optional
def get_arguments():
    """Parse the scraper's command-line options.

    Returns:
        An ``argparse.Namespace`` with ``consumer_key``, ``consumer_secret``,
        ``access_token_key``, ``access_secret``, ``search_term``, ``output``
        (``None`` when ``-o`` is not given) and ``api_rounds``: the number of
        100-tweet search requests needed to cover the requested tweet count.
        The intermediate ``number_of_tweets`` attribute is removed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-ck", "--consumer-key", help="Api key", required=True, type=str)
    parser.add_argument("-cs", "--consumer-secret", help="Api key secret", required=True, type=str)
    parser.add_argument("-ak", "--access-token-key", help="Access token", required=True, type=str)
    parser.add_argument("-as", "--access-secret", help="Access token secret", required=True, type=str)
    parser.add_argument("-s", "--search-term", help="What you want to search for", required=True, type=str)
    parser.add_argument(
        "-n", "--number-of-tweets", help="No. of tweets you want",
        # The default must be a string: argparse only runs ``type`` on string
        # defaults. An int default of 100 would skip the conversion and be
        # taken as 100 request rounds instead of 100 tweets (1 round).
        default="100",
        # Convert a tweet count into the number of 100-tweet requests.
        type=lambda x: math.ceil(int(x) / 100),
    )
    parser.add_argument("-o", "--output", help="Write output to a JSON file", required=False)
    args = parser.parse_args()
    # After the conversion above the value is a round count, not a tweet
    # count, so expose it under a name that says so.
    args.api_rounds = args.number_of_tweets
    del args.number_of_tweets
    return args
# Accumulator for the scraped tweets: parse_output() appends one dict per
# tweet and main() serialises the whole list at the end.
RES = list()

# Command-line options, parsed once when the module is loaded.
ARGS = get_arguments()

# python-twitter client authenticated with the credentials from the command
# line. tweet_mode is set to "extended"; parse_output() reads each status's
# "full_text" field.
API = twitter.Api(
    tweet_mode="extended",
    access_token_secret=ARGS.access_secret,
    access_token_key=ARGS.access_token_key,
    consumer_secret=ARGS.consumer_secret,
    consumer_key=ARGS.consumer_key,
)
def do_api_query(query: str, max_id: Optional[int] = None) -> list[twitter.Status]:
    """Run a single search request through the module-level API client.

    Args:
        query: Search term passed to the API as ``term``.
        max_id: Forwarded unchanged as the search's ``max_id``; ``None`` on
            the first request.

    Returns:
        Whatever ``API.GetSearch`` returns for a "recent" search with a page
        size of 100.
    """
    search_options = {
        "term": query,
        "count": 100,
        "result_type": "recent",
        "max_id": max_id,
    }
    return API.GetSearch(**search_options)
def parse_output(query_results: list[twitter.Status]) -> int:
    """Flatten each status into a plain dict and append it to ``RES``.

    Every stored dict carries the tweet's id, its creation time formatted as
    ``YYYY-MM-DD HH:MM:SS``, the author's name, the full text and the list of
    hashtag texts.

    Args:
        query_results: Statuses from one search request.

    Returns:
        The ``"id"`` of the last entry in ``RES`` after appending.

    Raises:
        IndexError: If ``RES`` is still empty afterwards, i.e. nothing has
            been stored by this call or any earlier one.
    """
    for status in query_results:
        fields = status.AsDict()
        record = {}
        record["id"] = fields["id"]
        created_at = parse(fields["created_at"])
        record["timestamp"] = created_at.strftime("%Y-%m-%d %H:%M:%S")
        record["user"] = fields["user"]["name"]
        record["tweet"] = fields["full_text"]
        record["hashtags"] = [tag["text"] for tag in fields["hashtags"]]
        RES.append(record)
    newest_entry = RES[-1]
    return newest_entry["id"]
def main():
    """Fetch up to ``ARGS.api_rounds`` pages of search results and emit JSON.

    Each round asks for tweets older than the oldest one stored so far. The
    collected tweets in ``RES`` are printed as indented JSON and, when an
    output path was given, also written to that file.
    """
    max_id = None
    for _ in range(ARGS.api_rounds):
        results = do_api_query(ARGS.search_term, max_id=max_id)
        if not results:
            # Nothing (more) to fetch. Stopping here avoids indexing an empty
            # RES on the first round and avoids repeating the same query on
            # later rounds.
            break
        # The search's max_id bound is inclusive, so reusing the oldest id
        # as-is would return that tweet again and store it twice. Ask for
        # strictly older tweets instead.
        max_id = parse_output(results) - 1
    result_json = json.dumps(RES, indent=4, ensure_ascii=False)
    print(result_json)
    if ARGS.output:
        # ensure_ascii=False leaves non-ASCII characters in the text, so the
        # file encoding is pinned instead of depending on the locale.
        with open(ARGS.output, "w", encoding="utf-8") as handle:
            handle.write(result_json)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment