zPrototype/Dockerfile

## README.md

      
    Raw
  

              README.md
            
          
    Build with: docker build -t test_scraper .
Then run with: docker run --rm -it -v $(pwd):/data/ test_scraper -ck <consumer-key> -cs <consumer-secret> -ak <access-token-key> -as <access-secret> -s NAFO -n 200 -o test.json
-n is for how many tweets you want to scrape (tweets are fetched 100 per request, so the number is rounded up to the next multiple of 100)
-s is the search term
-o is optional if you want to write the results to an output file

  
## Dockerfile
FROM python:3.9.9-slim-bullseye
# Fetch the scraper gist and install its Python dependencies in a single layer.
# The `typing` backport is not installed: `typing` is part of the standard
# library on Python 3.9. The apt package lists and pip's download cache are
# dropped at the end so they do not stay in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    git -C /opt/ clone https://gist.github.com/29add14089b54f72e9f5063bdda4d2ec.git && \
    pip3 install --no-cache-dir python-twitter python-dateutil && \
    rm -rf /var/lib/apt/lists/*
# Relative output paths given to the script resolve against this directory.
WORKDIR /data
ENTRYPOINT ["python3", "/opt/29add14089b54f72e9f5063bdda4d2ec/twitter_scraper.py"]

## twitter_scraper.py
import math
import twitter
import argparse
import json
from dateutil.parser import parse
from typing import Optional


def get_arguments(argv: Optional[list[str]] = None):
    """Parse the command-line options for the scraper.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``. The ``-n/--number-of-tweets`` value
        is exposed as ``api_rounds`` (the requested count divided by 100 and
        rounded up); the ``number_of_tweets`` attribute itself is removed.
    """

    def tweets_to_rounds(value: str) -> int:
        # Convert a tweet count into a number of 100-tweet batches, rounding up.
        return math.ceil(int(value) / 100)

    parser = argparse.ArgumentParser()
    parser.add_argument("-ck", "--consumer-key", help="Api key", required=True, type=str)
    parser.add_argument("-cs", "--consumer-secret", help="Api key secret", required=True, type=str)
    parser.add_argument("-ak", "--access-token-key", help="Access token", required=True, type=str)
    parser.add_argument("-as", "--access-secret", help="Access token secret", required=True, type=str)
    parser.add_argument("-s", "--search-term", help="What you want to search for", required=True, type=str)
    # The default must be a string: argparse only runs ``type`` on string
    # defaults, so an int 100 would be used verbatim as 100 rounds.
    parser.add_argument("-n", "--number-of-tweets", help="No. of tweets you want", default="100",
                        type=tweets_to_rounds)
    parser.add_argument("-o", "--output", help="Write output to a JSON file", required=False)

    args = parser.parse_args(argv)
    args.api_rounds = args.number_of_tweets
    del args.number_of_tweets
    return args


# Module-level state used by the functions in this file.
# Command-line options are parsed as soon as the module is loaded.
ARGS = get_arguments()

# Accumulates one dict per tweet processed.
RES = list()

# Twitter client built from the credentials given on the command line.
API = twitter.Api(
    tweet_mode="extended",
    access_token_secret=ARGS.access_secret,
    access_token_key=ARGS.access_token_key,
    consumer_secret=ARGS.consumer_secret,
    consumer_key=ARGS.consumer_key,
)


def do_api_query(query: str, max_id: Optional[int] = None) -> list[twitter.Status]:
    """Issue a single search request through the module-level ``API`` client.

    Args:
        query: Search term, passed to ``GetSearch`` as ``term``.
        max_id: Forwarded unchanged as ``GetSearch``'s ``max_id``.

    Returns:
        Whatever ``API.GetSearch`` returns for a request of 100 results with
        ``result_type="recent"``.
    """
    search_options = {
        "term": query,
        "count": 100,
        "result_type": "recent",
        "max_id": max_id,
    }
    statuses = API.GetSearch(**search_options)
    return statuses


def parse_output(query_results: list[twitter.Status]) -> Optional[int]:
    """Append a trimmed-down record of every status to the module-level ``RES``.

    Each status is converted with ``AsDict()`` and reduced to its id, a
    ``YYYY-MM-DD HH:MM:SS`` timestamp, the user's name, the full text and the
    list of hashtag texts. A ``KeyError`` propagates if a status dict lacks
    one of the keys read here.

    Args:
        query_results: Statuses returned by one search request.

    Returns:
        The ``id`` of the last record in ``RES`` after appending, or ``None``
        when ``RES`` is still empty (nothing has been collected so far).
    """
    RES.extend(
        {
            "id": status["id"],
            "timestamp": parse(status["created_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "user": status["user"]["name"],
            "tweet": status["full_text"],
            "hashtags": [tag["text"] for tag in status["hashtags"]],
        }
        for status in (item.AsDict() for item in query_results)
    )

    # An empty search result on the very first request leaves RES empty;
    # indexing RES[-1] would raise IndexError in that case.
    if not RES:
        return None
    return RES[-1]["id"]


def main():
    """Collect tweets for the search term, print them as JSON and optionally save them.

    Issues up to ``ARGS.api_rounds`` search requests, paging backwards via
    ``max_id``, then prints the accumulated ``RES`` list as indented JSON.
    When ``ARGS.output`` is set, the same JSON text is written to that file.
    """
    least_recent = None
    for _ in range(ARGS.api_rounds):
        results = do_api_query(ARGS.search_term, max_id=least_recent)
        if not results:
            # No (more) matches: stop instead of issuing further requests.
            break
        # ``max_id`` is inclusive in the Twitter search API, so continue one
        # below the last id seen; otherwise the next page repeats that tweet.
        least_recent = parse_output(results) - 1

    result_json = json.dumps(RES, indent=4, ensure_ascii=False)
    print(result_json)

    if ARGS.output:
        # ensure_ascii=False keeps non-ASCII characters verbatim, so the file
        # is written as UTF-8 explicitly rather than in the locale's encoding.
        with open(ARGS.output, "w", encoding="utf-8") as handle:
            handle.write(result_json)


if __name__ == '__main__':
    main()
	FROM python:3.9.9-slim-bullseye
	# Fetch the scraper gist and install its Python dependencies in a single layer.
	# The `typing` backport is not installed: `typing` is part of the standard
	# library on Python 3.9. The apt package lists and pip's download cache are
	# dropped at the end so they do not stay in the image.
	RUN apt-get update && \
	    apt-get upgrade -y && \
	    apt-get install git -y && \
	    git -C /opt/ clone https://gist.github.com/29add14089b54f72e9f5063bdda4d2ec.git && \
	    pip3 install --no-cache-dir python-twitter python-dateutil && \
	    rm -rf /var/lib/apt/lists/*
	# Relative output paths given to the script resolve against this directory.
	WORKDIR /data
	ENTRYPOINT ["python3", "/opt/29add14089b54f72e9f5063bdda4d2ec/twitter_scraper.py"]
	import math
	import twitter
	import argparse
	import json
	from dateutil.parser import parse
	from typing import Optional


	def get_arguments(argv: Optional[list[str]] = None):
	    """Parse the command-line options for the scraper.

	    Args:
	        argv: Argument strings to parse. ``None`` (the default) lets argparse
	            read ``sys.argv[1:]``.

	    Returns:
	        The parsed ``argparse.Namespace``. The ``-n/--number-of-tweets`` value
	        is exposed as ``api_rounds`` (the requested count divided by 100 and
	        rounded up); the ``number_of_tweets`` attribute itself is removed.
	    """

	    def tweets_to_rounds(value: str) -> int:
	        # Convert a tweet count into a number of 100-tweet batches, rounding up.
	        return math.ceil(int(value) / 100)

	    parser = argparse.ArgumentParser()
	    parser.add_argument("-ck", "--consumer-key", help="Api key", required=True, type=str)
	    parser.add_argument("-cs", "--consumer-secret", help="Api key secret", required=True, type=str)
	    parser.add_argument("-ak", "--access-token-key", help="Access token", required=True, type=str)
	    parser.add_argument("-as", "--access-secret", help="Access token secret", required=True, type=str)
	    parser.add_argument("-s", "--search-term", help="What you want to search for", required=True, type=str)
	    # The default must be a string: argparse only runs ``type`` on string
	    # defaults, so an int 100 would be used verbatim as 100 rounds.
	    parser.add_argument("-n", "--number-of-tweets", help="No. of tweets you want", default="100",
	                        type=tweets_to_rounds)
	    parser.add_argument("-o", "--output", help="Write output to a JSON file", required=False)

	    args = parser.parse_args(argv)
	    args.api_rounds = args.number_of_tweets
	    del args.number_of_tweets
	    return args


	# Module-level state used by the functions in this file.
	# Command-line options are parsed as soon as the module is loaded.
	ARGS = get_arguments()

	# Accumulates one dict per tweet processed.
	RES = list()

	# Twitter client built from the credentials given on the command line.
	API = twitter.Api(
	    tweet_mode="extended",
	    access_token_secret=ARGS.access_secret,
	    access_token_key=ARGS.access_token_key,
	    consumer_secret=ARGS.consumer_secret,
	    consumer_key=ARGS.consumer_key,
	)


	def do_api_query(query: str, max_id: Optional[int] = None) -> list[twitter.Status]:
	    """Issue a single search request through the module-level ``API`` client.

	    Args:
	        query: Search term, passed to ``GetSearch`` as ``term``.
	        max_id: Forwarded unchanged as ``GetSearch``'s ``max_id``.

	    Returns:
	        Whatever ``API.GetSearch`` returns for a request of 100 results with
	        ``result_type="recent"``.
	    """
	    search_options = {
	        "term": query,
	        "count": 100,
	        "result_type": "recent",
	        "max_id": max_id,
	    }
	    statuses = API.GetSearch(**search_options)
	    return statuses


	def parse_output(query_results: list[twitter.Status]) -> Optional[int]:
	    """Append a trimmed-down record of every status to the module-level ``RES``.

	    Each status is converted with ``AsDict()`` and reduced to its id, a
	    ``YYYY-MM-DD HH:MM:SS`` timestamp, the user's name, the full text and the
	    list of hashtag texts. A ``KeyError`` propagates if a status dict lacks
	    one of the keys read here.

	    Args:
	        query_results: Statuses returned by one search request.

	    Returns:
	        The ``id`` of the last record in ``RES`` after appending, or ``None``
	        when ``RES`` is still empty (nothing has been collected so far).
	    """
	    RES.extend(
	        {
	            "id": status["id"],
	            "timestamp": parse(status["created_at"]).strftime("%Y-%m-%d %H:%M:%S"),
	            "user": status["user"]["name"],
	            "tweet": status["full_text"],
	            "hashtags": [tag["text"] for tag in status["hashtags"]],
	        }
	        for status in (item.AsDict() for item in query_results)
	    )

	    # An empty search result on the very first request leaves RES empty;
	    # indexing RES[-1] would raise IndexError in that case.
	    if not RES:
	        return None
	    return RES[-1]["id"]


	def main():
	    """Collect tweets for the search term, print them as JSON and optionally save them.

	    Issues up to ``ARGS.api_rounds`` search requests, paging backwards via
	    ``max_id``, then prints the accumulated ``RES`` list as indented JSON.
	    When ``ARGS.output`` is set, the same JSON text is written to that file.
	    """
	    least_recent = None
	    for _ in range(ARGS.api_rounds):
	        results = do_api_query(ARGS.search_term, max_id=least_recent)
	        if not results:
	            # No (more) matches: stop instead of issuing further requests.
	            break
	        # ``max_id`` is inclusive in the Twitter search API, so continue one
	        # below the last id seen; otherwise the next page repeats that tweet.
	        least_recent = parse_output(results) - 1

	    result_json = json.dumps(RES, indent=4, ensure_ascii=False)
	    print(result_json)

	    if ARGS.output:
	        # ensure_ascii=False keeps non-ASCII characters verbatim, so the file
	        # is written as UTF-8 explicitly rather than in the locale's encoding.
	        with open(ARGS.output, "w", encoding="utf-8") as handle:
	            handle.write(result_json)


	if __name__ == '__main__':
	    main()