Sample script for batch processing data and uploading the output to S3. The snippet reads a CSV list of IPs, queries the WHOIS API for each one, and uploads the JSON results to an S3 bucket.
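Example invocation, assuming the script is saved as whois_batch.py (the file name is illustrative) and both environment variables the script reads are set:

    BUCKET_NAME=whois-results WHOIS_API_KEY=<your-key> python whois_batch.py --csv ips.csv --limit 100 --offset 0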
import argparse
import boto3
import json
import logging
import os
import pandas
import requests
import time
import random
# https://www.whoisxmlapi.com/whoisserver/WhoisService?apiKey=KEY&domainName=DOMAIN
logging.basicConfig(filename='whois.log', level=logging.DEBUG)
logging.info("***starting whois api script***")
# command-line flags
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--csv", required=True, help="csv to parse ips from")
parser.add_argument("-l", "--limit", type=int, required=False, help="limit csv rows to get")
parser.add_argument("-o", "--offset", type=int, required=False, help="offset for csv")
flags = vars(parser.parse_args())
csv_file = flags["csv"]
# if these values are None, they are ignored by pandas
limit = flags["limit"]
offset = flags["offset"]
ips = []
logging.info("getting %s records from %s file", limit, csv_file)
# get a batch of IPs for processing
# note: when offset is set, skiprows also skips the CSV header row
f = pandas.read_csv(csv_file, nrows=limit, skiprows=offset)
for i, row in f.iterrows():
    # column 0 is the domain, column 1 is the IP
    domain = "unknown" if pandas.isnull(row.iloc[0]) else row.iloc[0]
    ip = row.iloc[1]
    if pandas.isnull(ip):
        logging.warning("skipping row %s: no ip", i)
        continue
    ips.append({"ip": ip, "domain": domain})
# create the S3 bucket if it does not already exist
whois_bucket = os.environ["BUCKET_NAME"]
s3 = boto3.client('s3')
ls_buckets = s3.list_buckets()
bucket_names = [bucket["Name"] for bucket in ls_buckets["Buckets"]]
if whois_bucket not in bucket_names:
    logging.info("Creating bucket %s", whois_bucket)
    s3.create_bucket(Bucket=whois_bucket)
else:
    logging.info("Bucket already exists, skipping creation")
# make API requests
base_url = "https://www.whoisxmlapi.com/whoisserver/WhoisService"
api_key = os.environ["WHOIS_API_KEY"]
for ip in ips:
    query = {"apiKey": api_key, "domainName": ip["ip"], "outputFormat": "JSON"}
    req = requests.get(base_url, params=query)
    # exponential backoff for throttled requests: sleep 2**retries seconds plus
    # jitter, retrying up to three times while the API keeps returning 503
    retries = 0
    while req.status_code == 503 and retries < 3:
        time.sleep(2 ** retries + random.random())
        req = requests.get(base_url, params=query)
        retries += 1
    if req.status_code == 503:
        logging.warning("giving up on %s after %s retries", ip["ip"], retries)
        continue
    # minify the JSON response before upload
    json_data = json.dumps(req.json(), separators=(',', ':'))
    # build a file name like 93-184-216-34-example-com.json
    parsed_domain = "-".join(ip["domain"].split("."))
    parsed_ip = "-".join(ip["ip"].split("."))
    file_name = "%s-%s.json" % (parsed_ip, parsed_domain)
    logging.info("Putting file in s3 %s", file_name)
    s3.put_object(Body=json_data, Bucket=whois_bucket, Key=file_name)
    logging.info("Last ip processed: %s, domain %s", ip["ip"], ip["domain"])
    time.sleep(.5)
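Once the script has run, one quick way to confirm the uploads is to list the bucket and fetch an object back. A minimal sketch using the same boto3 client calls, assuming BUCKET_NAME is still set in the environment:

import json
import os

import boto3

s3 = boto3.client("s3")
bucket = os.environ["BUCKET_NAME"]

# list the uploaded result files
listing = s3.list_objects_v2(Bucket=bucket)
for obj in listing.get("Contents", []):
    print(obj["Key"])

# fetch one result back and parse the minified JSON
if listing.get("Contents"):
    first_key = listing["Contents"][0]["Key"]
    body = s3.get_object(Bucket=bucket, Key=first_key)["Body"].read()
    record = json.loads(body)
    print(record.keys())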