Sample script for batch processing data and uploading the output to S3. This snippet reads a CSV list of IPs, queries the WHOIS API for each one, and uploads the JSON responses to an S3 bucket.
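A minimal usage sketch (the script name whois_batch.py and the CSV name below are placeholders, not part of the gist); it assumes the WHOIS_API_KEY and BUCKET_NAME environment variables are set and that boto3 can find AWS credentials:

export WHOIS_API_KEY=your-api-key
export BUCKET_NAME=whois-output-bucket
python whois_batch.py --csv ips.csv --limit 100 --offset 0

The --limit and --offset flags are optional; they map to the nrows and skiprows arguments of pandas.read_csv, so a large CSV can be processed in chunks across several runs.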
import argparse
import boto3
import json
import logging
import os
import pandas
import requests
import time
import random
# https://www.whoisxmlapi.com/whoisserver/WhoisService?apiKey=KEY&domainName=DOMAIN
logging.basicConfig(filename='whois.log', level=logging.DEBUG)
logging.info("***starting whois api script***")
#flags
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--csv", required=True, help="csv to parse ips from")
parser.add_argument("-l", "--limit", type=int, required=False, help="limit csv rows to get")
parser.add_argument("-o", "--offset", type=int, required=False, help="offset for csv")
flags = vars(parser.parse_args())
csv_file = flags["csv"]
#if these values are None, they are ignored by pandas
limit = flags["limit"]
offset = flags["offset"]
ips = []
logging.info("getting %s records from %s file", limit, csv_file)
#get a batch of ips for processing
f = pandas.read_csv(csv_file, nrows=limit, skiprows=offset)
for i, row in f.iterrows():
    domain = "unknown" if pandas.isnull(row.iloc[0]) else row.iloc[0]
    ip = row.iloc[1]
    if pandas.isnull(ip):
        #skip rows with a missing ip
        logging.warning("skipping row %s: missing ip", i)
        continue
    else:
        ips.append({"ip": ip, "domain": domain})
#create the S3 bucket if it does not already exist
whois_bucket = os.environ["BUCKET_NAME"]
s3 = boto3.client('s3')
ls_buckets = s3.list_buckets()
bucket_names = []
for bucket in ls_buckets["Buckets"]:
    bucket_names.append(bucket["Name"])
if whois_bucket not in bucket_names:
    logging.info("Creating bucket %s", whois_bucket)
    s3.create_bucket(Bucket=whois_bucket)
else:
    logging.info("Bucket already exists, skipping creation")
#make API requests
base_url = "https://www.whoisxmlapi.com/whoisserver/WhoisService"
api_key = os.environ["WHOIS_API_KEY"]
for ip in ips:
    query = {"apiKey": api_key, "domainName": ip["ip"], "outputFormat": "JSON"}
    req = requests.get(base_url, params=query)
    #exponential backoff for throttled requests
    if req.status_code == 503:
        i = 0
        while i < 3:
            time.sleep(2 ** i + random.random())
            req = requests.get(base_url, params=query)
            if req.status_code == 503:
                i += 1
            else:
                break
    if req.status_code == 503:
        logging.warning("still throttled after retries, skipping %s", ip["ip"])
        continue
    #re-serialize the response as compact JSON
    json_data = json.dumps(json.loads(req.text), separators=(',', ':'))
    #prepare data for file name, e.g. 1-2-3-4-example-com.json
    parsed_domain = ip["domain"].split(".")
    parsed_domain = "-".join(parsed_domain)
    parsed_ip = ip["ip"].split(".")
    parsed_ip = "-".join(parsed_ip)
    file_name = "%s-%s.json" % (parsed_ip, parsed_domain)
    logging.info("Putting file in s3 %s", file_name)
    s3.put_object(Body=json_data, Bucket=whois_bucket, Key=file_name)
    logging.info("Last ip processed: %s, domain %s", ip["ip"], ip["domain"])
    time.sleep(.5)