Skip to content

Instantly share code, notes, and snippets.

@beefy
Last active May 4, 2016 18:29
Show Gist options
  • Save beefy/5eac36342e7c2da582b3 to your computer and use it in GitHub Desktop.
Save beefy/5eac36342e7c2da582b3 to your computer and use it in GitHub Desktop.
A web crawler/scraper for the Instagram API, never perfected/finished
__author__ = 'Nathaniel'
import requests
import mysql.connector
import time
import unicodedata
# region sql stuff
def sql_filter_param(text):
    """Return *text* reduced to ASCII with SQL-sensitive characters blanked.

    The result is meant to be spliced between double quotes in a SQL
    statement assembled by string concatenation.

    Args:
        text: Unicode text to sanitise.

    Returns:
        A text string containing only ASCII characters in which each
        ``'``, ``"``, ``,``, ``;`` and backslash has been replaced by a
        single space.

    NOTE(review): this is a character blacklist, not real escaping; a
    parameterised query would be the robust fix.
    """
    # NFKD splits accented characters into base letter + combining mark, so
    # the ASCII round-trip keeps the base letter and drops everything else.
    # Decoding back to text matters: on Python 3 the encoded value is bytes
    # and the replacements below would raise TypeError.
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # The backslash is included because it could escape the closing quote of
    # the literal this value is embedded in.
    for unsafe in ("'", '"', ",", ";", "\\"):
        ascii_text = ascii_text.replace(unsafe, " ")
    return ascii_text
def sql_call(sql):
    """Execute one or more SQL statements against the ``leads`` database.

    A fresh connection is opened for every call. Statements are run with
    ``multi=True``; one row is fetched from each result and discarded. A
    ``mysql.connector.Error`` raised while executing is printed rather than
    propagated, and the transaction is committed either way.

    Args:
        sql: SQL text to execute. Empty or whitespace-only text is a no-op.
    """
    if not sql or not sql.strip():
        # Nothing to run; avoid opening a connection for an empty batch.
        return

    # NOTE(review): the database credentials are hard-coded in source; they
    # should be moved to configuration or the environment.
    cnx = mysql.connector.connect(user='admin', password='$tTX5&3@wWw$',
                                  database='leads')
    try:
        cursor = cnx.cursor()
        try:
            try:
                for _result in cursor.execute(sql, multi=True):
                    cursor.fetchone()
            except mysql.connector.Error as err:
                print("SQL ERR: {}".format(err))
            cnx.commit()
        finally:
            cursor.close()
    finally:
        # close() and disconnect() are documented as synonyms, so a single
        # call is enough; the finally blocks make sure the cursor and the
        # connection are released even when an unexpected exception escapes.
        cnx.close()
# endregion
# region print percent complete
def print_percent_complete(lat_iterations, lng_iterations, lat_iterations_max, lng_iterations_max):
    """Print how far a lat/lng grid scan has progressed.

    Args:
        lat_iterations: Index of the current latitude row.
        lng_iterations: Index of the current longitude column in that row.
        lat_iterations_max: Number of latitude rows in the grid.
        lng_iterations_max: Number of longitude columns per row.
    """
    # Each latitude row holds lng_iterations_max cells, so the row index has
    # to be scaled by the row length (the original scaled it by the row
    # count, which gave a wrong position on non-square grids).
    current = lat_iterations * lng_iterations_max + lng_iterations
    total = lat_iterations_max * lng_iterations_max
    print("iteration num: " + str(current))
    print("out of " + str(total) + " max")
    print("lat: " + str(lat_iterations))
    print("lng: " + str(lng_iterations))
# endregion
# region define server connection and api terms
# cnx = mysql.connector.connect(user='admin', password='$tTX5&3@wWw$',
# database='leads');
# cursor = cnx.cursor();
#sql_call("SET NAMES ""utf8""");
#payload = {'key1': 'value1', 'key2': 'value2'}
#r = requests.get("http://httpbin.org/get", params=payload)
#url to get code:
# https://api.instagram.com/oauth/authorize/?client_id=CLIENT-ID&redirect_uri=REDIRECT-URI&response_type=code
#then plug code into postman to get access token
# Instagram API credentials used by the media-search requests below.
# NOTE(review): these are live-looking secrets committed in source; they
# should be rotated and loaded from configuration or the environment.
access_token = "2080310042.21443eb.b17d1147c2ed46e4868674a2c024908f"
username = "nateschultz15"
password = "immabot"
# endregion
# region loop lat/lng range
def _search_media(lat, lng):
    """Return the decoded media-search response for one coordinate, or None on failure."""
    try:
        response = requests.get(
            'https://api.instagram.com/v1/media/search',
            params={'lat': str(lat), 'lng': str(lng), 'access_token': access_token},
            auth=(username, password),
            timeout=30,  # without a timeout a stalled connection blocks the crawl forever
        )
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Best effort: a cell that cannot be fetched is simply skipped.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not valid JSON.
        return None


def _lead_insert_statement(comment, lat, lng):
    """Build the ``CALL leads.insert_lead(...)`` statement for one comment."""
    # The statement is assembled by concatenation, so every API-supplied
    # string is passed through sql_filter_param, not just the comment text.
    user_id = sql_filter_param(comment["from"]["id"])
    user_name = sql_filter_param(comment["from"]["username"])
    comment_text = sql_filter_param(comment["text"])
    instagram_url = "http://instagram.com/" + user_name
    return 'CALL leads.insert_lead("{}","{}","{}","{}",{},{}); '.format(
        user_id, user_name, instagram_url, comment_text, str(lat), str(lng))


def loop_range(lat_min, lat_max, lng_min, lng_max, first_lat_iteration=30):
    """Scan a lat/lng bounding box in 0.01-degree steps and store matching leads.

    For every grid cell the Instagram media-search endpoint is queried;
    each comment accepted by ``userIsDentist`` is written to the database
    through ``sql_call``. Progress is printed after every cell.

    Args:
        lat_min: Southern edge of the box (degrees).
        lat_max: Northern edge of the box (degrees).
        lng_min: Western edge of the box (degrees).
        lng_max: Eastern edge of the box (degrees).
        first_lat_iteration: Latitude row index to start from. The default
            of 30 reproduces the value that used to be hard-coded, which
            skips the first 30 rows (presumably a manual resume point --
            pass 0 to scan the whole box).
    """
    step = 0.01
    # The tiny epsilon keeps float noise (e.g. a quotient of 9.999999999)
    # from truncating the cell count by one.
    lat_iterations_max = int((lat_max - lat_min) / step + 1e-9)
    lng_iterations_max = int((lng_max - lng_min) / step + 1e-9)

    requests_sent = 0
    for lat_iterations in range(first_lat_iteration, lat_iterations_max):
        lat = lat_min + lat_iterations * step
        for lng_iterations in range(lng_iterations_max):
            lng = lng_min + lng_iterations * step

            data = _search_media(lat, lng)
            requests_sent += 1

            if data is not None:
                statements = [
                    _lead_insert_statement(comment, lat, lng)
                    for media in data.get("data", [])
                    for comment in media.get("comments", {}).get("data", [])
                    if userIsDentist(comment["text"])
                ]
                if statements:
                    # One database round-trip per cell, and none when the
                    # cell produced no leads.
                    sql_call("".join(statements))

            # Pause for an hour after every 5000 requests (presumably to
            # stay under an hourly API quota -- confirm). The original keyed
            # this on a mis-computed grid index.
            if requests_sent % 5000 == 0:
                time.sleep(60 * 60)

            print_percent_complete(lat_iterations, lng_iterations,
                                   lat_iterations_max, lng_iterations_max)
# endregion
# region determine if user is dentist
def userIsDentist(comment):
    """Return True when *comment* mentions any dentistry-related keyword.

    Matching is a case-insensitive substring test, so a keyword also
    matches inside longer words (e.g. "dentist" in "dentistry").

    Args:
        comment: Comment text to inspect.

    Returns:
        True if at least one keyword occurs in the text, otherwise False.
    """
    keywords = (
        "dentist",
        "dental",
        "fluoride",
        "orthodontist",
        "plaque",
        "enamel",
    )
    # Lower-case once instead of once per keyword.
    lowered = comment.lower()
    return any(keyword in lowered for keyword in keywords)
# endregion
# #USA
# lat_min = 28.00;
# lat_max = 48.00;
# lng_min = -125.00;
# lng_max = -65.00;
# #Philadelphia (approx)
# lat_min = 39.90;
# lat_max = 40.00;
# lng_min = -75.25;
# lng_max = -75.10;
# Bounding box covering San Francisco, Sacramento and San Jose.
lat_min, lat_max = 37.05, 38.70
lng_min, lng_max = -122.90, -121.05

# Wait one hour before starting, then crawl the box.
time.sleep(3600)
loop_range(lat_min, lat_max, lng_min, lng_max)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment