Last active
May 4, 2016 18:29
-
-
Save beefy/5eac36342e7c2da582b3 to your computer and use it in GitHub Desktop.
A web crawler/scraper for the Instagram API, never perfected/finished
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'Nathaniel' | |
import requests | |
import mysql.connector | |
import time | |
import unicodedata | |
# region sql stuff | |
def sql_filter_param(text):
    """Return an ASCII-only copy of ``text`` that is safer to embed in a quoted SQL literal.

    Non-ASCII characters are dropped (accented letters keep their base
    letter), and every quote, comma, semicolon and backslash is replaced
    by a single space.

    :param text: text (unicode) string to clean.
    :returns: the cleaned text string.
    """
    # NFKD splits accented characters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks and any other
    # non-ASCII character.
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    # Decode back to text: on Python 3 encode() yields bytes, and
    # bytes.replace() rejects str arguments with a TypeError.
    output = ascii_bytes.decode('ascii')
    # The backslash is filtered as well so that a trailing "\" cannot escape
    # the closing quote of the literal the caller wraps this value in.
    for unsafe in ("'", '"', ",", ";", "\\"):
        output = output.replace(unsafe, " ")
    return output
def sql_call(sql):
    """Execute a batch of ``;``-separated SQL statements on the ``leads`` database.

    A new connection is opened for every call. The batch is executed with
    ``multi=True``, the transaction is committed, and the cursor and
    connection are always closed, even when an unexpected exception occurs.
    MySQL errors raised while executing the batch are printed rather than
    propagated, so a bad batch does not stop the caller.

    :param sql: one or more SQL statements in a single string. An empty or
        whitespace-only string is a no-op.
    :returns: None.
    """
    # Callers hand over "" when a batch turned out to contain no statements;
    # there is nothing to send, so do not open a connection at all.
    if not sql or not sql.strip():
        return

    # NOTE(review): credentials are hard-coded in source; they should be moved
    # to configuration outside the repository.
    cnx = mysql.connector.connect(user='admin', password='$tTX5&3@wWw$',
                                  database='leads')
    try:
        cursor = cnx.cursor()
        try:
            try:
                for result in cursor.execute(sql, multi=True):
                    # Read every row of a statement that produced a result
                    # set so no unread result is left on the connection.
                    if result.with_rows:
                        result.fetchall()
            except mysql.connector.Error as err:
                print("SQL ERR: {}".format(err))
            # Commit whatever succeeded, as the batch is best-effort.
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()
# endregion | |
# region print percent complete | |
def print_percent_complete(lat_iterations, lng_iterations, lat_iterations_max, lng_iterations_max):
    """Print how far a latitude/longitude grid scan has progressed.

    :param lat_iterations: index of the current latitude row (outer loop).
    :param lng_iterations: index of the current longitude column (inner loop).
    :param lat_iterations_max: total number of latitude rows.
    :param lng_iterations_max: total number of longitude columns.
    :returns: None.
    """
    # Each latitude row holds lng_iterations_max cells, so the row-major
    # position is row * row_width + column. (Multiplying by the number of
    # rows instead gives a number that does not match the total below.)
    current = lat_iterations * lng_iterations_max + lng_iterations
    total = lat_iterations_max * lng_iterations_max
    print("iteration num: " + str(current))
    print("out of " + str(total) + " max")
    print("lat: " + str(lat_iterations))
    print("lng: " + str(lng_iterations))
    return
# endregion | |
# region define server connection and api terms | |
# Earlier experiments, kept for reference only (none of this runs):
#   cnx = mysql.connector.connect(user='admin', password='$tTX5&3@wWw$',
#                                 database='leads')
#   cursor = cnx.cursor()
#   sql_call("SET NAMES ""utf8""")
#   payload = {'key1': 'value1', 'key2': 'value2'}
#   r = requests.get("http://httpbin.org/get", params=payload)
#
# How the access token below was obtained:
#   1. Request a code from
#      https://api.instagram.com/oauth/authorize/?client_id=CLIENT-ID&redirect_uri=REDIRECT-URI&response_type=code
#   2. Plug that code into Postman to get the access token.
#
# NOTE(review): these are live-looking secrets committed in source.
access_token = '2080310042.21443eb.b17d1147c2ed46e4868674a2c024908f'
username = 'nateschultz15'
password = 'immabot'
# endregion | |
# region loop lat/lng range | |
def _fetch_media(lat, lng):
    """Return the media list of one grid cell, or ``[]`` if the request, status or JSON is bad."""
    url = ('https://api.instagram.com/v1/media/search?lat=' + str(lat)
           + '&lng=' + str(lng) + '&access_token=' + access_token)
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, auth=(username, password), timeout=30)
    except requests.exceptions.RequestException as err:
        print("API Request Err: {}".format(err))
        return []
    if response.status_code != 200:
        print("API Response Err: " + str(response.status_code))
        return []
    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return []
    if not isinstance(data, dict):
        return []
    return data.get("data") or []


def _lead_statements(media_items, lat, lng):
    """Build one ``CALL leads.insert_lead(...)`` statement per dentist-like comment."""
    statements = []
    for media in media_items:
        try:
            comments = media["comments"]["data"]
        except (KeyError, TypeError):
            continue
        for comment in comments or []:
            try:
                comment_text = comment["text"]
                user_id = comment["from"]["id"]
                user_name = comment["from"]["username"]
            except (KeyError, TypeError):
                # Entry without the expected fields: skip it.
                continue
            if not userIsDentist(comment_text):
                continue
            # Every API-supplied value is filtered before it is placed
            # inside a quoted SQL literal, not just the comment text.
            user_id = sql_filter_param(user_id)
            user_name = sql_filter_param(user_name)
            instagram_url = "http://instagram.com/" + user_name
            statements.append(
                'CALL leads.insert_lead("{}","{}","{}","{}",{},{});'.format(
                    user_id, user_name, instagram_url,
                    sql_filter_param(comment_text), lat, lng))
    return statements


def loop_range(lat_min, lat_max, lng_min, lng_max, lat_start=30):
    """Scan a latitude/longitude box in 0.01-degree steps and store dentist leads.

    For every grid cell the Instagram media-search endpoint is queried, the
    comments on the returned media are checked with ``userIsDentist``, and
    each match is written to the database through ``sql_call``. Progress is
    printed after every cell.

    :param lat_min: lower latitude bound of the box.
    :param lat_max: upper latitude bound (exclusive at step granularity).
    :param lng_min: lower longitude bound of the box.
    :param lng_max: upper longitude bound (exclusive at step granularity).
    :param lat_start: index of the first latitude row to scan. Defaults to
        30, the value that used to be hard-coded here.
        NOTE(review): 30 looks like a resume point from an interrupted run;
        pass 0 to scan the whole box. Confirm the intended default.
    :returns: None.
    """
    step = 0.01
    requests_per_pause = 5000

    # round() before int(): float noise can leave the quotient just below a
    # whole number (e.g. (0.3 - 0.2) / 0.01), and int() alone would then drop
    # a whole row or column.
    lat_iterations_max = int(round((lat_max - lat_min) / step))
    lng_iterations_max = int(round((lng_max - lng_min) / step))

    requests_made = 0
    for lat_iterations in range(lat_start, lat_iterations_max):
        for lng_iterations in range(0, lng_iterations_max):
            # Rounding trims float noise such as 37.059999999999995.
            lat = round(lat_min + lat_iterations * step, 6)
            lng = round(lng_min + lng_iterations * step, 6)

            media_items = _fetch_media(lat, lng)
            requests_made += 1

            statements = _lead_statements(media_items, lat, lng)
            if statements:
                sql_call(" ".join(statements))

            # Pause for an hour after every 5000 requests actually made.
            # NOTE(review): presumably an hourly API rate limit; confirm.
            if requests_made % requests_per_pause == 0:
                time.sleep(60 * 60)

            print_percent_complete(lat_iterations, lng_iterations,
                                   lat_iterations_max, lng_iterations_max)
    return
# endregion | |
# region determine if user is dentist | |
def userIsDentist(comment):
    """Tell whether ``comment`` mentions any dental keyword.

    The match is a case-insensitive substring test.

    :param comment: comment text to inspect.
    :returns: True if at least one keyword occurs in the comment, else False.
    """
    dental_terms = (
        "dentist",
        "dental",
        "fluoride",
        "orthodontist",
        "plaque",
        "enamel",
    )
    lowered = comment.lower()
    return any(term in lowered for term in dental_terms)
# endregion | |
# Bounding boxes tried so far; only the last, uncommented one is active.
#
# USA:
#   lat_min, lat_max = 28.00, 48.00
#   lng_min, lng_max = -125.00, -65.00
#
# Philadelphia (approx):
#   lat_min, lat_max = 39.90, 40.00
#   lng_min, lng_max = -75.25, -75.10
#
# San Fran, Sacramento, San Jose
lat_min, lat_max = 37.05, 38.70
lng_min, lng_max = -122.90, -121.05

# Wait one hour before starting the crawl.
# NOTE(review): presumably waits out a rate-limit window from an earlier
# run; confirm it is still needed.
time.sleep(60 * 60)
loop_range(lat_min, lat_max, lng_min, lng_max)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment