Skip to content

Instantly share code, notes, and snippets.

@jslim89
Forked from marcoqu/scraper.py
Last active April 13, 2018 03:39
Show Gist options
  • Save jslim89/9b0eebaefffcbad10285921cdd8aec70 to your computer and use it in GitHub Desktop.
Save jslim89/9b0eebaefffcbad10285921cdd8aec70 to your computer and use it in GitHub Desktop.
The Instagram internal API
import time
import re
from hashlib import md5
import requests
# Base URL that every request path built in this script is appended to.
INSTAGRAM_URL = "https://www.instagram.com"
# Browser user agent sent with the page, GraphQL and login requests.
# get_ig_gis() also mixes this string into the signature hash, so presumably
# the header and the signature must use the same value -- TODO confirm.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
# Credentials that crawl() hands to login(); the values here appear to be
# placeholders to be replaced before running.
INSTAGRAM_USER = "youruname"
INSTAGRAM_PASS = "yoursecret"
def get_first_page(hashtag):
    """Fetch the explore page for ``hashtag``.

    Issues a GET to ``/explore/tags/<hashtag>/`` under ``INSTAGRAM_URL`` with
    the module's user agent and returns the ``requests`` response object.
    """
    tag_path = "/explore/tags/{}/".format(hashtag)
    request_headers = {"user-agent": USER_AGENT}
    return requests.get(INSTAGRAM_URL + tag_path, headers=request_headers)
def get_csrf_token(cookies):
    """Look up the ``csrftoken`` entry in ``cookies``.

    ``cookies`` only needs a ``get`` method (a dict or a cookie jar); the
    result is whatever that lookup yields, e.g. ``None`` for a dict that has
    no such key.
    """
    token = cookies.get("csrftoken")
    return token
def get_query_id(html):
    """Extract the query id used for hashtag pagination.

    Finds the ``TagPageContainer`` script path inside ``html``, downloads that
    script from ``INSTAGRAM_URL`` and returns the ``queryId`` value that
    follows the tag-media pagination getter in the script text.

    Args:
        html: Source text of the hashtag explore page.

    Returns:
        The query id string captured from the downloaded script.

    Raises:
        ValueError: If the script path is not present in ``html`` or the
            downloaded script does not contain the expected ``queryId``.
    """
    script_match = re.search(r'/static(.*)TagPageContainer\.js/(.*).js', html)
    if script_match is None:
        # Fail with a message that names the problem instead of the opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError("TagPageContainer script path not found in page HTML")

    # group(0) is the whole matched path, which is appended to the base URL.
    script_req = requests.get(INSTAGRAM_URL + script_match.group(0))

    query_match = re.search(
        'return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:"([^"]*)"',
        script_req.text,
    )
    if query_match is None:
        raise ValueError("queryId not found in TagPageContainer script")
    return query_match.group(1)
def get_rhx_gis(html):
    """Return the ``rhx_gis`` value embedded in the page source.

    Args:
        html: Source text of the hashtag explore page.

    Returns:
        The text between the quotes that follow ``rhx_gis":`` in ``html``.

    Raises:
        ValueError: If ``html`` contains no quoted ``rhx_gis`` value.
    """
    match = re.search(r'rhx_gis":"([^"]*)"', html)
    if match is None:
        # Explicit error instead of an AttributeError on ``None``.
        raise ValueError("rhx_gis not found in page HTML")
    return match.group(1)
def get_end_cursor(html):
    """Return the first quoted ``end_cursor`` value found in ``html``.

    Args:
        html: Source text of the hashtag explore page.

    Returns:
        The cursor string, or ``None`` when ``html`` contains no quoted
        ``end_cursor`` value (for example ``"end_cursor":null``). A falsy
        result lets a ``while end_cursor:`` pagination loop end cleanly
        instead of crashing on a failed match.
    """
    match = re.search(r'end_cursor":"([^"]*)"', html)
    return match.group(1) if match else None
def get_params(hashtag, end_cursor, first=200):
    """Build the variables string for a hashtag pagination query.

    The values are interpolated verbatim; no JSON escaping is applied, so
    callers must pass values that are already safe inside a JSON string.

    Args:
        hashtag: Tag name placed in the ``tag_name`` field.
        end_cursor: Cursor placed in the ``after`` field.
        first: Number placed in the ``first`` field. Defaults to 200, the
            value that was previously hard-coded.

    Returns:
        A compact JSON-formatted string with the keys ``tag_name``, ``first``
        and ``after`` in that order.
    """
    template = '{{"tag_name":"{}","first":{},"after":"{}"}}'
    return template.format(hashtag, first, end_cursor)
def get_ig_gis(rhx_gis, csrf_token, params):
    """Compute the hex MD5 digest used as the ``x-instagram-gis`` value.

    The hashed text is ``rhx_gis``, ``csrf_token``, the module's
    ``USER_AGENT`` and ``params`` joined with colons and encoded as UTF-8.
    """
    signature_input = ":".join((rhx_gis, csrf_token, USER_AGENT, params))
    digest = md5(signature_input.encode('utf-8'))
    return digest.hexdigest()
def get_next_page(new_cookies, ig_gis, query_id, params, delay=10):
    """Fetch one page of hashtag media from the GraphQL endpoint.

    Args:
        new_cookies: Cookies sent with the request.
        ig_gis: Signature sent in the ``x-instagram-gis`` header.
        query_id: Value sent as the ``query_hash`` query parameter.
        params: Variables string sent as the ``variables`` query parameter.
        delay: Seconds to sleep before issuing the request. Defaults to 10,
            the pause that was previously hard-coded.

    Returns:
        A ``(data, end_cursor)`` tuple: ``data`` is the list of ``node``
        objects from the response edges and ``end_cursor`` is the
        ``page_info`` cursor of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response JSON lacks the expected structure.
    """
    headers = {
        "x-instagram-gis": ig_gis,
        "x-requested-with": "XMLHttpRequest",
        "user-agent": USER_AGENT,
    }
    # Pause between page requests.
    time.sleep(delay)
    # Let requests build the query string so the JSON variables (quotes,
    # braces, any '&' or '#') are percent-encoded instead of pasted raw.
    response = requests.get(
        INSTAGRAM_URL + "/graphql/query/",
        params={"query_hash": query_id, "variables": params},
        headers=headers,
        cookies=new_cookies,
    )
    # Surface HTTP failures directly rather than as a later JSON/KeyError.
    response.raise_for_status()
    media = response.json()['data']['hashtag']['edge_hashtag_to_media']
    end_cursor = media['page_info']['end_cursor']
    data = [edge['node'] for edge in media['edges']]
    return data, end_cursor
def crawl(hashtag, max_count):
    """Collect media nodes for ``hashtag`` by following pagination cursors.

    Fetches the hashtag page, logs in with the module credentials, then
    requests follow-up pages until the cursor runs out or ``max_count``
    results have been gathered. The link of every fetched post is printed.

    Args:
        hashtag: Tag name to crawl.
        max_count: Maximum number of results to return.

    Returns:
        A list of at most ``max_count`` media nodes, or ``None`` when the
        login fails.
    """
    first_page = get_first_page(hashtag)
    is_logged_in, cookies = login(INSTAGRAM_USER, INSTAGRAM_PASS)
    if not is_logged_in:
        print('Not logged in')
        return None

    csrf_token = get_csrf_token(cookies)
    page_html = first_page.text
    query_id = get_query_id(page_html)
    rhx_gis = get_rhx_gis(page_html)
    end_cursor = get_end_cursor(page_html)

    results = []
    # Stop as soon as enough results are collected; the previous ">" check
    # fetched one more page when the count was exactly max_count.
    while end_cursor and len(results) < max_count:
        # The signature covers the variables string, so both are rebuilt for
        # every cursor.
        params = get_params(hashtag, end_cursor)
        ig_gis = get_ig_gis(rhx_gis, csrf_token, params)
        data, end_cursor = get_next_page(cookies, ig_gis, query_id, params)
        # for testing purpose only, to print out all posts link
        for o in data:
            print('https://www.instagram.com/p/%s' % o['shortcode'])
        results.extend(data)

    # The last page may overshoot; honour max_count as a hard upper bound.
    return results[:max_count]
def login(username, password):
    """Log in through the AJAX login endpoint.

    Args:
        username: Account name posted to the login endpoint.
        password: Password posted to the login endpoint.

    Returns:
        ``(True, cookies)`` with the cookies of the login response when the
        response reports ``authenticated``; ``(False, None)`` otherwise.

    Raises:
        KeyError: If the initial page response sets no ``csrftoken`` cookie.
    """
    login_url = INSTAGRAM_URL + '/accounts/login/ajax/'
    # The context manager closes the session's connections on every path.
    with requests.Session() as session:
        # Replace (not extend) the session headers, as before.
        session.headers = {"user-agent": USER_AGENT, 'Referer': INSTAGRAM_URL}
        # The landing page supplies the CSRF token the login POST must echo.
        landing = session.get(INSTAGRAM_URL)
        session.headers.update({'X-CSRFToken': landing.cookies['csrftoken']})
        response = session.post(
            login_url,
            data={'username': username, 'password': password},
            allow_redirects=True,
        )
        results = response.json()
        # Check authentication before touching the response cookies: the old
        # code read response.cookies['csrftoken'] first and could raise
        # KeyError instead of reporting a failed login. A missing or null
        # flag is treated as "not authenticated".
        if not results.get('authenticated'):
            return False, None
        return True, response.cookies
# Runs at module execution time: crawl the "antarctica" hashtag with a
# max_count of 5000.
crawl(hashtag='antarctica', max_count=5000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment