# jslim89/ig-scraper.py

## ig-scraper.py
import time
import re
from hashlib import md5
import requests

# Base URL; get_first_page, get_query_id, get_next_page and login append their paths to it.
INSTAGRAM_URL = "https://www.instagram.com"
# User-agent header sent by get_first_page, get_next_page and login; it is also
# one of the fields hashed by get_ig_gis.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
# Credentials that crawl() passes to login().
# NOTE(review): these look like placeholders -- replace before running, and
# avoid committing real credentials to source control.
INSTAGRAM_USER = "youruname"
INSTAGRAM_PASS = "yoursecret"

def get_first_page(hashtag):
    """Request the explore page for ``hashtag``.

    Sends a GET to ``/explore/tags/<hashtag>/`` under INSTAGRAM_URL with the
    module's user-agent header and returns the ``requests`` response as is.
    """
    tag_path = "/explore/tags/{}/".format(hashtag)
    request_headers = {"user-agent": USER_AGENT}
    return requests.get(INSTAGRAM_URL + tag_path, headers=request_headers)

def get_csrf_token(cookies):
    """Look up the ``csrftoken`` entry of ``cookies``.

    ``cookies`` only needs a dict-style ``get`` method; whatever that call
    returns (``None`` for a plain dict without the key) is handed back.
    """
    token = cookies.get("csrftoken")
    return token

def get_query_id(html):
    """Extract the query id used for tag pagination requests.

    Locates the TagPageContainer script path inside ``html``, downloads that
    script from INSTAGRAM_URL, and returns the first ``queryId`` value found
    after the tag-media pagination getter.

    Raises:
        AttributeError: ``html`` contains no matching script path.
        IndexError: the downloaded script contains no matching ``queryId``.
    """
    path_match = re.search(r'/static(.*)TagPageContainer\.js/(.*).js', html)
    script_url = INSTAGRAM_URL + path_match.group(0)
    script_source = requests.get(script_url).text
    query_ids = re.findall('return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:"([^"]*)"', script_source)
    return query_ids[0]

def get_rhx_gis(html):
    """Return the quoted value that follows ``rhx_gis":`` in ``html``.

    Raises:
        AttributeError: ``html`` contains no such key/value pair.
    """
    found = re.search(r'rhx_gis":"([^"]*)"', html)
    return found.group(1)

def get_end_cursor(html):
    """Return the first quoted ``end_cursor`` value in ``html``, if any.

    The pattern only matches a quoted string value, so a page where the
    cursor is absent or not a string (for example ``"end_cursor":null``)
    yields ``None`` instead of raising; callers that loop on the cursor's
    truthiness then simply do not start paginating.

    Returns:
        The cursor string, or ``None`` when no quoted cursor is present.
    """
    found = re.search(r'end_cursor":"([^"]*)"', html)
    if found is None:
        return None
    return found.group(1)

def get_params(hashtag, end_cursor):
    """Build the ``variables`` JSON text for one pagination request.

    The values are inserted verbatim (no JSON escaping); ``first`` is fixed
    at 200.
    """
    template = '{{"tag_name":"{}","first":200,"after":"{}"}}'
    return template.format(hashtag, end_cursor)

def get_ig_gis(rhx_gis, csrf_token, params):
    """Compute the hex MD5 signature for a pagination request.

    The hashed text is ``rhx_gis``, ``csrf_token``, USER_AGENT and ``params``
    joined with ``:`` and encoded as UTF-8. All inputs must be ``str``.
    """
    signature_input = ":".join((rhx_gis, csrf_token, USER_AGENT, params))
    digest = md5(signature_input.encode('utf-8'))
    return digest.hexdigest()

def get_next_page(new_cookies, ig_gis, query_id, params):
    """Fetch one page of hashtag media from the GraphQL endpoint.

    Args:
        new_cookies: cookies sent with the request.
        ig_gis: signature placed in the ``x-instagram-gis`` header.
        query_id: value for the ``query_hash`` query parameter.
        params: text for the ``variables`` query parameter (inserted as is).

    Returns:
        ``(nodes, end_cursor)``: the ``node`` of every edge in the response,
        and the ``end_cursor`` from its ``page_info``.
    """
    request_headers = {
        "x-instagram-gis": ig_gis,
        "x-requested-with": "XMLHttpRequest",
        "user-agent": USER_AGENT,
    }
    query_path = "/graphql/query/?query_hash={}&variables={}".format(query_id, params)

    # Fixed 10 second pause before every request.
    time.sleep(10)
    response = requests.get(INSTAGRAM_URL + query_path, headers=request_headers, cookies=new_cookies)
    media = response.json()['data']['hashtag']['edge_hashtag_to_media']
    next_cursor = media['page_info']['end_cursor']
    nodes = [edge['node'] for edge in media['edges']]
    return nodes, next_cursor

def crawl(hashtag, max_count):
    """Collect at most ``max_count`` media nodes for ``hashtag``.

    Loads the hashtag's first page, logs in with the module credentials,
    derives the query id, ``rhx_gis`` and starting cursor from the first
    page, then follows the pagination cursor page by page.

    Args:
        hashtag: tag name without the leading ``#``.
        max_count: upper bound on the number of nodes returned.

    Returns:
        A list of node dicts (never longer than ``max_count``), or ``None``
        when the login is rejected.
    """
    first_page = get_first_page(hashtag)
    is_logged_in, cookies = login(INSTAGRAM_USER, INSTAGRAM_PASS)
    if not is_logged_in:
        print('Not logged in')
        return None
    csrf_token = get_csrf_token(cookies)
    query_id = get_query_id(first_page.text)
    rhx_gis = get_rhx_gis(first_page.text)
    end_cursor = get_end_cursor(first_page.text)

    results = []
    # Stop once the cap is reached so no page is requested needlessly.
    while end_cursor and len(results) < max_count:
        # The signature covers the exact variables text, so both are rebuilt
        # for every cursor.
        params = get_params(hashtag, end_cursor)
        ig_gis = get_ig_gis(rhx_gis, csrf_token, params)
        data, end_cursor = get_next_page(cookies, ig_gis, query_id, params)

        # for testing purpose only, to print out all posts link
        for o in data:
            print('https://www.instagram.com/p/%s' % o['shortcode'])
        results.extend(data)

    # The last page may overshoot the cap; trim to the requested maximum.
    return results[:max_count]

def login(username, password):
    """Log in through the ajax login endpoint.

    Loads the landing page to obtain a ``csrftoken`` cookie, sends it back
    as the ``X-CSRFToken`` header, and posts the credentials.

    Args:
        username: account name.
        password: account password.

    Returns:
        ``(True, cookies)`` with the cookies of the login response when the
        response reports ``authenticated`` as truthy, otherwise
        ``(False, None)``. A response without an ``authenticated`` key is
        treated as a failed login.

    Raises:
        KeyError: the landing page response sets no ``csrftoken`` cookie.
    """
    login_url = INSTAGRAM_URL + '/accounts/login/ajax/'
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers = {"user-agent": USER_AGENT, 'Referer': INSTAGRAM_URL}
        landing = session.get(INSTAGRAM_URL)
        session.headers.update({'X-CSRFToken': landing.cookies['csrftoken']})
        response = session.post(
            login_url,
            data={'username': username, 'password': password},
            allow_redirects=True,
        )
        cookies = response.cookies
        results = response.json()
    if not results.get('authenticated'):
        return False, None
    return True, cookies

# Run the crawl only when executed as a script, so importing this module
# does not trigger network requests.
if __name__ == "__main__":
    crawl('antarctica', 5000)
	import time
	import re
	from hashlib import md5
	import requests

	# Base URL; get_first_page, get_query_id, get_next_page and login append their paths to it.
	INSTAGRAM_URL = "https://www.instagram.com"
	# User-agent header sent by get_first_page, get_next_page and login; it is also
	# one of the fields hashed by get_ig_gis.
	USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
	# Credentials that crawl() passes to login().
	# NOTE(review): these look like placeholders -- replace before running, and
	# avoid committing real credentials to source control.
	INSTAGRAM_USER = "youruname"
	INSTAGRAM_PASS = "yoursecret"

	def get_first_page(hashtag):
	    """Request the explore page for ``hashtag``.

	    Sends a GET to ``/explore/tags/<hashtag>/`` under INSTAGRAM_URL with the
	    module's user-agent header and returns the ``requests`` response as is.
	    """
	    tag_path = "/explore/tags/{}/".format(hashtag)
	    request_headers = {"user-agent": USER_AGENT}
	    return requests.get(INSTAGRAM_URL + tag_path, headers=request_headers)

	def get_csrf_token(cookies):
	    """Look up the ``csrftoken`` entry of ``cookies``.

	    ``cookies`` only needs a dict-style ``get`` method; whatever that call
	    returns (``None`` for a plain dict without the key) is handed back.
	    """
	    return cookies.get("csrftoken")

	def get_query_id(html):
	    """Extract the query id used for tag pagination requests.

	    Locates the TagPageContainer script path inside ``html``, downloads that
	    script from INSTAGRAM_URL, and returns the first ``queryId`` value found
	    after the tag-media pagination getter.

	    Raises:
	        AttributeError: ``html`` contains no matching script path.
	        IndexError: the downloaded script contains no matching ``queryId``.
	    """
	    # The path segments have arbitrary length, hence ``(.*)`` rather than a
	    # single-character ``(.)``.
	    path_match = re.search(r'/static(.*)TagPageContainer\.js/(.*).js', html)
	    script_url = INSTAGRAM_URL + path_match.group(0)
	    script_req = requests.get(script_url)
	    return re.findall('return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:"([^"]*)"', script_req.text)[0]

	def get_rhx_gis(html):
	    """Return the quoted value that follows ``rhx_gis":`` in ``html``.

	    Raises:
	        AttributeError: ``html`` contains no such key/value pair.
	    """
	    return re.search(r'rhx_gis":"([^"]*)"', html).group(1)

	def get_end_cursor(html):
	    """Return the first quoted ``end_cursor`` value in ``html``, if any.

	    The pattern only matches a quoted string value, so a page where the
	    cursor is absent or not a string (for example ``"end_cursor":null``)
	    yields ``None`` instead of raising; callers that loop on the cursor's
	    truthiness then simply do not start paginating.

	    Returns:
	        The cursor string, or ``None`` when no quoted cursor is present.
	    """
	    found = re.search(r'end_cursor":"([^"]*)"', html)
	    if found is None:
	        return None
	    return found.group(1)

	def get_params(hashtag, end_cursor):
	    """Build the ``variables`` JSON text for one pagination request.

	    The values are inserted verbatim (no JSON escaping); ``first`` is fixed
	    at 200.
	    """
	    return '{{"tag_name":"{}","first":200,"after":"{}"}}'.format(hashtag, end_cursor)

	def get_ig_gis(rhx_gis, csrf_token, params):
	    """Compute the hex MD5 signature for a pagination request.

	    The hashed text is ``rhx_gis``, ``csrf_token``, USER_AGENT and ``params``
	    joined with ``:`` and encoded as UTF-8. All inputs must be ``str``.
	    """
	    return md5((rhx_gis + ":" + csrf_token + ":" + USER_AGENT + ":" + params).encode('utf-8')).hexdigest()

	def get_next_page(new_cookies, ig_gis, query_id, params):
	    """Fetch one page of hashtag media from the GraphQL endpoint.

	    Args:
	        new_cookies: cookies sent with the request.
	        ig_gis: signature placed in the ``x-instagram-gis`` header.
	        query_id: value for the ``query_hash`` query parameter.
	        params: text for the ``variables`` query parameter (inserted as is).

	    Returns:
	        ``(nodes, end_cursor)``: the ``node`` of every edge in the response,
	        and the ``end_cursor`` from its ``page_info``.
	    """
	    headers = {
	        "x-instagram-gis": ig_gis,
	        "x-requested-with": "XMLHttpRequest",
	        "user-agent": USER_AGENT
	    }

	    url = INSTAGRAM_URL + "/graphql/query/?query_hash={}&variables={}".format(query_id, params)

	    # Fixed 10 second pause before every request.
	    time.sleep(10)
	    next_page = requests.get(url, headers=headers, cookies=new_cookies)
	    obj = next_page.json()
	    obj = obj['data']['hashtag']['edge_hashtag_to_media']
	    end_cursor = obj['page_info']['end_cursor']
	    data = [o['node'] for o in obj['edges']]
	    return data, end_cursor

	def crawl(hashtag, max_count):
	    """Collect at most ``max_count`` media nodes for ``hashtag``.

	    Loads the hashtag's first page, logs in with the module credentials,
	    derives the query id, ``rhx_gis`` and starting cursor from the first
	    page, then follows the pagination cursor page by page.

	    Args:
	        hashtag: tag name without the leading ``#``.
	        max_count: upper bound on the number of nodes returned.

	    Returns:
	        A list of node dicts (never longer than ``max_count``), or ``None``
	        when the login is rejected.
	    """
	    first_page = get_first_page(hashtag)
	    is_logged_in, cookies = login(INSTAGRAM_USER, INSTAGRAM_PASS)
	    if not is_logged_in:
	        print('Not logged in')
	        return None
	    csrf_token = get_csrf_token(cookies)
	    query_id = get_query_id(first_page.text)
	    rhx_gis = get_rhx_gis(first_page.text)
	    end_cursor = get_end_cursor(first_page.text)

	    results = []
	    # Stop once the cap is reached so no page is requested needlessly.
	    while end_cursor and len(results) < max_count:
	        # The signature covers the exact variables text, so both are rebuilt
	        # for every cursor.
	        params = get_params(hashtag, end_cursor)
	        ig_gis = get_ig_gis(rhx_gis, csrf_token, params)
	        data, end_cursor = get_next_page(cookies, ig_gis, query_id, params)

	        # for testing purpose only, to print out all posts link
	        for o in data:
	            print('https://www.instagram.com/p/%s' % o['shortcode'])
	        results.extend(data)

	    # The last page may overshoot the cap; trim to the requested maximum.
	    return results[:max_count]

	def login(username, password):
	    """Log in through the ajax login endpoint.

	    Loads the landing page to obtain a ``csrftoken`` cookie, sends it back
	    as the ``X-CSRFToken`` header, and posts the credentials.

	    Args:
	        username: account name.
	        password: account password.

	    Returns:
	        ``(True, cookies)`` with the cookies of the login response when the
	        response reports ``authenticated`` as truthy, otherwise
	        ``(False, None)``. A response without an ``authenticated`` key is
	        treated as a failed login.

	    Raises:
	        KeyError: the landing page response sets no ``csrftoken`` cookie.
	    """
	    login_url = INSTAGRAM_URL + '/accounts/login/ajax/'
	    # The context manager closes the session's connections when done.
	    with requests.Session() as session:
	        session.headers = {"user-agent": USER_AGENT, 'Referer': INSTAGRAM_URL}
	        landing = session.get(INSTAGRAM_URL)
	        session.headers.update({'X-CSRFToken': landing.cookies['csrftoken']})
	        response = session.post(
	            login_url,
	            data={'username': username, 'password': password},
	            allow_redirects=True,
	        )
	        cookies = response.cookies
	        results = response.json()
	    if not results.get('authenticated'):
	        return False, None
	    return True, cookies

	# Run the crawl only when executed as a script, so importing this module
	# does not trigger network requests.
	if __name__ == "__main__":
	    crawl('antarctica', 5000)