Skip to content

Instantly share code, notes, and snippets.

@ketankr9
Forked from marcoqu/scraper.py
Last active June 1, 2019 07:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ketankr9/6e48c6c205907e6ae35ef789e7a03634 to your computer and use it in GitHub Desktop.
Script for scraping a public Instagram profile's timeline photos.
# pylint: skip-file
import hashlib
import json
import md5
import re
import time

import requests
INSTAGRAM_URL = "https://www.instagram.com"
HASHTAG_ENDPOINT = "/graphql/query/?query_hash={}&variables={}"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
def get_first_page(username):
    """Fetch a user's public profile page and parse the first batch of posts.

    Returns a tuple (response, posts, end_cursor) where `posts` is the list
    of post nodes embedded in the page's window._sharedData JSON and
    `end_cursor` is the pagination cursor for the next GraphQL request.
    """
    response = requests.get(
        INSTAGRAM_URL + "/{}/".format(username),
        headers={"user-agent": USER_AGENT},
    )
    # The profile data is inlined into the HTML as a JS assignment.
    raw = response.text.split("window._sharedData = ")[1].split(";</script>")[0]
    shared_data = json.loads(raw)
    cursor = get_end_cursor_from_html(response.text)
    media = shared_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["edges"]
    return (response, [edge['node'] for edge in media], cursor)
def get_csrf_token(cookies):
    """Return the CSRF token from a cookie jar, or None if it is absent."""
    token = cookies.get("csrftoken")
    return token
def get_query_id(html):
    """Extract the GraphQL query hash for timeline pagination.

    Locates the ProfilePageContainer bundle referenced in the page HTML,
    downloads it, and pulls the queryId constant out of the minified JS.
    """
    bundle_path = re.search(r'/static(.*)ProfilePageContainer\.js/(.*).js', html).group(0)
    bundle = requests.get(INSTAGRAM_URL + bundle_path)
    # Raw-string form of the original escaped pattern; identical at runtime.
    pattern = r'e\.profilePosts\.byUserId\.get\(t\)\)\?n\.pagination:n},queryId:"([^"]*)"'
    matches = re.findall(pattern, bundle.text)
    return matches[0]
def get_user_id(html):
    """Pull the numeric user id out of the page's logging_page_id field."""
    match = re.search(r'logging_page_id":"([^"]*)"', html)
    # Field looks like "profilePage_<id>"; keep only the id part.
    _, _, user_id = match.group(1).partition("_")
    return user_id
def get_rhx_gis(html):
    """Extract the rhx_gis signing token embedded in the page HTML."""
    found = re.search(r'rhx_gis":"([^"]*)"', html)
    return found.group(1)
def get_end_cursor_from_html(html):
    """Return the first pagination end_cursor found in the raw page HTML."""
    cursor_match = re.search(r'end_cursor":"([^"]*)"', html)
    return cursor_match.group(1)
def get_end_cursor_from_json(json_obj):
    """Return the pagination cursor from a GraphQL response payload."""
    page_info = json_obj['data']['user']['edge_owner_to_timeline_media']['page_info']
    return page_info['end_cursor']
def get_params(id, end_cursor):
    """Build the JSON-encoded `variables` string for the GraphQL query.

    Note: `id` shadows the builtin; kept for interface compatibility.
    """
    template = '{{"id":"{}","first":12,"after":"{}"}}'
    return template.format(id, end_cursor)
def get_ig_gis(rhx_gis, params):
    """Compute the x-instagram-gis request signature.

    Instagram expects the MD5 hex digest of "<rhx_gis>:<params>".
    Fixed: uses `hashlib` (Python 2.5+ and 3.x) instead of the
    Python-2-only `md5` module, which was removed in Python 3.
    Digest is byte-for-byte identical to the old md5.new(...) result.
    """
    payload = rhx_gis + ":" + params
    return hashlib.md5(payload.encode("utf-8")).hexdigest()
def get_posts_from_json(json_obj):
    """Unwrap the list of post nodes from a GraphQL timeline response."""
    timeline = json_obj['data']['user']['edge_owner_to_timeline_media']
    posts = []
    for edge in timeline['edges']:
        posts.append(edge['node'])
    return posts
def make_cookies(csrf_token):
    """Cookie dict carrying only the CSRF token Instagram expects back."""
    return dict(csrftoken=csrf_token)
def make_headers(ig_gis):
    """Request headers for signed GraphQL calls (gis signature + UA)."""
    headers = {"user-agent": USER_AGENT}
    headers["x-instagram-gis"] = ig_gis
    headers["x-requested-with"] = "XMLHttpRequest"
    return headers
def get_next_page(csrf_token, ig_gis, query_id, params):
    """Request one page of timeline posts from the GraphQL endpoint.

    Returns (posts, end_cursor). Raises requests.HTTPError on a
    non-2xx response via raise_for_status().
    """
    endpoint = INSTAGRAM_URL + HASHTAG_ENDPOINT.format(query_id, params)
    response = requests.get(
        endpoint,
        headers=make_headers(ig_gis),
        cookies=make_cookies(csrf_token),
    )
    response.raise_for_status()
    payload = response.json()
    return get_posts_from_json(payload), get_end_cursor_from_json(payload)
def scrape_username(username, sleep=3):
    """Generator that yields every timeline post of a public profile.

    Fetches the profile page once to collect the session tokens
    (csrf, query id, rhx_gis, user id), yields the posts embedded in
    that page, then follows the pagination cursor through the GraphQL
    endpoint, sleeping `sleep` seconds between page fetches.
    """
    response, first_posts, cursor = get_first_page(username)
    page_html = response.text
    csrf_token = get_csrf_token(response.cookies)
    query_id = get_query_id(page_html)
    rhx_gis = get_rhx_gis(page_html)
    user_id = get_user_id(page_html)

    for post in first_posts:
        yield post
    time.sleep(sleep)

    # Cursor becomes None when Instagram reports no further pages.
    while cursor is not None:
        variables = get_params(user_id, cursor)
        signature = get_ig_gis(rhx_gis, variables)
        page_posts, cursor = get_next_page(csrf_token, signature, query_id, variables)
        for post in page_posts:
            yield post
        time.sleep(sleep)
# main
if __name__ == "__main__":
    # Guard so importing this module no longer triggers a full scrape
    # as a side effect (the original ran at import time).
    for post in scrape_username("utsav_ketankr9"):
        # Fixed: py2-only `print a, b` statement; this form emits the
        # same "id url" line on both Python 2 and Python 3.
        print("{} {}".format(post['id'], post['display_url']))
        # do stuff
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment