Skip to content

Instantly share code, notes, and snippets.

Last active May 23, 2019 11:07
Show Gist options
  • Save marcoqu/e17e1c4414f8d18e6672976d941161fa to your computer and use it in GitHub Desktop.
Save marcoqu/e17e1c4414f8d18e6672976d941161fa to your computer and use it in GitHub Desktop.
# pylint: skip-file
import time
import re
import md5
import requests
import json
# Base URL that every request path in this module is appended to.
INSTAGRAM_URL = "https://www.instagram.com"
# Path template for paginated hashtag queries: filled with (query_hash, variables).
HASHTAG_ENDPOINT = "/graphql/query/?query_hash={}&variables={}"
# Browser-like user agent sent with the page and GraphQL requests.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
def get_first_page(hashtag):
    """Fetch the explore page for ``hashtag``.

    Issues a GET to ``INSTAGRAM_URL + "/explore/tags/<hashtag>/"`` with the
    module's ``USER_AGENT`` header and returns the ``requests`` response
    object unchanged (callers read ``.text`` and ``.cookies`` from it).
    """
    url = INSTAGRAM_URL + "/explore/tags/{}/".format(hashtag)
    return requests.get(url, headers={"user-agent": USER_AGENT})
def get_csrf_token(cookies):
    """Return the value stored under ``"csrftoken"`` in ``cookies``.

    ``cookies`` is any mapping-like object with a ``get`` method; when the
    key is absent, whatever ``cookies.get`` returns for a missing key
    (``None`` for a dict) is passed back to the caller.
    """
    return cookies.get("csrftoken")
def get_query_id(html):
    """Extract the hashtag query id from the script referenced by ``html``.

    Looks in ``html`` for the path of the ``TagPageContainer`` script,
    downloads that script from ``INSTAGRAM_URL`` and returns the first
    ``queryId`` value that follows the tag-media pagination snippet.

    Raises:
        AttributeError: if no script path is found in ``html``.
        IndexError: if the downloaded script contains no matching queryId.
    """
    # group(0) is the whole matched path, not one of the capture groups.
    script_path = re.search(
        r'/static(.*)TagPageContainer\.js/(.*).js', html
    ).group(0)
    script_req = requests.get(INSTAGRAM_URL + script_path)
    query_ids = re.findall(
        r'return e.tagMedia.byTagName.get\(t\).pagination},queryId:"([^"]*)"',
        script_req.text,
    )
    return query_ids[0]
def get_rhx_gis(html):
    """Return the ``rhx_gis`` value embedded in ``html``.

    Captures the quoted string that follows ``rhx_gis":`` in the page text.

    Raises:
        AttributeError: if ``html`` contains no such value.
    """
    return re.search('rhx_gis":"([^"]*)"', html).group(1)
def get_end_cursor_from_html(html):
    """Return the first quoted ``end_cursor`` value found in ``html``.

    Returns ``None`` when the page holds no quoted cursor, so that the
    ``end_cursor is not None`` pagination loop in ``scrape_hashtag`` simply
    stops instead of crashing on a failed match.
    """
    match = re.search('end_cursor":"([^"]*)"', html)
    # Presumably an unquoted/null cursor means there is no further page.
    return match.group(1) if match else None
def get_end_cursor_from_json(json_obj):
    """Return the pagination cursor from a decoded hashtag query response.

    Reads ``data -> hashtag -> edge_hashtag_to_media -> page_info ->
    end_cursor`` from ``json_obj`` and returns it as-is.

    Raises:
        KeyError: if any level of that path is missing.
    """
    page_info = json_obj['data']['hashtag']['edge_hashtag_to_media']['page_info']
    return page_info['end_cursor']
def get_params(hashtag, end_cursor):
    """Build the ``variables`` string for one hashtag page request.

    Returns compact JSON-shaped text with ``tag_name`` set to ``hashtag``,
    ``first`` fixed at 50 and ``after`` set to ``end_cursor``. The values
    are inserted verbatim (no escaping).
    """
    # Doubled braces are literal braces in str.format templates.
    template = '{{"tag_name":"{}","first":50,"after":"{}"}}'
    return template.format(hashtag, end_cursor)
def get_ig_gis(rhx_gis, params):
    """Return the hex MD5 digest of ``rhx_gis + ":" + params``.

    The result is what ``make_headers`` sends as ``x-instagram-gis``.
    Both arguments are text; the joined string is UTF-8 encoded before
    hashing.
    """
    # Local import: the legacy ``md5`` module does not exist on Python 3,
    # while ``hashlib`` is available on both Python 2 and 3.
    import hashlib

    payload = rhx_gis + ":" + params
    return hashlib.md5(payload.encode("utf-8")).hexdigest()
def get_posts_from_json(json_obj):
    """Return the list of post nodes held in ``json_obj``.

    ``json_obj`` is the mapping that directly contains the ``hashtag`` key
    (callers pass the ``graphql`` or ``data`` sub-object). Each entry of
    ``hashtag -> edge_hashtag_to_media -> edges`` contributes its ``node``.

    Raises:
        KeyError: if the expected keys are missing.
    """
    edges = json_obj['hashtag']['edge_hashtag_to_media']['edges']
    return [edge['node'] for edge in edges]
def get_posts_from_html(html):
    """Return the post nodes embedded in the tag page ``html``.

    Extracts the JSON assigned to ``window._sharedData``, decodes it, and
    hands the first ``TagPage`` entry's ``graphql`` object to
    ``get_posts_from_json``.

    Raises:
        AttributeError: if the ``window._sharedData`` assignment is not found.
        KeyError / IndexError: if the decoded JSON lacks the expected path.
    """
    json_str = re.search('window._sharedData = (.*);</script>', html).group(1)
    shared_data = json.loads(json_str)
    graphql = shared_data["entry_data"]["TagPage"][0]["graphql"]
    return get_posts_from_json(graphql)
def make_cookies(csrf_token):
    """Return the cookie dict sent with paginated requests.

    Contains a fixed ``ig_pr`` of ``"2"`` and the given ``csrf_token``
    under ``csrftoken``.
    """
    return {
        "ig_pr": "2",
        "csrftoken": csrf_token,
    }
def make_headers(ig_gis):
    """Return the header dict sent with paginated requests.

    Carries ``ig_gis`` as ``x-instagram-gis``, marks the request as an
    XMLHttpRequest, and reuses the module's ``USER_AGENT``.
    """
    return {
        "x-instagram-gis": ig_gis,
        "x-requested-with": "XMLHttpRequest",
        "user-agent": USER_AGENT,
    }
def get_next_page(csrf_token, ig_gis, query_id, params):
    """Fetch one page of hashtag results.

    Builds the query URL from ``query_id`` and ``params``, sends it with the
    cookies from ``make_cookies(csrf_token)`` and the headers from
    ``make_headers(ig_gis)``, and decodes the JSON response.

    Returns:
        A ``(posts, end_cursor)`` tuple: the post nodes of this page and the
        cursor to pass to the next call.
    """
    url = INSTAGRAM_URL + HASHTAG_ENDPOINT.format(query_id, params)
    req = requests.get(
        url,
        headers=make_headers(ig_gis),
        cookies=make_cookies(csrf_token),
    )
    json_obj = req.json()
    # The cursor helper takes the full response; the posts helper takes the
    # "data" sub-object.
    end_cursor = get_end_cursor_from_json(json_obj)
    posts = get_posts_from_json(json_obj['data'])
    return posts, end_cursor
def scrape_hashtag(hashtag, sleep=3):
    """
    Yields scraped posts, one by one

    Loads the tag page for ``hashtag``, yields the posts embedded in it,
    then keeps requesting further pages for as long as a non-``None``
    cursor is returned.

    ``sleep`` is the number of seconds to pause between page requests.
    """
    first_page = get_first_page(hashtag)
    html = first_page.text

    csrf_token = get_csrf_token(first_page.cookies)
    query_id = get_query_id(html)
    rhx_gis = get_rhx_gis(html)
    end_cursor = get_end_cursor_from_html(html)

    for post in get_posts_from_html(html):
        yield post

    while end_cursor is not None:
        params = get_params(hashtag, end_cursor)
        # The signature is computed over the same params string that is sent.
        ig_gis = get_ig_gis(rhx_gis, params)
        posts, end_cursor = get_next_page(csrf_token, ig_gis, query_id, params)
        for post in posts:
            yield post
        # Throttle only when another page will actually be requested.
        if end_cursor is not None:
            time.sleep(sleep)
# main
if __name__ == "__main__":
    # Guarded so that importing this module does not start a network scrape.
    for post in scrape_hashtag("summer"):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(post['id'])
        # do stuff
Copy link

Has somebody had problems with the scraper?

Basically, the cookies no longer contain the CSRF token, so the scraper breaks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment