Skip to content

Instantly share code, notes, and snippets.

@ketankr9
Forked from marcoqu/scraper.py
Last active June 1, 2019 07:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ketankr9/6e48c6c205907e6ae35ef789e7a03634 to your computer and use it in GitHub Desktop.
Script for scraping a public Instagram profile's timeline photos.
# pylint: skip-file
import hashlib
import json
import md5
import re
import time

import requests
INSTAGRAM_URL = "https://www.instagram.com"
HASHTAG_ENDPOINT = "/graphql/query/?query_hash={}&variables={}"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
def get_first_page(username):
    """Fetch a user's public profile page and parse the first batch of posts.

    Returns a tuple (response, posts, end_cursor) where `posts` is the list
    of post nodes embedded in the page's window._sharedData JSON and
    `end_cursor` is the pagination cursor for the next GraphQL request.
    """
    response = requests.get(
        INSTAGRAM_URL + "/{}/".format(username),
        headers={"user-agent": USER_AGENT},
    )
    # The profile data is inlined into the HTML as a JS assignment.
    raw = response.text.split("window._sharedData = ")[1].split(";</script>")[0]
    shared_data = json.loads(raw)
    cursor = get_end_cursor_from_html(response.text)
    media = shared_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["edges"]
    return (response, [edge['node'] for edge in media], cursor)
def get_csrf_token(cookies):
    """Return the CSRF token from a cookie jar, or None if it is absent."""
    token = cookies.get("csrftoken")
    return token
def get_query_id(html):
    """Extract the GraphQL query hash for timeline pagination.

    Locates the ProfilePageContainer bundle referenced in the page HTML,
    downloads it, and pulls the queryId constant out of the minified JS.
    """
    bundle_path = re.search(r'/static(.*)ProfilePageContainer\.js/(.*).js', html).group(0)
    bundle = requests.get(INSTAGRAM_URL + bundle_path)
    # Raw-string form of the original escaped pattern; identical at runtime.
    pattern = r'e\.profilePosts\.byUserId\.get\(t\)\)\?n\.pagination:n},queryId:"([^"]*)"'
    matches = re.findall(pattern, bundle.text)
    return matches[0]
def get_user_id(html):
    """Pull the numeric user id out of the page's logging_page_id field."""
    match = re.search(r'logging_page_id":"([^"]*)"', html)
    # Field looks like "profilePage_<id>"; keep only the id part.
    _, _, user_id = match.group(1).partition("_")
    return user_id
def get_rhx_gis(html):
    """Extract the rhx_gis signing token embedded in the page HTML."""
    found = re.search(r'rhx_gis":"([^"]*)"', html)
    return found.group(1)
def get_end_cursor_from_html(html):
    """Return the first pagination end_cursor found in the raw page HTML."""
    cursor_match = re.search(r'end_cursor":"([^"]*)"', html)
    return cursor_match.group(1)
def get_end_cursor_from_json(json_obj):
    """Return the pagination cursor from a GraphQL response payload."""
    page_info = json_obj['data']['user']['edge_owner_to_timeline_media']['page_info']
    return page_info['end_cursor']
def get_params(id, end_cursor):
    """Build the JSON-encoded `variables` string for the GraphQL query.

    Note: `id` shadows the builtin; kept for interface compatibility.
    """
    template = '{{"id":"{}","first":12,"after":"{}"}}'
    return template.format(id, end_cursor)
def get_ig_gis(rhx_gis, params):
    """Compute the x-instagram-gis request signature.

    Instagram expects the MD5 hex digest of "<rhx_gis>:<params>".
    Fixed: uses `hashlib` (Python 2.5+ and 3.x) instead of the
    Python-2-only `md5` module, which was removed in Python 3.
    Digest is byte-for-byte identical to the old md5.new(...) result.
    """
    payload = rhx_gis + ":" + params
    return hashlib.md5(payload.encode("utf-8")).hexdigest()
def get_posts_from_json(json_obj):
    """Unwrap the list of post nodes from a GraphQL timeline response."""
    timeline = json_obj['data']['user']['edge_owner_to_timeline_media']
    posts = []
    for edge in timeline['edges']:
        posts.append(edge['node'])
    return posts
def make_cookies(csrf_token):
    """Cookie dict carrying only the CSRF token Instagram expects back."""
    return dict(csrftoken=csrf_token)
def make_headers(ig_gis):
    """Request headers for signed GraphQL calls (gis signature + UA)."""
    headers = {"user-agent": USER_AGENT}
    headers["x-instagram-gis"] = ig_gis
    headers["x-requested-with"] = "XMLHttpRequest"
    return headers
def get_next_page(csrf_token, ig_gis, query_id, params):
    """Request one page of timeline posts from the GraphQL endpoint.

    Returns (posts, end_cursor). Raises requests.HTTPError on a
    non-2xx response via raise_for_status().
    """
    endpoint = INSTAGRAM_URL + HASHTAG_ENDPOINT.format(query_id, params)
    response = requests.get(
        endpoint,
        headers=make_headers(ig_gis),
        cookies=make_cookies(csrf_token),
    )
    response.raise_for_status()
    payload = response.json()
    return get_posts_from_json(payload), get_end_cursor_from_json(payload)
def scrape_username(username, sleep=3):
    """Generator that yields every timeline post of a public profile.

    Fetches the profile page once to collect the session tokens
    (csrf, query id, rhx_gis, user id), yields the posts embedded in
    that page, then follows the pagination cursor through the GraphQL
    endpoint, sleeping `sleep` seconds between page fetches.
    """
    response, first_posts, cursor = get_first_page(username)
    page_html = response.text
    csrf_token = get_csrf_token(response.cookies)
    query_id = get_query_id(page_html)
    rhx_gis = get_rhx_gis(page_html)
    user_id = get_user_id(page_html)

    for post in first_posts:
        yield post
    time.sleep(sleep)

    # Cursor becomes None when Instagram reports no further pages.
    while cursor is not None:
        variables = get_params(user_id, cursor)
        signature = get_ig_gis(rhx_gis, variables)
        page_posts, cursor = get_next_page(csrf_token, signature, query_id, variables)
        for post in page_posts:
            yield post
        time.sleep(sleep)
# main
if __name__ == "__main__":
    # Guard so importing this module no longer triggers a full scrape
    # as a side effect (the original ran at import time).
    for post in scrape_username("utsav_ketankr9"):
        # Fixed: py2-only `print a, b` statement; this form emits the
        # same "id url" line on both Python 2 and Python 3.
        print("{} {}".format(post['id'], post['display_url']))
        # do stuff
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment