Skip to content

Instantly share code, notes, and snippets.

@tomkdickinson
Created January 5, 2017 20:19
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 8 You must be signed in to fork a gist
  • Save tomkdickinson/a093d30523dd77ae970f3ffcf26e1344 to your computer and use it in GitHub Desktop.
Save tomkdickinson/a093d30523dd77ae970f3ffcf26e1344 to your computer and use it in GitHub Desktop.
Followers Extraction Instagram
import json
import requests
import logging as log
# Configure the root logger at DEBUG level so that the log.info() calls made
# by FollowerExtractor (and any lower-level DEBUG records) are emitted.
log.basicConfig(level=log.DEBUG)
class FollowerExtractor:
    """
    Extracts followers for a given profile

    An instance logs in to Instagram once (in ``__init__``) and keeps the
    resulting CSRF token and cookie string. ``extract_followed_by`` and
    ``extract_following`` then page through the ``followed_by`` / ``follows``
    lists of a profile by POSTing to ``https://www.instagram.com/query/``.

    Subclasses can override ``save_followed_by`` / ``save_following`` to
    persist each page of users as it arrives; the defaults print usernames.
    """

    # Number of users requested per page of results.
    PAGE_SIZE = 20

    # Fields requested for every user, plus the paging info. This tail is
    # appended to the "ig_user(...) { <edge>.first(...) {" query prefix.
    _QUERY_TAIL = (
        " count,"
        " nodes {"
        " id,"
        " is_verified,"
        " followed_by_viewer,"
        " requested_by_viewer,"
        " full_name,"
        " profile_pic_url,"
        " username"
        " },"
        " page_info {"
        " end_cursor,"
        " has_next_page"
        " }"
        "}"
        " }"
    )

    def __init__(self, username, password):
        """
        Logs in and stores the session credentials on the instance.
        :param username: Instagram username to log in with
        :param password: Instagram password to log in with
        """
        self.csrf_token, self.cookie_string = FollowerExtractor.login_instagram(username, password)
        log.info("CSRF Token set to %s", self.csrf_token)
        # NOTE(review): the cookie string carries the session cookies, so this
        # line writes credentials to the log; consider removing it outside of
        # debugging.
        log.info("Cookie String set to %s", self.cookie_string)

    @staticmethod
    def _cookie_header(cookies):
        """
        Builds the value of a ``Cookie`` request header.
        :param cookies: A mapping-like object of cookie names to values
            (anything with ``items()``, e.g. the ``cookies`` of a response)
        :return: A string of the form ``name1=value1; name2=value2``
        """
        # A Cookie request header carries only name=value pairs; attributes
        # such as "expires" or "Path" belong to Set-Cookie and must not be
        # echoed back to the server.
        return "; ".join("%s=%s" % (name, value) for name, value in cookies.items())

    @staticmethod
    def get_csrf_and_cookie_string():
        """
        Fetches the anonymous CSRF token and cookies needed to log in.
        :return: A tuple of (csrf token, cookie header string)
        :raises KeyError: If the response sets no ``csrftoken`` cookie
        """
        resp = requests.head("https://www.instagram.com")
        return resp.cookies['csrftoken'], FollowerExtractor._cookie_header(resp.cookies)

    @staticmethod
    def login_instagram(username, password):
        """
        Logs in to Instagram through the ajax login endpoint.
        :param username: Instagram username
        :param password: Instagram password
        :return: A tuple of (csrf token, cookie header string) taken from the
            cookies set by the login response
        :raises KeyError: If the login response sets no ``csrftoken`` cookie
        """
        csrf_token, cookie_string = FollowerExtractor.get_csrf_and_cookie_string()
        data = {"username": username, "password": password}
        # No explicit content-length header: requests derives it from the
        # encoded form body, whose size depends on the credentials.
        resp = requests.post(
            "https://www.instagram.com/accounts/login/ajax/",
            data=data,
            headers={
                "referer": "https://www.instagram.com/",
                "accept": "*/*",
                "Accept-Language": "en-GB,en;q=0.8",
                "cache-control": "no-cache",
                "Content-Type": "application/x-www-form-urlencoded",
                "cookie": cookie_string,
                "origin": "https://www.instagram.com",
                "pragma": "no-cache",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "x-csrftoken": csrf_token,
                "x-instagram-ajax": "1",
                "X-Requested-With": "XMLHttpRequest",
            },
        )
        return resp.cookies['csrftoken'], FollowerExtractor._cookie_header(resp.cookies)

    @staticmethod
    def _lookup_user_id(username):
        """
        Resolves a username to its numeric user id via the profile JSON.
        :param username: The profile's username
        :return: The value of ``user.id`` in the profile JSON
        """
        # The trailing slash avoids a 301 redirect from /<username>?__a=1.
        resp = requests.get("https://www.instagram.com/%s/?__a=1" % username)
        return json.loads(resp.text)['user']['id']

    def _extract_relationship(self, edge, query, save, username, user_id):
        """
        Pages through one relationship list and returns every user found.
        :param edge: Key of the list in the response ('followed_by' or 'follows')
        :param query: Callable(username, user_id, end_cursor) returning one
            decoded response page
        :param save: Callable invoked with each page's list of users
        :param username: The profile's username
        :param user_id: The profile's user id, or None to look it up
        :return: A list of all user dicts from all pages
        """
        if user_id is None:
            user_id = self._lookup_user_id(username)

        collected = []
        end_cursor = None
        while True:
            resp = query(username, user_id, end_cursor)
            edge_data = resp.get(edge) or {}
            nodes = edge_data.get('nodes')
            if nodes is None:
                # Some responses carry only a count and no user list; stop
                # with what we have rather than raising KeyError.
                log.warning("No '%s' nodes in response for %s: %s", edge, username, resp)
                break
            save(nodes)
            collected.extend(nodes)

            page_info = edge_data.get('page_info') or {}
            end_cursor = page_info.get('end_cursor')
            # Without a cursor the next request would re-fetch the first page
            # forever, so treat a missing cursor as the end of the list.
            if not page_info.get('has_next_page') or not end_cursor:
                break
        return collected

    def extract_followed_by(self, username, user_id=None):
        """
        Extracts every user that follows the given profile.
        :param username: The profile's username
        :param user_id: The profile's user id; looked up when None
        :return: A list of user dicts accumulated over all pages
        """
        return self._extract_relationship(
            'followed_by', self.query_followed_by, self.save_followed_by, username, user_id)

    def extract_following(self, username, user_id=None):
        """
        Extracts every user that the given profile follows.
        :param username: The profile's username
        :param user_id: The profile's user id; looked up when None
        :return: A list of user dicts accumulated over all pages
        """
        return self._extract_relationship(
            'follows', self.query_following, self.save_following, username, user_id)

    def query_following(self, username, user_id, end_cursor=None):
        """
        Requests one page of the users the profile follows.
        :param username: The profile's username (used for the referer)
        :param user_id: The profile's user id
        :param end_cursor: Cursor of the previous page, or None for the first
        :return: The decoded JSON response
        """
        headers = self.get_headers("https://www.instagram.com/%s" % username)
        post_data = self.get_following_params(user_id, end_cursor)
        req = requests.post("https://www.instagram.com/query/", data=post_data, headers=headers)
        return json.loads(req.text)

    def query_followed_by(self, username, user_id, end_cursor=None):
        """
        Requests one page of the users following the profile.
        :param username: The profile's username (used for the referer)
        :param user_id: The profile's user id
        :param end_cursor: Cursor of the previous page, or None for the first
        :return: The decoded JSON response
        """
        headers = self.get_headers("https://www.instagram.com/%s" % username)
        post_data = self.get_followed_by_params(user_id, end_cursor)
        req = requests.post("https://www.instagram.com/query/", data=post_data, headers=headers)
        return json.loads(req.text)

    def get_headers(self, referrer):
        """
        Returns a bunch of headers we need to use when querying Instagram
        :param referrer: The page referrer URL
        :return: A dict of headers
        """
        return {
            "referer": referrer,
            "accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "en-GB,en;q=0.8,en-US;q=0.6",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "cookie": self.cookie_string,
            "origin": "https://www.instagram.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/49.0.2623.87 Safari/537.36",
            "x-csrftoken": self.csrf_token,
            "x-instagram-ajax": "1",
            "X-Requested-With": "XMLHttpRequest",
        }

    @staticmethod
    def _relationship_params(edge, user_id, end_cursor=None):
        """
        Builds the POST parameters for one page of a relationship list.
        :param edge: Name of the list to query ('followed_by' or 'follows')
        :param user_id: Id of the user whose list is queried
        :param end_cursor: The end cursor Instagram specifies, or None for
            the first page
        :return: A dict of request parameters
        """
        page_size = FollowerExtractor.PAGE_SIZE
        if end_cursor is None:
            start_query = "ig_user(%s) { %s.first(%d) {" % (user_id, edge, page_size)
        else:
            start_query = "ig_user(%s) { %s.after(%s, %d) {" % (user_id, edge, end_cursor, page_size)
        return {
            'q': start_query + FollowerExtractor._QUERY_TAIL,
            "ref": "relationships::follow_list",
        }

    @staticmethod
    def get_followed_by_params(user_id, end_cursor=None):
        """
        Returns the query params required to load next page on Instagram.
        This can be modified to return less information.
        :param user_id: Id of the user whose followers we're querying
        :param end_cursor: The end cursor Instagram specifies
        :return: A dict of request parameters
        """
        return FollowerExtractor._relationship_params('followed_by', user_id, end_cursor)

    @staticmethod
    def get_following_params(user_id, end_cursor=None):
        """
        Returns the query params required to load next page on Instagram.
        This can be modified to return less information.
        :param user_id: Id of the user whose followed accounts we're querying
        :param end_cursor: The end cursor Instagram specifies
        :return: A dict of request parameters
        """
        return FollowerExtractor._relationship_params('follows', user_id, end_cursor)

    def save_following(self, following):
        """
        Called when a new batch of following users has been extracted from Instagram
        :param following: Users from the profile's 'follows' list
        """
        for user in following:
            print("Following: %s" % user['username'])

    def save_followed_by(self, followed_by):
        """
        Called when a new batch of followed_by users has been extracted from Instagram
        :param followed_by: Users from the profile's 'followed_by' list
        """
        for user in followed_by:
            print("Followed By: %s" % user['username'])
if __name__ == '__main__':
    # Example run: log in with placeholder credentials, then dump both
    # relationship lists of one profile (each page is printed as it arrives).
    target_profile = "justintimberlake"
    extractor = FollowerExtractor("your_username", "your_password")
    extractor.extract_following(target_profile)
    extractor.extract_followed_by(target_profile)
@diegofcoelho
Copy link

diegofcoelho commented Jan 5, 2017

Writing here just to make it easier, but the result was the same as before.
Maybe some country-related restriction? I tried logging in with different accounts (not all mine) and it did not work.

`

DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "HEAD / HTTP/1.1" 200 0
DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "POST /accounts/login/ajax/ HTTP/1.1" 200 59
INFO:root:CSRF Token set to XXXXXXXXXX
INFO:root:Cookie String set to csrftoken= XXXXXXXXXXX; expires=Thu, 04-Jan-2018 20:32:04 GMT; Max-Age=31449600; Path=/; secure
DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "GET /dfcoelho?__a=1 HTTP/1.1" 301 0
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "GET /dfcoelho/?__a=1 HTTP/1.1" 200 414
DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "POST /query/ HTTP/1.1" 200 43

{"status": "ok", "follows": {"count": 394}}

Traceback (most recent call last):
  File "C:/Users/dfcoelho/Documents/GitHub/ds/dev/beta.py", line 198, in <module>
    followed_extractor.extract_following("dfcoelho")
  File "C:/Users/dfcoelho/Documents/GitHub/ds/dev/beta.py", line 65, in extract_following
    followers = resp['follows']['nodes']
KeyError: 'nodes'

`

@tomkdickinson
Copy link
Author

It's using Python 3, and requests 2.12.4.

The issue I see with that though, is you're trying to extract from (your?) private account. Are the users you're logging in with, following you?

@diegofcoelho
Copy link

diegofcoelho commented Jan 5, 2017

Yep, I am following them and vice versa.
Another account I am checking is even public.
So I logged in with the public account and tried to get its own users, with no success.

Crazy, hah?

Here I am on Python 3.5, same request

One thing that bugs me is that in the response (html) this string appears:

"

This page could not be loaded. If you have cookies disabled in your browser, or you are browsing in Private Mode, please try enabling cookies or turning off Private Mode, and then retrying your action.

"

How come?

@tomkdickinson
Copy link
Author

Hmm, that almost sounds like it's not sending the cookie string to the server.

Try adding this to the top of the script:

import http.client as http_client
http_client.HTTPConnection.debuglevel = 1

It should log the headers. See if a cookie header is being sent after it's logged in.

@diegofcoelho
Copy link

diegofcoelho commented Jan 5, 2017

Until the login is fine:

header: Set-Cookie header: Content-Language header: Expires header: Pragma header: Vary header: Content-Type header: Content-Encoding header: Cache-Control header: Strict-Transport-Security header: Date header: X-Frame-Options header: Set-Cookie header: Set-Cookie header: Connection header: Content-Length send: b'POST /accounts/login/ajax/ HTTP/1.1\r\nHost: www.instagram.com\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36\r\nAccept-Encoding: gzip, deflate\r\naccept: */*\r\nConnection: keep-alive\r\nreferer: https://www.instagram.com/justintimberlake/\r\nAccept-Language: en-GB,en;q=0.8\r\ncache-control: no-cache\r\nContent-Length: 42\r\nContent-Type: application/x-www-form-urlencoded\r\ncookie: sessionid=; expires=Thu, 01-Jan-1970 00:00:00 GMT; Max-Age=0; Path=/; HttpOnly; Domain=instagram.com, mid= **YYYY** ; expires=Wed, 31-Dec-2036 22:06:30 GMT; Max-Age=630720000; Path=/, csrftoken= **XXXX** ; expires=Thu, 04-Jan-2018 22:06:30 GMT; Max-Age=31449600; Path=/; Secure\r\norigin: https://www.instagram.com\r\npragma: no-cache\r\nx-csrftoken: **XXXX** \r\nx-instagram-ajax: 1\r\nX-Requested-With: XMLHttpRequest\r\n\r\n'

send: b'username=U&password=P'

reply: 'HTTP/1.1 200 OK\r\n'

Then this appears:

## reply: 'HTTP/1.1 301 Moved Permanently\r\n'

But apparently it sent the POST fine:

header: Strict-Transport-Security header: Content-Type header: Vary header: Content-Encoding header: Cache-Control header: Expires header: Content-Language header: Date header: Pragma header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Connection header: Content-Length send: b'POST /query/ HTTP/1.1\r\nHost: www.instagram.com\r\nUser-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36\r\nAccept-Encoding: gzip, deflate\r\naccept: application/json, text/javascript, */*; q=0.01\r\nConnection: keep-alive\r\nreferer: https://www.instagram.com/justintimberlake\r\nAccept-Language: en-GB,en;q=0.8,en-US;q=0.6\r\nContent-Type: application/x-www-form-urlencoded; charset=UTF-8\r\ncookie: ds_user_id=3667668898; expires=Wed, 05-Apr-2017 22:14:58 GMT; Max-Age=7776000; Path=/, csrftoken=**XXXX**; expires=Thu, 04-Jan-2018 22:14:58 GMT; Max-Age=31449600; Path=/; Secure, target=""; expires=Thu, 01-Jan-1970 00:00:00 GMT; Max-Age=0; Path=/, sessionid=**ZZZ**; expires=Wed, 05-Apr-2017 22:14:58 GMT; HttpOnly; Max-Age=7776000; Path=/; Secure\r\norigin: https://www.instagram.com\r\nx-csrftoken: **XXXX**\r\nx-instagram-ajax: 1\r\nX-Requested-With: XMLHttpRequest\r\nContent-Length: 307\r\n\r\n'

@tomkdickinson
Copy link
Author

Not sure if it's the redirect that's causing it. I get that as well, and my response is fine.

I'll see if I can replicate your issue over the weekend. It could be a geo location thing, so I'll try it out on a VPN as well.

@diegofcoelho
Copy link

diegofcoelho commented Jan 7, 2017

The 301 was due to a couple of missing slashes. I tried to submit a fix to you showing where, but I am not sure if it was pushed properly (I have actually never collaborated on anything on GitHub).

I tried using a VPN to the UK and got the same results. I also tried on Linux. Same. Maybe a user geotag? I might try creating a user through the VPN and testing.

@andreasasprou
Copy link

Has anyone got this working? When I run it, it manages to get the token, but I have a 403 response from the authenticated requests. Any advice?

@zm030215
Copy link

I cannot get it to work either; I also got a 403 response. I assume Instagram is blocking some personal applications.

@samequefarias
Copy link

Good Morning,
I'm trying to find some algorithm in python that can collect geolocation data from instagram here in my city. Do you know any way? Thank you very much in advance.

@OwlGreenApple
Copy link

Hi, does this code still work? Instagram made some changes recently, and it now needs a variable like query_id.

@HaiGenkiDes
Copy link

@samequefarias - geolocation instagram
This is one workaround but i'm sure there is a better way,

  1. make a call to facebook graph API with lat lon of city -> get IDs of locations
    https://graph.facebook.com/search?q=&type=place&center=51.5074,-0.1278&distance=5000&access_token=ACCESS-TOKEN
  2. make a call to the Instagram API with Facebook location IDs -> get Instagram location IDs
    https://api.instagram.com/v1/locations/search?facebook_places_id=273471170716&access_token=ACCESS-TOKEN
  3. make a call to instagram api with location ID -> get recent media
    https://api.instagram.com/v1/locations/30824484/media/recent?access_token=ACCESS-TOKEN

@RomanKlimov
Copy link

How did you fix it? I have the same error now.

@fnbrs
Copy link

fnbrs commented May 5, 2018

Seems i can't even log in.

send: b'password=string&username=string'
reply: 'HTTP/1.1 403 Forbidden\r\n'

What am I doing wrong?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment