Collect up to 3000 likers from any LinkedIn post and export them to a .csv file 💞🔵
# =============================================================================
# Title: LinkedIn Post Likers Scraper
# Description: This script can scrape up to 3000 likers from any LinkedIn post.
# Author: Sasha Bouloudnine
# Date: 2024-03-05
#
# Usage:
# - Install requests using `pip install requests`.
# - Log in to LinkedIn and collect your `li_at` cookie.
# - Get your post URL.
# - Run the script.
#
# =============================================================================
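# Note: one way to find the `li_at` cookie is via your browser's DevTools
# (e.g. Chrome: DevTools > Application > Cookies > https://www.linkedin.com,
# then copy the value of `li_at`). The exact menu path may vary by browser.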
import requests
import csv
import time
import re
# Update your LI_AT below
# vvvvvvvvvvvvvvvvvvvv
LI_AT = "AQEFARABAAAAAA582xIAAAGN-ncr4gAAAY4eg6_iTQAAs3VybjpsaTplbnRlcnByaXNlQXV0aFRva2VuOmVKeGpaQUFDZGcvUE9TQ2FlMFBHRkJETkVmYjlDU09JWWZlTmJ4V1lJWFhtL0M4R1JnQzJmZ25FXnVybjpsaTplbnRlcnByaXNlUHJvZmlsZToodXJuOmxpOmVudGVycHJpc2VBY2NvdW50OjEyMjE3Nzk0OCwxOTYxMTA0ODQpXnVybjpsaTptZW1iZXI6MzA0NzQwNDUzCJCilw8ToxGdMzR3SPl1TqCZTknBs1duxKFK7L6EsksXVkem6Xq-ZOZRNLuEfpl_6xFR2zcQqQWMPlKPlKJq5AzO8H1mffd4EgVN-MaTu0UEMZdnhd6sLxssWLyAOjDkvPpeab6WM2CfbRitkYiIqyurdTCQrck9Cr3ghlmSBGZlFScZ7xRXu3Xpn3q07cYgenQ5vw"
# Add your URL below
# vvvvvvvvvvvvvvvvvvvv
URL = "https://www.linkedin.com/posts/williamhgates_us-carbon-emissions-fell-in-2023-as-coal-activity-7156808265396285440-EV0P/?utm_source=share&utm_medium=member_desktop"
# Start of the script
HEADERS = {
    'authority': 'www.linkedin.com',
    'accept': 'application/vnd.linkedin.normalized+json+2.1',
    'accept-language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
    'cookie': 'li_at=%s; JSESSIONID="%s";',
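    # the two %s placeholders above are filled in get_csrf_token(), once
    # the session holds a valid JSESSIONID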
    'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'x-li-lang': 'en_US',
    'x-restli-protocol-version': '2.0.0',
}

FIELDNAMES = ["position", "url", "name", "reaction", "connections", "image_url", "input_url"]
DATA = []

class LinkedInLikersScraper:

    def __init__(self):
        self.s = requests.Session()
        self.csrf_token = None
    def get_csrf_token(self):
        # Hit the feed page so LinkedIn sets a JSESSIONID cookie, whose
        # value doubles as the csrf-token expected by the Voyager API.
        print('[1] getting valid csrf_token')
        response = self.s.get(
            url='https://www.linkedin.com/feed/',
            headers={
                'authority': 'www.linkedin.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'fr-FR,fr;q=0.9',
                'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'none',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            }
        )
        assert response.ok
        cookies_dict = dict(self.s.cookies)
        assert 'JSESSIONID' in cookies_dict
        self.csrf_token = cookies_dict["JSESSIONID"]
        HEADERS["csrf-token"] = self.csrf_token
        HEADERS["cookie"] = HEADERS["cookie"] % (LI_AT, self.csrf_token)
        print(self.csrf_token)
        print('ok\n')
        time.sleep(1)
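    # The JSESSIONID value typically looks like "ajax:1234567890123456789";
    # the Voyager API rejects requests whose csrf-token header does not
    # match it.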
    def get_activity_id(self, url):
        # Fetch the post page and extract the numeric id from the embedded
        # urn:li:activity (or urn:li:ugcPost) token.
        print('[2] getting activity_id')
        _types = ["ugcPost", "activity"]
        response = self.s.get(
            url=url,
            headers={
                'authority': 'www.linkedin.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'fr-FR,fr;q=0.9',
                'cache-control': 'max-age=0',
                'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            }
        )
        time.sleep(1)
        assert response.ok
        _good_type = None
        _activity_id = None
        for _type in _types:
            # raw string avoids invalid escape warnings on \( and \d
            _regex = r'(?<=\(urn:li:%s:)\d+' % _type
            activity_ids = list(set(re.findall(_regex, response.text)))
            if len(activity_ids) == 1:
                _activity_id = activity_ids[0]
                _good_type = _type
                break
        assert all([_activity_id, _good_type])
        print(_activity_id, _good_type)
        print('ok\n')
        return _activity_id, _good_type
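    # Example: the post URL above embeds `(urn:li:activity:7156808265396285440`
    # in its page source, so the lookbehind captures "7156808265396285440"
    # and _type resolves to "activity".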
    def iter_reactions(self, activity_id, input_url, _type):
        # Page through the Voyager reactions endpoint, 50 profiles at a time.
        print('[3] collecting reactions')
        offset = 0
        step = 50
        while True:
            request_url = "https://www.linkedin.com/voyager/api/graphql?includeWebMetadata=true&variables=(count:%s,start:%s,threadUrn:urn%%3Ali%%3A%s%%3A%s)&queryId=voyagerSocialDashReactions.aefc2c6e769fd6de71df5e638b12f76e" % (step, offset, _type, activity_id)
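            # With the defaults above, the first request resolves to
            # ...variables=(count:50,start:0,threadUrn:urn%3Ali%3Aactivity%3A<activity_id>)...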
            response = self.s.get(
                request_url,
                headers=HEADERS
            )
            print(offset, 'ok')
            time.sleep(1)
            try:
                assert response.ok
            except AssertionError:
                print(response.text)
                raise
            _included = response.json()["included"]
            if not _included:
                break
            assert isinstance(_included, list)
            for e in _included:
                # skip entities that are not reactors (e.g. metadata blobs)
                if 'actorUrn' not in e:
                    continue
                reactor_lockup = e["reactorLockup"]
                assert reactor_lockup
                try:
                    position = reactor_lockup["subtitle"]["text"]
                except TypeError:
                    position = ''
                url = reactor_lockup["navigationUrl"]
                name = reactor_lockup["title"]["text"]
                reaction = e["reactionType"]
                try:
                    connections = reactor_lockup["label"]["text"]
                except TypeError:
                    connections = ''
                try:
                    _vector_image = reactor_lockup["image"]["attributes"][0]["detailData"]["nonEntityProfilePicture"]["vectorImage"]
                    _root_url = _vector_image["rootUrl"]
                    _large_artifact = _vector_image["artifacts"][-1]["fileIdentifyingUrlPathSegment"]
                    assert all([_root_url, _large_artifact])
                    image_url = _root_url + _large_artifact
                except TypeError:
                    image_url = ''
                values = [position, url, name, reaction, connections, image_url, input_url]
                assert all([v is not None for v in values])
                row = dict(zip(FIELDNAMES, values))
                DATA.append(row)
            offset += step
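    # The loop stops once a page comes back empty; in practice LinkedIn stops
    # serving reactions past roughly the first 3000, hence the cap mentioned
    # in the header.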

    def save_in_file(self, data):
        # Despite the .txt extension, the export is tab-separated, so the
        # csv module (and most spreadsheet tools) can read it directly.
        with open('results_linkedin_post_likers_lobstrio.txt', 'w', newline='', encoding='utf-8') as g:
            writer = csv.DictWriter(g, delimiter='\t', fieldnames=FIELDNAMES)
            writer.writeheader()
            for row in data:
                writer.writerow(row)
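    # To load the results back afterwards, a minimal sketch:
    #   with open('results_linkedin_post_likers_lobstrio.txt', newline='') as f:
    #       rows = list(csv.DictReader(f, delimiter='\t'))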

def main():
    print('[0] starting %s' % URL)
    print('ok\n')
    time.sleep(2)
    scraper = LinkedInLikersScraper()
    scraper.get_csrf_token()
    activity_id, _type = scraper.get_activity_id(URL)
    assert activity_id
    scraper.iter_reactions(activity_id, URL, _type)
    scraper.save_in_file(DATA)
    print('done :°')


if __name__ == '__main__':
    main()