Collect up to 3000 likers from any LinkedIn post and export to a .csv file
# =============================================================================
# Title: LinkedIn Post Likers Scraper
# Description: This script can scrape up to 3000 likers from any LinkedIn post.
# Author: Sasha Bouloudnine
# Date: 2024-03-05
#
# Usage:
# - Install requests using `pip install requests`.
# - Connect to LinkedIn and collect your `li_at` cookie.
# - Get your post URL.
# - Launch the script.
#
# =============================================================================
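# Example run (a sketch; assumes you saved this file as linkedin_likers.py):
#   $ pip install requests
#   $ python linkedin_likers.py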
import requests
import csv
import time
import re

# Update your LI_AT below
# vvvvvvvvvvvvvvvvvvvv
LI_AT = "AQEFARABAAAAAA582xIAAAGN-ncr4gAAAY4eg6_iTQAAs3VybjpsaTplbnRlcnByaXNlQXV0aFRva2VuOmVKeGpaQUFDZGcvUE9TQ2FlMFBHRkJETkVmYjlDU09JWWZlTmJ4V1lJWFhtL0M4R1JnQzJmZ25FXnVybjpsaTplbnRlcnByaXNlUHJvZmlsZToodXJuOmxpOmVudGVycHJpc2VBY2NvdW50OjEyMjE3Nzk0OCwxOTYxMTA0ODQpXnVybjpsaTptZW1iZXI6MzA0NzQwNDUzCJCilw8ToxGdMzR3SPl1TqCZTknBs1duxKFK7L6EsksXVkem6Xq-ZOZRNLuEfpl_6xFR2zcQqQWMPlKPlKJq5AzO8H1mffd4EgVN-MaTu0UEMZdnhd6sLxssWLyAOjDkvPpeab6WM2CfbRitkYiIqyurdTCQrck9Cr3ghlmSBGZlFScZ7xRXu3Xpn3q07cYgenQ5vw"
# Add your URL below
# vvvvvvvvvvvvvvvvvvvv
URL = "https://www.linkedin.com/posts/williamhgates_us-carbon-emissions-fell-in-2023-as-coal-activity-7156808265396285440-EV0P/?utm_source=share&utm_medium=member_desktop"
# Start of the script
HEADERS = {
    'authority': 'www.linkedin.com',
    'accept': 'application/vnd.linkedin.normalized+json+2.1',
    'accept-language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
    'cookie': 'li_at=%s; JSESSIONID="%s";',
    'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'x-li-lang': 'en_US',
    'x-restli-protocol-version': '2.0.0',
}
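# Note: the two '%s' placeholders in the 'cookie' header above are filled in
# later by get_csrf_token(), with LI_AT and the freshly obtained JSESSIONID.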
FIELDNAMES = ["position", "url", "name", "reaction", "connections", "image_url", "input_url"]
DATA = []
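# DATA is a module-level accumulator: iter_reviews() appends one dict per
# reactor, and main() writes the whole list to disk via save_in_file().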

class LinkedInLikersScraper:

    def __init__(self):
        self.s = requests.Session()
        # self.s.headers = HEADERS
        self.csrf_token = None
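    # LinkedIn sets a JSESSIONID cookie when loading the feed page; its value
    # doubles as the csrf-token header that the voyager API checks on each call.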
    def get_csrf_token(self):
        print('[1] getting valid csrf_token')
        response = self.s.get(
            url='https://www.linkedin.com/feed/',
            headers={
                'authority': 'www.linkedin.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'fr-FR,fr;q=0.9',
                'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'none',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            }
        )
        assert response.ok
        cookies_dict = dict(self.s.cookies)
        assert 'JSESSIONID' in cookies_dict
        self.csrf_token = cookies_dict["JSESSIONID"]
        HEADERS["csrf-token"] = self.csrf_token
        HEADERS["cookie"] = HEADERS["cookie"] % (LI_AT, self.csrf_token)
        print(self.csrf_token)
        print('ok\n')
        time.sleep(1)
    def get_activity_id(self, url):
        print('[2] getting activity_id')
        _types = ["ugcPost", "activity"]
        response = self.s.get(
            url=url,
            headers={
                'authority': 'www.linkedin.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'fr-FR,fr;q=0.9',
                'cache-control': 'max-age=0',
                'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            }
        )
        time.sleep(1)
        assert response.ok
        _good_type = None
        _activity_id = None
        for _type in _types:
            # raw string so that \( and \d are not parsed as string escapes
            _regex = r'(?<=\(urn:li:%s:)\d+' % _type
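            # e.g. with the sample URL above, the page source contains
            # "(urn:li:activity:7156808265396285440", so the regex captures
            # "7156808265396285440"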
            activity_ids = re.findall(_regex, response.text)
            activity_ids = list(set(activity_ids))
            if activity_ids and len(activity_ids) == 1:
                _activity_id = "".join(activity_ids)
                _good_type = _type
                break
        assert all([_activity_id, _good_type])
        print(_activity_id, _good_type)
        print('ok\n')
        return _activity_id, _good_type
    def iter_reviews(self, activity_id, input_url, _type):
        print('[3] collecting reviews')
        offset = 0
        step = 50
        while True:
            request_url = "https://www.linkedin.com/voyager/api/graphql?includeWebMetadata=true&variables=(count:%s,start:%s,threadUrn:urn%%3Ali%%3A%s%%3A%s)&queryId=voyagerSocialDashReactions.aefc2c6e769fd6de71df5e638b12f76e" % (step, offset, _type, activity_id)
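            # count/start page through the reactions 50 profiles at a time;
            # the loop stops when the "included" array comes back empty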
            response = self.s.get(
                request_url,
                headers=HEADERS
            )
            print(offset, 'ok')
            time.sleep(1)
            try:
                assert response.ok
            except AssertionError:
                print(response.text)
                raise
            _included = response.json()["included"]
            if not _included:
                break
            assert _included and isinstance(_included, list)
            for e in _included:
                if 'actorUrn' not in e:
                    continue
                reactor_lockup = e["reactorLockup"]
                assert reactor_lockup
                try:
                    position = reactor_lockup["subtitle"]["text"]
                except TypeError:
                    position = ''
                url = reactor_lockup["navigationUrl"]
                name = reactor_lockup["title"]["text"]
                reaction = e["reactionType"]
                try:
                    connections = reactor_lockup["label"]["text"]
                except TypeError:
                    connections = ''
                try:
                    _vector_image = reactor_lockup["image"]["attributes"][0]["detailData"]["nonEntityProfilePicture"]["vectorImage"]
                    _root_url = _vector_image["rootUrl"]
                    _large_artifact = _vector_image["artifacts"][-1]["fileIdentifyingUrlPathSegment"]
                    assert all([_root_url, _large_artifact])
                    image_url = _root_url + _large_artifact
                except TypeError:
                    image_url = ''
                values = [position, url, name, reaction, connections, image_url, input_url]
                assert all([v is not None for v in values])
                # print(name, url)
                row = dict(zip(FIELDNAMES, values))
                DATA.append(row)
            offset += step
    def save_in_file(self, data):
        # newline='' avoids spurious blank lines on Windows, per the csv docs
        with open('results_linkedin_post_likers_lobstrio.txt', 'w', newline='') as g:
            writer = csv.DictWriter(g, delimiter='\t', fieldnames=FIELDNAMES)
            writer.writeheader()
            for row in data:
                writer.writerow(row)
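    # Note: the output is tab-separated, despite the .csv export mentioned in
    # the description; change the delimiter or filename if you prefer true CSV.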

def main():
    print('[0] starting %s' % URL)
    print('ok\n')
    time.sleep(2)
    scraper = LinkedInLikersScraper()
    scraper.get_csrf_token()
    activity_id, _type = scraper.get_activity_id(URL)
    assert activity_id
    scraper.iter_reviews(activity_id, URL, _type)
    scraper.save_in_file(DATA)
    print('done :°')


if __name__ == '__main__':
    main()