Last active
September 8, 2023 10:39
-
-
Save uluQulu/162b0c1d387157ef6e163d796f9cdf7e to your computer and use it in GitHub Desktop.
Get links for tag (Smart APPROACH)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_links_for_tag(browser,
                      tag,
                      amount,
                      logger,
                      media=None,
                      skip_top_posts=True):
    """Collect up to `amount` post links from an Instagram tag page.

    Opens the explore page for `tag`, scrapes the links currently present
    and keeps scrolling (with pauses and a bounded number of retries) until
    enough unique links were gathered or the page stops yielding new ones.

    Args:
        browser: Selenium-style driver; must provide `get`,
            `find_element_by_xpath`, `find_element_by_tag_name` and
            `execute_script`.
        tag: Tag name, with or without a leading '#'.
        amount: Desired number of links. It is capped by the post count
            displayed on the page (minus the top posts when they are
            skipped) and never goes below zero.
        logger: Object with an `info(message)` method.
        media: None for every known media type, 'Photo' for photos
            (including multi-image posts), or any other single media type.
            Forwarded to `get_links` as a list.
        skip_top_posts: When True, only the section below "Top Posts" is
            scraped.

    Returns:
        A list of unique links in the order they were found, holding at
        most `amount` items.

    Note:
        Relies on `update_activity`, `sleep`, `format_number` and
        `get_links`, which are defined elsewhere in the project; their
        exact contracts are not visible here.
    """
    if media is None:
        # All known media types
        media = ['', 'Post', 'Video']
    elif media == 'Photo':
        # Include posts with multiple images in it
        media = ['', 'Post']
    else:
        # Downstream code expects a list of media types
        media = [media]

    # Normalise once; used both for the URL and for the final log message.
    tag_name = tag[1:] if tag[:1] == '#' else tag

    browser.get('https://www.instagram.com/explore/tags/' + tag_name)
    # update server calls
    update_activity()
    sleep(2)

    # The first <div> of the article is the "Top Posts" section when the
    # tag has one.
    top_elements = browser.find_element_by_xpath('//main/article/div[1]')
    top_posts = top_elements.find_elements_by_tag_name('a')
    sleep(1)

    main_elem = _locate_feed_container(browser, skip_top_posts, True)
    link_elems = main_elem.find_elements_by_tag_name('a')
    sleep(1)

    has_feed_links = bool(link_elems)
    if not has_feed_links:
        # This tag does not have "Top Posts" or it really is empty, so the
        # first <div> is the regular feed and there are no top posts.
        main_elem = _locate_feed_container(browser, skip_top_posts, False)
        top_posts = []
    sleep(2)

    # NOTE(review): format_number presumably turns the displayed counter
    # text into a number - confirm against its definition.
    possible_posts = format_number(browser.find_element_by_xpath(
        "//span[contains(@class, '_fd86t')]").text)

    logger.info(
        "desired amount: {} | top posts [{}]: {} | possible posts: {}".format(
            amount,
            ('enabled' if not skip_top_posts else 'disabled'),
            len(top_posts),
            possible_posts))

    if skip_top_posts:
        possible_posts = possible_posts - len(top_posts)

    # Cap by what the page claims to offer. The subtraction above can go
    # below zero; a negative `amount` would make the final `links[:amount]`
    # drop items from the end instead of limiting the result, so clamp it.
    amount = max(0, min(amount, possible_posts))

    # Sometimes pages do not hold as many posts as the counter says, e.g.
    # when deleted posts are still counted for the tag; the retry logic
    # below copes with that.
    links = _unique_in_order(
        get_links(browser, tag, logger, media, main_elem))
    filtered_links = len(links)
    try_again = 0
    sc_rolled = 0
    nap = 1.5
    put_sleep = 0

    # Nothing to scroll for when the first scrape found no link at all or
    # already satisfied the request.
    while 1 <= filtered_links < amount:
        if sc_rolled > 100:
            logger.info("Scrolled too much! ~ sleeping a bit :>")
            sleep(600)
            sc_rolled = 0

        for _ in range(3):
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            sc_rolled += 1
            update_activity()
            # Without this pause a slow connection makes Instagram honour
            # only one of the scroll commands sent in a row.
            sleep(nap)

        sleep(3)
        links.extend(get_links(browser, tag, logger, media, main_elem))
        links = _unique_in_order(links)

        if len(links) != filtered_links:
            # Progress was made: reset the retry state.
            filtered_links = len(links)
            try_again = 0
            nap = 1.5
            continue

        try_again += 1
        nap = 3 if try_again == 1 else 5
        logger.info(
            "Insufficient amount of links ~ trying again: {}".format(
                try_again))
        sleep(3)

        # Raise this threshold to allow more retries before giving up.
        if try_again <= 2:
            continue

        if put_sleep < 1 and filtered_links <= 21:
            # Stuck very early: rest once, reload the page and start over.
            logger.info(
                "Cor! Did you send too many requests? ~ let's rest some")
            sleep(600)
            put_sleep += 1
            browser.execute_script("location.reload()")
            try_again = 0
            sleep(10)
            # The reload invalidates the old element reference.
            main_elem = _locate_feed_container(
                browser, skip_top_posts, has_feed_links)
        else:
            logger.info(
                "'{}' tag POSSIBLY has less images than desired...".format(
                    tag_name))
            break

    sleep(4)

    return links[:amount]


def _locate_feed_container(browser, skip_top_posts, has_feed_links):
    """Return the page element whose anchors should be scraped."""
    if not has_feed_links:
        return browser.find_element_by_xpath('//main/article/div[1]')
    if skip_top_posts:
        return browser.find_element_by_xpath('//main/article/div[2]')
    return browser.find_element_by_tag_name('main')


def _unique_in_order(items):
    """Return `items` without duplicates, keeping first-seen order."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just click on RAW and copy lines