uluQulu/get_links_for_tag.py

## get_links_for_tag.py
def get_links_for_tag(browser,
                      tag,
                      amount,
                      logger,
                      media=None,
                      skip_top_posts=True):
    """Collect up to `amount` post links from an Instagram tag page.

    Opens the explore page of `tag`, then keeps scrolling to the bottom
    and re-reading the links until enough unique links were gathered or
    the page stops yielding new ones.

    Args:
        browser: Selenium WebDriver used to load and scroll the page.
        tag: Tag to browse, with or without a leading '#'.
        amount: Desired number of links; capped by the post count that
            the page reports (minus the top posts when they are skipped)
            and never below zero.
        logger: Logger receiving progress messages.
        media: None for all known media types, 'Photo' to also include
            posts with multiple images, or any other single value, which
            is wrapped in a list. The resulting list is handed to
            `get_links`.
        skip_top_posts: When True, links are read from the second
            section of the article instead of the whole `main` element.

    Returns:
        The collected links in first-seen order, at most `amount` items.
    """

    if media is None:
        # All known media types
        media = ['', 'Post', 'Video']
    elif media == 'Photo':
        # Include posts with multiple images in it
        media = ['', 'Post']
    else:
        # Make it an array to use it in the following part
        media = [media]

    # The tag is accepted with or without its leading '#'
    tag_name = tag[1:] if tag[:1] == '#' else tag

    browser.get('https://www.instagram.com/explore/tags/' + tag_name)
    # update server calls
    update_activity()
    sleep(2)

    top_elements = browser.find_element_by_xpath('//main/article/div[1]')
    top_posts = top_elements.find_elements_by_tag_name('a')
    sleep(1)

    if skip_top_posts:
        main_elem = browser.find_element_by_xpath('//main/article/div[2]')
    else:
        main_elem = browser.find_element_by_tag_name('main')
    link_elems = main_elem.find_elements_by_tag_name('a')
    sleep(1)

    if not link_elems:
        # This tag does not have `Top Posts` or it really is empty:
        # fall back to the first section and count no top posts.
        main_elem = browser.find_element_by_xpath('//main/article/div[1]')
        top_posts = []
    sleep(2)

    possible_posts = format_number(browser.find_element_by_xpath(
        "//span[contains(@class, '_fd86t')]").text)

    logger.info("desired amount: {}  |  top posts [{}]: {}  |  possible posts: {}".format(amount,
                                      ('enabled' if not skip_top_posts else 'disabled'), len(top_posts), possible_posts))
    if skip_top_posts:
        possible_posts = possible_posts - len(top_posts)
    # Never ask for more than the page claims to have. The floor of zero
    # keeps the final slice from dropping links off the end when the
    # computed amount turns out negative.
    # Pages do not always hold as many posts as the counter says, possibly
    # because deleted posts are still counted for the tag.
    amount = max(0, min(amount, possible_posts))

    # Get links
    links = get_links(browser, tag, logger, media, main_elem)
    filtered_links = len(links)
    try_again = 0
    sc_rolled = 0
    nap = 1.5
    put_sleep = 0

    # Keep scrolling while some links were found but not yet enough
    while 1 <= filtered_links < amount:
        if sc_rolled > 100:
            logger.info("Scrolled too much! ~ sleeping a bit :>")
            sleep(600)
            sc_rolled = 0

        for _ in range(3):
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            sc_rolled += 1
            update_activity()
            # Without this pause a slow connection makes the page scroll
            # only once instead of once per scroll command.
            sleep(nap)
        sleep(3)
        links.extend(get_links(browser, tag, logger, media, main_elem))

        # Uniqify links while preserving order
        seen = set()
        unique_links = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        links = unique_links

        if len(links) != filtered_links:
            # Progress was made: reset the retry state
            filtered_links = len(links)
            try_again = 0
            nap = 1.5
            continue

        # No new links after scrolling: slow down and retry
        try_again += 1
        nap = 3 if try_again == 1 else 5
        logger.info("Insufficient amount of links ~ trying again: {}".format(try_again))
        sleep(3)

        # Raise this threshold to allow more retries
        if try_again <= 2:
            continue

        if put_sleep < 1 and filtered_links <= 21:
            # Done at most once per call: rest, reload the page and
            # locate the container element again.
            logger.info("Cor! Did you send too many requests? ~ let's rest some")
            sleep(600)
            put_sleep += 1
            browser.execute_script("location.reload()")
            try_again = 0
            sleep(10)
            if not link_elems:
                main_elem = browser.find_element_by_xpath('//main/article/div[1]')
            elif skip_top_posts:
                main_elem = browser.find_element_by_xpath('//main/article/div[2]')
            else:
                main_elem = browser.find_element_by_tag_name('main')
        else:
            logger.info("'{}' tag POSSIBLY has less images than desired...".format(tag_name))
            break

    sleep(4)

    return links[:amount]
	def get_links_for_tag(browser,
	                      tag,
	                      amount,
	                      logger,
	                      media=None,
	                      skip_top_posts=True):
	    """Collect up to `amount` post links from an Instagram tag page.

	    Opens the explore page of `tag`, then keeps scrolling to the bottom
	    and re-reading the links until enough unique links were gathered or
	    the page stops yielding new ones.

	    Args:
	        browser: Selenium WebDriver used to load and scroll the page.
	        tag: Tag to browse, with or without a leading '#'.
	        amount: Desired number of links; capped by the post count that
	            the page reports (minus the top posts when they are skipped)
	            and never below zero.
	        logger: Logger receiving progress messages.
	        media: None for all known media types, 'Photo' to also include
	            posts with multiple images, or any other single value, which
	            is wrapped in a list. The resulting list is handed to
	            `get_links`.
	        skip_top_posts: When True, links are read from the second
	            section of the article instead of the whole `main` element.

	    Returns:
	        The collected links in first-seen order, at most `amount` items.
	    """

	    if media is None:
	        # All known media types
	        media = ['', 'Post', 'Video']
	    elif media == 'Photo':
	        # Include posts with multiple images in it
	        media = ['', 'Post']
	    else:
	        # Make it an array to use it in the following part
	        media = [media]

	    # The tag is accepted with or without its leading '#'
	    tag_name = tag[1:] if tag[:1] == '#' else tag

	    browser.get('https://www.instagram.com/explore/tags/' + tag_name)
	    # update server calls
	    update_activity()
	    sleep(2)

	    top_elements = browser.find_element_by_xpath('//main/article/div[1]')
	    top_posts = top_elements.find_elements_by_tag_name('a')
	    sleep(1)

	    if skip_top_posts:
	        main_elem = browser.find_element_by_xpath('//main/article/div[2]')
	    else:
	        main_elem = browser.find_element_by_tag_name('main')
	    link_elems = main_elem.find_elements_by_tag_name('a')
	    sleep(1)

	    if not link_elems:
	        # This tag does not have `Top Posts` or it really is empty:
	        # fall back to the first section and count no top posts.
	        main_elem = browser.find_element_by_xpath('//main/article/div[1]')
	        top_posts = []
	    sleep(2)

	    possible_posts = format_number(browser.find_element_by_xpath(
	        "//span[contains(@class, '_fd86t')]").text)

	    logger.info("desired amount: {}  |  top posts [{}]: {}  |  possible posts: {}".format(amount,
	                                      ('enabled' if not skip_top_posts else 'disabled'), len(top_posts), possible_posts))
	    if skip_top_posts:
	        possible_posts = possible_posts - len(top_posts)
	    # Never ask for more than the page claims to have. The floor of zero
	    # keeps the final slice from dropping links off the end when the
	    # computed amount turns out negative.
	    # Pages do not always hold as many posts as the counter says, possibly
	    # because deleted posts are still counted for the tag.
	    amount = max(0, min(amount, possible_posts))

	    # Get links
	    links = get_links(browser, tag, logger, media, main_elem)
	    filtered_links = len(links)
	    try_again = 0
	    sc_rolled = 0
	    nap = 1.5
	    put_sleep = 0

	    # Keep scrolling while some links were found but not yet enough
	    while 1 <= filtered_links < amount:
	        if sc_rolled > 100:
	            logger.info("Scrolled too much! ~ sleeping a bit :>")
	            sleep(600)
	            sc_rolled = 0

	        for _ in range(3):
	            browser.execute_script(
	                "window.scrollTo(0, document.body.scrollHeight);")
	            sc_rolled += 1
	            update_activity()
	            # Without this pause a slow connection makes the page scroll
	            # only once instead of once per scroll command.
	            sleep(nap)
	        sleep(3)
	        links.extend(get_links(browser, tag, logger, media, main_elem))

	        # Uniqify links while preserving order
	        seen = set()
	        unique_links = []
	        for link in links:
	            if link not in seen:
	                seen.add(link)
	                unique_links.append(link)
	        links = unique_links

	        if len(links) != filtered_links:
	            # Progress was made: reset the retry state
	            filtered_links = len(links)
	            try_again = 0
	            nap = 1.5
	            continue

	        # No new links after scrolling: slow down and retry
	        try_again += 1
	        nap = 3 if try_again == 1 else 5
	        logger.info("Insufficient amount of links ~ trying again: {}".format(try_again))
	        sleep(3)

	        # Raise this threshold to allow more retries
	        if try_again <= 2:
	            continue

	        if put_sleep < 1 and filtered_links <= 21:
	            # Done at most once per call: rest, reload the page and
	            # locate the container element again.
	            logger.info("Cor! Did you send too many requests? ~ let's rest some")
	            sleep(600)
	            put_sleep += 1
	            browser.execute_script("location.reload()")
	            try_again = 0
	            sleep(10)
	            if not link_elems:
	                main_elem = browser.find_element_by_xpath('//main/article/div[1]')
	            elif skip_top_posts:
	                main_elem = browser.find_element_by_xpath('//main/article/div[2]')
	            else:
	                main_elem = browser.find_element_by_tag_name('main')
	        else:
	            logger.info("'{}' tag POSSIBLY has less images than desired...".format(tag_name))
	            break

	    sleep(4)

	    return links[:amount]