fabianrios/instagram_scrapper.rb

## instagram_scrapper.rb
#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
require 'json'
require "selenium-webdriver"

# Root URL that every explore-page address is built from.
BASE_URL = 'https://www.instagram.com'.freeze

# Command line: <start_id> <explore_path> <dataset_name>
#   start_id     - converted with to_i; navigate increments it before numbering each newly saved post
#   explore_path - inserted into "#{BASE_URL}/explore/<explore_path>/"
#   dataset_name - base name of the JSON index file and of the image directory
# Fail fast instead of scraping "/explore//" into ".json" when arguments are missing.
abort "usage: #{$PROGRAM_NAME} <start_id> <explore_path> <dataset_name>" if ARGV.length < 3

id = ARGV[0].to_i
which = ARGV[1]
fl = ARGV[2]

# Explicit wait shared by the scraping methods; it gives up after 15 seconds.
@wait = Selenium::WebDriver::Wait.new(timeout: 15)

require 'fileutils'

# Hashtags that mark a post as food-related. A post is kept when its raw HTML
# contains at least one of these strings (plain substring match, so '#food'
# also matches '#foodie').
FOOD_HASHTAGS = %w[
    #foodie #foodies #foodporn #instafood #yum #food #foodpics #yummyfood
    #geilesessen #abendessen #foodstagram #foodgasm #coffee #cafe #japanesefood
].freeze unless defined?(FOOD_HASHTAGS)

# Opens an Instagram explore page in Chrome and scrapes food-related posts
# until an exception (or the user) stops the process.
#
# @param which [String] path segment placed after "/explore/" in the URL
# @param fl [String] dataset name: "<fl>.json" is the index file and "<fl>/"
#   the image directory, both inside data_dir
# @param id [Integer] last id already used; each newly saved post gets the next one
# @param data_dir [String] directory holding the index file and image directory
# @return this method does not return normally; the scrape loop is endless
def navigate(which, fl, id, data_dir: '/Volumes/External/data/')
    @id = id
    p @id
    p which
    @fl = fl
    p @fl
    url = "#{BASE_URL}/explore/#{which}/"
    @filename = data_dir
    @file = "#{fl}.json"
    # Create the image directory up front: a missing directory used to surface
    # as a SystemCallError inside the download retry loop.
    FileUtils.mkdir_p(File.join(@filename, @fl.to_s))
    @driver = Selenium::WebDriver.for :chrome
    @driver.navigate.to url
    p @driver.title
    # Fixed 40-second pause before the pagination arrow is looked up.
    # NOTE(review): presumably time to log in / open the first post by hand - confirm.
    sleep(40)
    @rightarr = @driver.find_elements(:class, 'coreSpriteRightPaginationArrow').first
    getArticle(latest_article)
    # @driver.quit
end

# Processes the given article and then keeps paging through posts forever.
# Written as a loop: the former self-recursive version never returned and
# therefore grew the call stack with every post.
#
# @param article [Selenium::WebDriver::Element, nil] article to start with
def getArticle(article)
    loop do
        if article.nil?
            pause = 0.5
            p 'no article found wait'
            p pause
        else
            pause = process_article(article)
            advance_to_next_post
        end
        sleep(pause)
        article = latest_article
    end
end

# Returns the last <article> element currently in the page, or nil when there
# is none. find_elements returns an array, and an empty array is still truthy,
# so the wait does not block until an article shows up; callers handle nil.
def latest_article
    @wait.until { @driver.find_elements(:tag_name, 'article') }.last
end

# Clicks the "next post" arrow, looking it up again if it was not found earlier.
#
# @raise [RuntimeError] when the arrow cannot be located
def advance_to_next_post
    @rightarr ||= @driver.find_elements(:class, 'coreSpriteRightPaginationArrow').first
    raise 'pagination arrow (coreSpriteRightPaginationArrow) not found' if @rightarr.nil?

    @rightarr.click
end

# Inspects one article and, when it is an unseen food post with an image,
# appends it to the JSON index and downloads the image.
#
# @param article [Selenium::WebDriver::Element]
# @return [Numeric] seconds to pause before the next article is read
def process_article(article)
    html = article.attribute('innerHTML').to_s
    unless FOOD_HASHTAGS.any? { |tag| html.include?(tag) }
        p 'does not contain any tags wait: and again'
        p 0.5
        return 0.5
    end

    doc = Nokogiri::HTML(html)
    image = doc.at_css('div div div div img')
    img = image && image.attr('src')
    if img.nil?
        p 'image nil next'
        return 1
    end

    records_path = File.join(@filename, @file)
    records = load_records(records_path)
    # The image URL (key 'm') identifies a post that was already stored.
    already_saved = records.any? { |record| record['m'] == img }
    p already_saved
    p img
    return 0.5 if already_saved

    @id += 1
    p '-------------------------------'
    p @id
    heading = doc.at('h2')
    entry = {
        'id' => @id,
        'author' => heading ? heading.text.strip : nil,
        'm' => img,
        'text' => doc.search('span').text.strip,
        'url' => @driver.current_url,
        'tags' => doc.search('a').text.strip.scan(/#\w+/)
    }
    puts JSON.pretty_generate(entry)
    records.push(entry)
    File.open(records_path, 'w') { |f| f.puts JSON.pretty_generate(records) }
    download_image(img, File.join(@filename, @fl.to_s, "#{@id}.jpg"))
    2
end

# Reads the JSON index; a missing or empty file counts as "no records yet".
#
# @param path [String]
# @return [Array<Hash>]
def load_records(path)
    return [] unless File.exist?(path)

    raw = File.read(path)
    raw.strip.empty? ? [] : JSON.parse(raw)
end

# Downloads url to path, retrying a bounded number of times on system-level
# errors instead of retrying forever.
#
# @param url [String]
# @param path [String]
# @param attempts [Integer] maximum number of tries
# @return [Boolean] true when the file was written
def download_image(url, path, attempts: 3)
    tries = 0
    begin
        tries += 1
        # URI.open is needed because Kernel#open no longer opens URLs on
        # Ruby 3; the block form closes the downloaded stream.
        URI.open(url) { |remote| IO.copy_stream(remote, path) }
        true
    rescue SystemCallError
        p 'error saving file'
        if tries < attempts
            sleep(1)
            retry
        end
        false
    end
end

# Script entry point: start scraping with the values read from the command line (ARGV).
navigate(which, fl, id)
	#!/usr/bin/env ruby
	require 'rubygems'
	require 'nokogiri'
	require 'mechanize'
	require 'open-uri'
	require 'json'
	require "selenium-webdriver"

	# Root URL that every explore-page address is built from. Guarded so that a
	# value assigned earlier in the file is not re-assigned (constant re-assignment warns).
	BASE_URL = 'https://www.instagram.com'.freeze unless defined?(BASE_URL)

	# Command line: <start_id> <explore_path> <dataset_name>
	#   start_id     - converted with to_i; navigate increments it before numbering each newly saved post
	#   explore_path - inserted into "#{BASE_URL}/explore/<explore_path>/"
	#   dataset_name - base name of the JSON index file and of the image directory
	# Fail fast instead of scraping "/explore//" into ".json" when arguments are missing.
	abort "usage: #{$PROGRAM_NAME} <start_id> <explore_path> <dataset_name>" if ARGV.length < 3

	id = ARGV[0].to_i
	which = ARGV[1]
	fl = ARGV[2]

	# Explicit wait shared by the scraping methods; it gives up after 15 seconds.
	@wait = Selenium::WebDriver::Wait.new(timeout: 15)

	require 'fileutils'

	# Hashtags that mark a post as food-related. A post is kept when its raw HTML
	# contains at least one of these strings (plain substring match, so '#food'
	# also matches '#foodie').
	FOOD_HASHTAGS = %w[
		#foodie #foodies #foodporn #instafood #yum #food #foodpics #yummyfood
		#geilesessen #abendessen #foodstagram #foodgasm #coffee #cafe #japanesefood
	].freeze unless defined?(FOOD_HASHTAGS)

	# Opens an Instagram explore page in Chrome and scrapes food-related posts
	# until an exception (or the user) stops the process.
	#
	# @param which [String] path segment placed after "/explore/" in the URL
	# @param fl [String] dataset name: "<fl>.json" is the index file and "<fl>/"
	#   the image directory, both inside data_dir
	# @param id [Integer] last id already used; each newly saved post gets the next one
	# @param data_dir [String] directory holding the index file and image directory
	# @return this method does not return normally; the scrape loop is endless
	def navigate(which, fl, id, data_dir: '/Volumes/External/data/')
		@id = id
		p @id
		p which
		@fl = fl
		p @fl
		url = "#{BASE_URL}/explore/#{which}/"
		@filename = data_dir
		@file = "#{fl}.json"
		# Create the image directory up front: a missing directory used to surface
		# as a SystemCallError inside the download retry loop.
		FileUtils.mkdir_p(File.join(@filename, @fl.to_s))
		@driver = Selenium::WebDriver.for :chrome
		@driver.navigate.to url
		p @driver.title
		# Fixed 40-second pause before the pagination arrow is looked up.
		# NOTE(review): presumably time to log in / open the first post by hand - confirm.
		sleep(40)
		@rightarr = @driver.find_elements(:class, 'coreSpriteRightPaginationArrow').first
		getArticle(latest_article)
		# @driver.quit
	end

	# Processes the given article and then keeps paging through posts forever.
	# Written as a loop: the former self-recursive version never returned and
	# therefore grew the call stack with every post.
	#
	# @param article [Selenium::WebDriver::Element, nil] article to start with
	def getArticle(article)
		loop do
			if article.nil?
				pause = 0.5
				p 'no article found wait'
				p pause
			else
				pause = process_article(article)
				advance_to_next_post
			end
			sleep(pause)
			article = latest_article
		end
	end

	# Returns the last <article> element currently in the page, or nil when there
	# is none. find_elements returns an array, and an empty array is still truthy,
	# so the wait does not block until an article shows up; callers handle nil.
	def latest_article
		@wait.until { @driver.find_elements(:tag_name, 'article') }.last
	end

	# Clicks the "next post" arrow, looking it up again if it was not found earlier.
	#
	# @raise [RuntimeError] when the arrow cannot be located
	def advance_to_next_post
		@rightarr ||= @driver.find_elements(:class, 'coreSpriteRightPaginationArrow').first
		raise 'pagination arrow (coreSpriteRightPaginationArrow) not found' if @rightarr.nil?

		@rightarr.click
	end

	# Inspects one article and, when it is an unseen food post with an image,
	# appends it to the JSON index and downloads the image.
	#
	# @param article [Selenium::WebDriver::Element]
	# @return [Numeric] seconds to pause before the next article is read
	def process_article(article)
		html = article.attribute('innerHTML').to_s
		unless FOOD_HASHTAGS.any? { |tag| html.include?(tag) }
			p 'does not contain any tags wait: and again'
			p 0.5
			return 0.5
		end

		doc = Nokogiri::HTML(html)
		image = doc.at_css('div div div div img')
		img = image && image.attr('src')
		if img.nil?
			p 'image nil next'
			return 1
		end

		records_path = File.join(@filename, @file)
		records = load_records(records_path)
		# The image URL (key 'm') identifies a post that was already stored.
		already_saved = records.any? { |record| record['m'] == img }
		p already_saved
		p img
		return 0.5 if already_saved

		@id += 1
		p '-------------------------------'
		p @id
		heading = doc.at('h2')
		entry = {
			'id' => @id,
			'author' => heading ? heading.text.strip : nil,
			'm' => img,
			'text' => doc.search('span').text.strip,
			'url' => @driver.current_url,
			'tags' => doc.search('a').text.strip.scan(/#\w+/)
		}
		puts JSON.pretty_generate(entry)
		records.push(entry)
		File.open(records_path, 'w') { |f| f.puts JSON.pretty_generate(records) }
		download_image(img, File.join(@filename, @fl.to_s, "#{@id}.jpg"))
		2
	end

	# Reads the JSON index; a missing or empty file counts as "no records yet".
	#
	# @param path [String]
	# @return [Array<Hash>]
	def load_records(path)
		return [] unless File.exist?(path)

		raw = File.read(path)
		raw.strip.empty? ? [] : JSON.parse(raw)
	end

	# Downloads url to path, retrying a bounded number of times on system-level
	# errors instead of retrying forever.
	#
	# @param url [String]
	# @param path [String]
	# @param attempts [Integer] maximum number of tries
	# @return [Boolean] true when the file was written
	def download_image(url, path, attempts: 3)
		tries = 0
		begin
			tries += 1
			# URI.open is needed because Kernel#open no longer opens URLs on
			# Ruby 3; the block form closes the downloaded stream.
			URI.open(url) { |remote| IO.copy_stream(remote, path) }
			true
		rescue SystemCallError
			p 'error saving file'
			if tries < attempts
				sleep(1)
				retry
			end
			false
		end
	end

	# Script entry point: start scraping with the values read from the command line (ARGV).
	navigate(which, fl, id)