Skip to content

Instantly share code, notes, and snippets.

@fabianrios
Last active October 28, 2018 15:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fabianrios/6db2612d3382204b6bfe1d870a05f845 to your computer and use it in GitHub Desktop.
Save fabianrios/6db2612d3382204b6bfe1d870a05f845 to your computer and use it in GitHub Desktop.
Script created to fetch Instagram posts from the web.
#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
require 'json'
require "selenium-webdriver"
# Root URL that every page address visited by this script is built from.
BASE_URL = "https://www.instagram.com".freeze

# Command-line arguments:
#   ARGV[0] - id to start counting from (nil or non-numeric input becomes 0)
#   ARGV[1] - path segment placed after "/explore/" in the page URL
#   ARGV[2] - base name of the output JSON file and of the image directory
id = ARGV[0].to_i
which = ARGV[1]
fl = ARGV[2]

# Shared explicit wait used when polling the page; gives up after 15 seconds.
@wait = Selenium::WebDriver::Wait.new(timeout: 15)
# Hashtags that qualify a post for scraping: a post is kept only when its
# markup contains at least one of these strings.
FOOD_HASHTAGS = [
  '#foodie', '#foodies', '#foodporn', '#instafood', '#yum', '#food',
  '#foodpics', '#yummyfood', '#geilesessen', '#abendessen', '#foodstagram',
  '#foodgasm', '#coffee', '#cafe', '#japanesefood'
].freeze

# CSS class of the arrow that advances to the next post.
NEXT_ARROW_CLASS = 'coreSpriteRightPaginationArrow'.freeze

# Upper bound on attempts to download a single image, so that a persistent
# failure cannot stall the crawl forever.
MAX_DOWNLOAD_ATTEMPTS = 3

# Opens the explore page for +which+ in Chrome and walks through the posts,
# saving every food-related post that is not yet recorded.
#
# Metadata is appended to "/Volumes/External/data/<fl>.json" and the image is
# stored as "/Volumes/External/data/<fl>/<id>.jpg". The crawl runs as a loop
# (not recursion, which would eventually exhaust the stack) and stops when no
# "next" arrow is present. The browser is always closed on the way out.
#
# @param which [String] path segment placed after "/explore/" in the URL
# @param fl [String] base name of the JSON file and of the image directory
# @param id [Integer] last id already used; saved posts are numbered id + 1, id + 2, ...
# @return [void]
def navigate(which, fl, id)
  @id = id
  @fl = fl
  p @id
  p which
  p @fl
  @filename = '/Volumes/External/data/'
  @file = "#{fl}.json"
  # Reuse the caller's wait when one exists; otherwise create an equivalent one.
  @wait ||= Selenium::WebDriver::Wait.new(timeout: 15)
  @driver = Selenium::WebDriver.for :chrome
  begin
    @driver.navigate.to "#{BASE_URL}/explore/#{which}/"
    p @driver.title
    # NOTE(review): this pauses for 40 seconds (an earlier comment claimed half
    # a second) - presumably to leave time to log in and open the first post
    # by hand; confirm.
    sleep(40)
    loop do
      article = current_article
      if article.nil?
        # Nothing to read yet: poll again without advancing.
        p 'no article found wait'
        p 0.5
        sleep(0.5)
        next
      end
      pause = scrape_article(article)
      unless click_next_post
        p 'no next arrow found, stopping'
        break
      end
      sleep(pause)
    end
  ensure
    # Close the browser even when the crawl is interrupted or raises.
    @driver.quit
  end
end

# Returns the last <article> element on the page, or nil when there is none.
# find_elements returns an Array, which is truthy even when empty, so the wait
# returns on its first poll; callers must handle the nil case themselves.
def current_article
  @wait.until { @driver.find_elements(:tag_name, 'article') }.last
end

# Clicks the "next post" arrow. The element is looked up on every call so that
# a re-rendered arrow is never used through a stale reference.
#
# @return [Boolean] false when no arrow is present, true after clicking
def click_next_post
  arrow = @driver.find_elements(:class, NEXT_ARROW_CLASS).first
  return false if arrow.nil?

  arrow.click
  true
end

# Inspects one post and, when it is food-related and not yet recorded, appends
# its metadata to the JSON file and downloads its image.
#
# @param article [Selenium::WebDriver::Element] the post's <article> element
# @return [Numeric] seconds to pause after advancing to the next post
def scrape_article(article)
  html = article.attribute('innerHTML').to_s
  unless FOOD_HASHTAGS.any? { |tag| html.include?(tag) }
    p 'does not contain any tags wait: and again'
    p 0.5
    return 0.5
  end

  doc = Nokogiri::HTML(html)
  image = doc.at_css('div div div div img')
  img = image && image.attr('src')
  if img.nil?
    p 'image nil next'
    return 1
  end

  json_path = @filename + @file
  posts = load_posts(json_path)
  # The image URL (stored under "m") identifies a post that was saved before.
  already_saved = posts.any? { |post| post['m'] == img }
  p already_saved
  p img
  return 0.5 if already_saved

  @id += 1
  p '-------------------------------'
  p @id
  heading = doc.at('h2')
  data = {
    'id' => @id,
    # nil (JSON null) when the post has no <h2> instead of raising.
    'author' => heading && heading.text.strip,
    'm' => img,
    'text' => doc.search('span').text.strip,
    'url' => @driver.current_url,
    'tags' => doc.search('a').text.scan(/#\w+/)
  }
  puts JSON.pretty_generate(data)
  posts.push(data)
  File.open(json_path, 'w') { |f| f.puts JSON.pretty_generate(posts) }
  download_image(img, "#{@filename}#{@fl}/#{@id}.jpg")
  2
end

# Reads the list of saved posts. A missing or empty file yields an empty list
# so the first run can start from scratch; malformed JSON still raises so that
# existing data is never silently overwritten.
#
# @param path [String] location of the JSON file
# @return [Array<Hash>] previously saved posts
# @raise [JSON::ParserError] when the file holds invalid JSON
def load_posts(path)
  return [] unless File.exist?(path)

  raw = File.read(path)
  raw.strip.empty? ? [] : JSON.parse(raw)
end

# Downloads +url+ to +target+, creating the target directory when needed.
# Best effort: after MAX_DOWNLOAD_ATTEMPTS failures the error is reported and
# the crawl continues.
#
# @param url [String] image address
# @param target [String] destination file path
# @return [Boolean] true when the image was written
def download_image(url, target)
  attempts = 0
  begin
    attempts += 1
    dir = File.dirname(target)
    Dir.mkdir(dir) unless File.directory?(dir)
    # URI#open (from open-uri) only fetches URLs and never spawns a process or
    # opens a local path; the block form closes the remote stream.
    URI.parse(url).open { |remote| IO.copy_stream(remote, target) }
    true
  rescue StandardError => e
    p 'error saving file'
    if attempts < MAX_DOWNLOAD_ATTEMPTS
      sleep(1)
      retry
    end
    warn "giving up on #{url}: #{e.class}: #{e.message}"
    false
  end
end
# Script entry point: start scraping with the values read from ARGV above.
navigate(which, fl, id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment