Last active
October 28, 2018 15:24
-
-
Save fabianrios/6db2612d3382204b6bfe1d870a05f845 to your computer and use it in GitHub Desktop.
Script created to fetch Instagram posts from the web.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'rubygems'
require 'fileutils'
require 'json'
require 'open-uri'
require 'mechanize'
require 'nokogiri'
require "selenium-webdriver"
# Root of every URL the scraper visits.
BASE_URL = "https://www.instagram.com".freeze

# Command line: START_ID EXPLORE_PATH OUTPUT_NAME
#   START_ID     - integer; saved posts are numbered from START_ID + 1
#   EXPLORE_PATH - inserted into "#{BASE_URL}/explore/<EXPLORE_PATH>/"
#                  (presumably something like "tags/food" -- confirm)
#   OUTPUT_NAME  - base name of the JSON store and of the image directory
# All three are required: without them the script would build a broken URL
# and a ".json" file name, so stop early with a usage message instead.
abort "usage: #{$PROGRAM_NAME} START_ID EXPLORE_PATH OUTPUT_NAME" if ARGV.length < 3

id = ARGV[0].to_i
which = ARGV[1]
fl = ARGV[2]

# Explicit waits give up after 15 seconds.
@wait = Selenium::WebDriver::Wait.new(:timeout => 15)
# Hashtags that qualify a post as food-related. A post is kept only when its
# markup contains at least one of them (plain substring match).
FOOD_HASHTAGS = [
  '#foodie', '#foodies', '#foodporn', '#instafood', '#yum', '#food',
  '#foodpics', '#yummyfood', '#geilesessen', '#abendessen', '#foodstagram',
  '#foodgasm', '#coffee', '#cafe', '#japanesefood'
].freeze

# CSS class of the "next post" arrow in the post viewer.
PAGINATION_ARROW_CLASS = 'coreSpriteRightPaginationArrow'.freeze

# Pauses (in seconds) before looking at the next post.
DEFAULT_PAUSE = 0.5
MISSING_IMAGE_PAUSE = 1
SAVED_PAUSE = 2

# Upper bound on attempts to download one image.
MAX_DOWNLOAD_ATTEMPTS = 3

# Opens Chrome on "#{BASE_URL}/explore/#{which}/" and walks through the posts,
# saving the food-related ones.
#
# The walk has no natural end: it runs until an exception (Ctrl-C, a Selenium
# error, a missing pagination arrow, ...) stops it. The browser is always
# closed on the way out.
#
# @param which [String] path segment placed after "/explore/"
# @param fl [String] base name of the JSON store ("<fl>.json") and of the
#   directory that receives the images ("<fl>/<id>.jpg")
# @param id [Integer] last id already used; new posts get id + 1, id + 2, ...
# @param data_dir [String] directory holding the JSON store and the image
#   directory
# @return [void] does not return normally
def navigate(which, fl, id, data_dir: '/Volumes/External/data/')
  @id = id
  @fl = fl
  p @id
  p which
  p @fl
  @json_path = File.join(data_dir, "#{fl}.json")
  @image_dir = File.join(data_dir, fl.to_s)

  @driver = Selenium::WebDriver.for :chrome
  begin
    @driver.navigate.to "#{BASE_URL}/explore/#{which}/"
    p @driver.title
    # 40 seconds, not half a second. NOTE(review): presumably this leaves time
    # to log in and open the first post by hand -- the automatic click that
    # used to open it was disabled. Confirm before shortening.
    sleep(40)
    @rightarr = @driver.find_elements(:class, PAGINATION_ARROW_CLASS).first
    getArticle(current_article)
  ensure
    @driver.quit
  end
end

# Processes +article+ and then every following post, forever.
#
# Implemented as a loop rather than by self-recursion so that a long session
# cannot exhaust the call stack.
#
# @param article [#attribute, nil] element of the post currently shown, or nil
#   when none was found
# @return [void] does not return normally
def getArticle(article)
  loop do
    sleep(process_article(article))
    article = current_article
  end
end

# Handles a single post and moves the viewer on to the next one.
#
# @param article [#attribute, nil]
# @return [Numeric] seconds to pause before looking at the next post
def process_article(article)
  if article.nil?
    # Nothing to look at yet: do not click, just poll again.
    p 'no article found wait'
    p DEFAULT_PAUSE
    return DEFAULT_PAUSE
  end

  html = article.attribute('innerHTML').to_s
  unless FOOD_HASHTAGS.any? { |tag| html.include?(tag) }
    advance_to_next_post
    p 'does not contain any tags wait: and again'
    p DEFAULT_PAUSE
    return DEFAULT_PAUSE
  end

  doc = Nokogiri::HTML(html)
  image = doc.at_css('div div div div img')
  img = image && image.attr('src')
  if img.nil?
    p 'image nil next'
    advance_to_next_post
    return MISSING_IMAGE_PAUSE
  end

  saved = save_post(doc, img)
  advance_to_next_post
  saved ? SAVED_PAUSE : DEFAULT_PAUSE
end

# Appends the post to the JSON store and downloads its image, unless a record
# with the same image URL is already stored.
#
# @param doc [Nokogiri::HTML::Document] parsed markup of the post
# @param img [String] image URL of the post
# @return [Boolean] true when a new record was written
def save_post(doc, img)
  records = load_records(@json_path)
  known = records.any? { |record| record['m'] == img }
  p known
  p img
  return false if known

  @id += 1
  p '-------------------------------'
  p @id
  heading = doc.at('h2')
  record = {
    'id' => @id,
    'author' => heading ? heading.text.strip : nil,
    'm' => img,
    'text' => doc.search('span').text.strip,
    'url' => @driver.current_url,
    # Scan each link on its own: the text of a whole node set is the plain
    # concatenation of its nodes, which would glue a hashtag to the text of
    # the following link.
    'tags' => doc.search('a').flat_map { |link| link.text.scan(/#\w+/) }
  }
  puts JSON.pretty_generate(record)
  records.push(record)
  File.write(@json_path, "#{JSON.pretty_generate(records)}\n")
  download_image(img, File.join(@image_dir, "#{@id}.jpg"))
  true
end

# Reads the JSON store; a missing or empty file counts as "no records yet".
#
# @param path [String]
# @return [Array<Hash>]
def load_records(path)
  return [] unless File.exist?(path)

  raw = File.read(path)
  raw.strip.empty? ? [] : JSON.parse(raw)
end

# Downloads +img+ to +target+, creating the target directory when needed.
#
# Only http(s) URLs are fetched: the address comes from scraped markup, and
# handing such text to Kernel#open would also accept "|command" strings.
# Failures are retried a bounded number of times and then reported, so one
# bad image cannot stall the whole run.
#
# @param img [String] image URL
# @param target [String] destination file
# @return [Boolean] true when the file was written
def download_image(img, target)
  uri = begin
    URI.parse(img)
  rescue URI::InvalidURIError
    nil
  end
  unless uri.is_a?(URI::HTTP)
    warn "skipping download, not an http(s) URL: #{img}"
    return false
  end

  attempts = 0
  begin
    attempts += 1
    FileUtils.mkdir_p(File.dirname(target))
    # Block form closes the downloaded stream once it has been copied.
    uri.open { |remote| IO.copy_stream(remote, target) }
    true
  rescue SystemCallError, OpenURI::HTTPError => e
    p 'error saving file'
    retry if attempts < MAX_DOWNLOAD_ATTEMPTS
    warn "giving up on #{img}: #{e.message}"
    false
  end
end

# Clicks the "next post" arrow, looking it up again when it was not on the
# page at start-up.
#
# @raise [RuntimeError] when the arrow cannot be found
# @return [void]
def advance_to_next_post
  @rightarr ||= @driver.find_elements(:class, PAGINATION_ARROW_CLASS).first
  raise 'pagination arrow not found; a post must be open in the viewer' if @rightarr.nil?

  @rightarr.click
end

# Waits for at least one <article> element and returns the last one.
#
# The wait block yields nil for an empty result: an empty array is truthy in
# Ruby, so returning it as-is would end the wait immediately.
#
# @return [Object, nil] the last article element, or nil when none appeared
#   before the wait timed out
def current_article
  articles = @wait.until do
    found = @driver.find_elements(:tag_name, 'article')
    found unless found.empty?
  end
  articles.last
rescue Selenium::WebDriver::Error::TimeoutError
  nil
end
# Start scraping with the values taken from the command line.
navigate(which, fl, id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment