Skip to content

Instantly share code, notes, and snippets.

@edwardkenfox
Created October 21, 2019 05:34
Show Gist options
  • Save edwardkenfox/19e9f8d4ec2bd401fbcb718e977ff60b to your computer and use it in GitHub Desktop.
Save edwardkenfox/19e9f8d4ec2bd401fbcb718e977ff60b to your computer and use it in GitHub Desktop.
site crawler that summarizes the id & class used in the given URLs

0.

ruby がインストールされてるか確認

$ ruby -v

必要なgemをインストール

$ gem i bundler
$ bundle

1.

URLのリストを url_list.csv ファイルとして保存

http://example.com
http://example.com/foo.html
http://example.com/bar.html
http://example.com/items/1.html
http://example.com/items/2.html

2.

スクリプトを実行

$ ruby script.rb url_list.csv

3.

結果を確認

$ open results.json
# frozen_string_literal: true

# Gem dependencies for the crawler script; everything is fetched from
# the public RubyGems index.
source "https://rubygems.org"

# Expand a `github: "owner/repo"` shorthand into an HTTPS GitHub URL.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# gem "rails"

# Both gems are loaded by the crawler script via `require`.
gem "pry"
gem "nokogiri"
GEM
remote: https://rubygems.org/
specs:
coderay (1.1.2)
method_source (0.9.2)
mini_portile2 (2.4.0)
nokogiri (1.10.4)
mini_portile2 (~> 2.4.0)
pry (0.12.2)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
PLATFORMS
ruby
DEPENDENCIES
nokogiri
pry
BUNDLED WITH
2.0.2
{"html":{"id":[],"class":["client-nojs"]},"head":{"id":[],"class":[]},"meta":{"id":[],"class":[]},"title":{"id":[],"class":[]},"script":{"id":[],"class":[]},"style":{"id":[],"class":[]},"body":{"id":[],"class":["mediawiki","ltr","sitedir-ltr","mw-hide-empty-elt","ns-0","ns-subject","page-メインページ","rootpage-メインページ","skin-vector","action-view"]},"div":{"id":["mngb","gbar","guser","lga","gac_scont","fll","mw-page-base","mw-head-base","content","siteNotice","bodyContent","siteSub","contentSub","jump-to-nav","mw-content-text","mf-tfa","mf-ni","mf-itn","mf-ist","catlinks","mw-data-after-content","mw-navigation","mw-head","p-personal","left-navigation","p-namespaces","p-variants","right-navigation","p-views","p-cactions","p-search","simpleSearch","mw-panel","p-logo","p-navigation","p-help","p-tb","p-wikibase-otherprojects","p-coll-print_export","p-lang","footer"],"class":["gbh","ds","noprint","mw-body","mw-body-content","mw-indicators","mw-content-ltr","mw-parser-output","floatleft","CategoryTreeTag","CategoryTreeSection","CategoryTreeItem","CategoryTreeChildren","hlist","printfooter","catlinks","catlinks-allhidden","visualClear","read-more-container","vectorTabs","vectorMenu","emptyPortlet","portal","body"]},"nobr":{"id":[],"class":[]},"b":{"id":[],"class":["gb1"]},"a":{"id":["gb_70","top"],"class":["gb1","gb4","mw-jump-link","external","text","image","mw-redirect","new","extiw","mw-wiki-logo","interlanguage-link-target","noprint","stopMobileRedirectToggle"]},"u":{"id":[],"class":[]},"span":{"id":["gbn","gbf","gbe","footer",".E9.81.B8.E3.82.8A.E6.8A.9C.E3.81.8D.E8.A8.98.E4.BA.8B","選り抜き記事",".E6.96.B0.E3.81.97.E3.81.84.E8.A8.98.E4.BA.8B","新しい記事",".E6.96.B0.E3.81.97.E3.81.84.E7.94.BB.E5.83.8F","新しい画像",".E5.BC.B7.E5.8C.96.E8.A8.98.E4.BA.8B","強化記事",".E4.BB.8A.E6.97.A5.E3.81.AE.E4.B8.80.E6.9E.9A","今日の一枚",".E4.BB.8A.E6.97.A5.E3.81.AF.E4.BD.95.E3.81.AE.E6.97.A5_10.E6.9C.8821.E6.97.A5","今日は何の日_10月21日",".E9.A2.A8.E7.89.A9.E8.A9.A9","風物詩",".E3.83.9D.E3.83.BC.E3.82.BF.E3.83.AB","ポータル"
,".E3.82.A4.E3.83.B3.E3.83.95.E3.82.A9.E3.83.A1.E3.83.BC.E3.82.B7.E3.83.A7.E3.83.B3","インフォメーション",".E3.82.A6.E3.82.A3.E3.82.AD.E3.83.A1.E3.83.87.E3.82.A3.E3.82.A2.E3.83.97.E3.83.AD.E3.82.B8.E3.82.A7.E3.82.AF.E3.83.88","ウィキメディアプロジェクト"],"class":["gbi","gbf","ds","lsbb","plainlinks","mw-headline","nowrap","CategoryTreeBullet","CategoryTreeToggle"]},"center":{"id":[],"class":[]},"br":{"id":["lgpd"],"class":[]},"img":{"id":["hplogo"],"class":[]},"form":{"id":["searchform"],"class":[]},"table":{"id":["mp-right"],"class":["plainlinks"]},"tr":{"id":[],"class":[]},"td":{"id":[],"class":["fl","sblc","globegris"]},"input":{"id":["tsuid1","gbv","searchInput","mw-searchButton","searchButton"],"class":["lst","lsb","vectorMenuCheckbox","searchButton","mw-fallbackSearchButton"]},"p":{"id":[],"class":[]},"h1":{"id":["firstHeading"],"class":["firstHeading"]},"link":{"id":[],"class":[]},"tbody":{"id":[],"class":[]},"h2":{"id":[],"class":[]},"hr":{"id":[],"class":[]},"ul":{"id":["footer-info","footer-places","footer-icons"],"class":["menu","noprint"]},"li":{"id":["pt-anonuserpage","pt-anontalk","pt-anoncontribs","pt-createaccount","pt-login","ca-nstab-main","ca-talk","ca-view","ca-viewsource","ca-history","n-mainpage","n-portal","n-currentevents","n-newpages","n-recentchanges","n-randompage","n-sandbox","n-commonsupload","n-help","n-villagepump","n-notice","n-bugreportspage","n-sitesupport","n-contact","t-whatlinkshere","t-recentchangeslinked","t-upload","t-specialpages","t-permalink","t-info","t-wikibase","t-cite","coll-create_a_book","coll-download-as-rl","t-print","footer-info-lastmod","footer-info-copyright","footer-places-privacy","footer-places-about","footer-places-disclaimer","footer-places-developers","footer-places-cookiestatement","footer-places-mobileview","footer-copyrightico","footer-poweredbyico"],"class":["selected","collapsible","wb-otherproject-link","wb-otherproject-commons","wb-otherproject-mediawiki","wb-otherproject-meta","wb-otherproject-species","wb-otherproject-
wikibooks","wb-otherproject-wikidata","wb-otherproject-wikimania","wb-otherproject-wikinews","wb-otherproject-wikiquote","wb-otherproject-wikisource","wb-otherproject-wikiversity","wb-otherproject-wiktionary","interlanguage-link","interwiki-ar","interwiki-az","interwiki-bg","interwiki-ca","interwiki-cs","interwiki-da","interwiki-de","interwiki-el","interwiki-en","interwiki-eo","interwiki-es","interwiki-et","interwiki-eu","interwiki-fa","interwiki-fi","interwiki-fr","interwiki-gl","interwiki-he","interwiki-hi","interwiki-hr","interwiki-hu","interwiki-id","interwiki-it","interwiki-ka","interwiki-kk","interwiki-ko","interwiki-la","interwiki-lt","interwiki-mk","interwiki-ms","interwiki-new","interwiki-nl","interwiki-nn","interwiki-no","interwiki-pl","interwiki-pt","interwiki-ro","interwiki-ru","interwiki-sh","interwiki-simple","interwiki-sk","interwiki-sl","interwiki-sr","interwiki-sv","interwiki-th","interwiki-tl","interwiki-tr","interwiki-uk","interwiki-vi","interwiki-zh"]},"small":{"id":[],"class":[]},"noscript":{"id":[],"class":[]},"h3":{"id":["p-personal-label","p-namespaces-label","p-variants-label","p-views-label","p-cactions-label","p-navigation-label","p-help-label","p-tb-label","p-wikibase-otherprojects-label","p-coll-print_export-label","p-lang-label"],"class":[]},"label":{"id":[],"class":[]}}
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
https://example.com
https://google.com
https://ja.wikipedia.org/
require 'json'
require 'open-uri'
require 'nokogiri'
require 'pry'
# A fixed-size pool of worker threads that execute blocks taken from a
# shared FIFO queue.
class ThreadPool
  # Starts +size+ worker threads immediately; they block on the queue
  # until jobs are scheduled.
  #
  # @param size [Integer] number of worker threads to start
  def initialize(size)
    @size = size
    @jobs = Queue.new
    @pool = Array.new(@size) { |index| start_worker(index) }
  end

  # Enqueues a block together with the arguments it will be called with.
  #
  # @param args [Array] arguments passed to the block when it runs
  # @return [Queue] the internal job queue
  def schedule(*args, &block)
    @jobs.push([block, args])
  end

  # Enqueues one exit job per worker (behind every job scheduled so far)
  # and waits for all workers to finish.
  #
  # @return [Array<Thread>] the joined worker threads
  def run!
    @size.times do
      schedule { throw :exit }
    end
    @pool.map { |worker| worker.join }
  end

  private

  # Spawns one worker thread that records its index in the thread-local
  # +:id+ and keeps running queued jobs until one of them throws +:exit+.
  def start_worker(index)
    Thread.new do
      Thread.current[:id] = index
      catch(:exit) do
        loop do
          task, params = @jobs.pop
          task.call(*params)
        end
      end
    end
  end
end
# Crawl every URL listed (one per line) in the file given as the first
# command-line argument and summarize, per HTML element name, which `id`
# and `class` values were seen across all pages. The summary is printed
# to stdout and written to results.json.
FILE_NAME = ARGV[0]
if FILE_NAME.nil?
  puts "Run this command as follows:"
  puts " $ ruby script.rb /path/to/file.csv"
  exit 1
end

# Number of worker threads fetching pages concurrently.
THREADS_NUM = 20

pool = ThreadPool.new(THREADS_NUM)

# element name => { id: [unique ids], class: [unique classes] }
element_id_class_map = {}
# Guards element_id_class_map, which is shared by all worker threads.
map_lock = Mutex.new

File.foreach(FILE_NAME) do |line|
  url = line.strip
  # Blank lines (e.g. a trailing newline at the end of the list) are not URLs.
  next if url.empty?

  pool.schedule do
    begin
      # URI.open is the open-uri entry point for URLs; the block form closes
      # the response IO once the document has been parsed.
      doc = URI.open(url) { |io| Nokogiri::HTML(io) }

      # Collect into a page-local map first so the shared map is only
      # touched once per page, under the lock.
      page_map = {}
      doc.css("*").each do |node|
        entry = (page_map[node.name] ||= { id: [], class: [] })
        node_id = node["id"]
        entry[:id] << node_id if node_id
        entry[:class].concat(node.classes)
      end

      map_lock.synchronize do
        page_map.each do |name, found|
          entry = (element_id_class_map[name] ||= { id: [], class: [] })
          # Array union keeps first-seen order and drops duplicates.
          entry[:id] |= found[:id]
          entry[:class] |= found[:class]
        end
      end
    rescue StandardError => e
      # One unreachable or malformed URL must not kill the worker thread
      # and discard the results gathered from every other page.
      warn "Skipping #{url}: #{e.class}: #{e.message}"
    end
  end
end

pool.run!

json = element_id_class_map.to_json
puts json
# Block form closes the file even if the write raises.
File.open("results.json", "w") { |file| file.puts json }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment