Skip to content

Instantly share code, notes, and snippets.

@funzoneq
Created July 22, 2016 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save funzoneq/ab42de396437c016b40964aed0671bc6 to your computer and use it in GitHub Desktop.
Save funzoneq/ab42de396437c016b40964aed0671bc6 to your computer and use it in GitHub Desktop.
Mirror a website's HTML pages, extract the words they contain, and check whether domain names formed from those words and a matching TLD are free.
require 'optparse'
require 'httparty'
require 'pp'
# Parsed command-line options. :domain is required; :verbose is only set when
# the -v / --[no-]verbose flag is given.
options = { domain: nil }

# Keep a reference to the parser itself. OptionParser#parse! returns the array
# of leftover arguments, so chaining it onto the constructor would leave
# `optparse` holding that array instead of the usage text printed below.
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: example.rb [options]"

  opts.on("-dDOMAIN", "--domain=DOMAIN", "Domain to download") do |n|
    options[:domain] = n
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  opts.on("-h", "--help", "Prints this help") do
    puts opts
    exit
  end
end
optparse.parse!

# A domain is mandatory: explain, show the usage text, and exit with a
# non-zero status so callers can detect the failure.
if options[:domain].nil?
  puts "You need to specify a domain to graze content from. Eg. fetch.rb -d leaseweb.com"
  puts optparse
  exit 1
end
# Builds the wget command line used to mirror a site's HTML pages.
#
# The command is only returned as a string; nothing is executed here.
#
# @param domain [String] host name to crawl; interpolated verbatim (not shell-escaped)
# @return [String] single-line wget invocation targeting http://<domain>
def wget(domain)
  [
    'wget',
    '--recursive',
    '-A html',
    '--no-clobber',
    '--html-extension',
    '--convert-links',
    '--restrict-file-names=windows',
    "--domains #{domain}",
    '--no-parent',
    "http://#{domain}"
  ].join(' ')
end
# Builds the shell pipeline that lists every distinct word found in the
# mirrored HTML files under ./<domain>.
#
# grep is run with -h so matches are printed without a "path:" prefix. xargs
# passes many files to a single grep invocation, and without -h each word
# would be prefixed by its file name, defeating `sort -u` across files.
#
# The pipeline is only returned as a string; nothing is executed here.
#
# @param domain [String] directory name under the current directory;
#   interpolated verbatim (not shell-escaped)
# @return [String] a find | xargs grep | sort pipeline
def find_html_files(domain)
  [
    "find ./#{domain} -name '*.html' -type f -print0",
    %q(xargs -0 grep -o -h -E '\w+'),
    'sort -u'
  ].join(' | ')
end
# TODO: check how old the file is
# Download IANA's TLD list only when no local copy exists; later runs reuse
# the file on disk.
tld_cache = 'tlds-alpha-by-domain.txt'
unless File.exist?(tld_cache)
  response = HTTParty.get('https://data.iana.org/TLD/tlds-alpha-by-domain.txt')
  File.write(tld_cache, response.parsed_response)
end

# One lower-cased entry per line of the file; lines starting with '#' are dropped.
tlds = File.read(tld_cache)
           .split("\n")
           .reject { |line| line.start_with?('#') }
           .map(&:downcase)
# Build the mirroring command. It is not run here: the line that executed it
# is commented out, so the ./<domain> directory must already exist.
# NOTE(review): Kernel#exec replaces the current process and never returns, so
# re-enabling the line below as-is would end the script at that point;
# Kernel#system is probably what was intended.
command = wget(options[:domain])
#puts exec(command)

# Collect every distinct lower-cased word reported by the find/grep pipeline.
command = find_html_files(options[:domain])
words = `#{command}`.split("\n").map(&:downcase).uniq
puts words.length

# For every TLD, turn each word that ends in it (e.g. "estates" + "es") into a
# candidate domain ("estat.es") and print what nslookup.io reports for it.
tlds.each do |tld|
  # A blank entry would match every word and yield names like "word.".
  next if tld.empty?

  words.each do |word|
    # Both lists are already lower-cased, so a plain suffix test is enough.
    # Words identical to the TLD are skipped because nothing would be left
    # before the dot.
    next unless word.end_with?(tld) && word != tld

    # Strip only the trailing TLD. Removing every occurrence (as gsub did)
    # mangles words that also contain the TLD earlier, e.g. "estates".
    stem = word[0, word.length - tld.length]
    domain = "#{stem}.#{tld}"

    begin
      response = HTTParty.get("http://nslookup.io/free/#{domain}")
      puts "#{domain} = #{response.parsed_response['Free']}"
    rescue StandardError
      # Best effort: report the failure and move on to the next candidate.
      puts "failed for #{domain}"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment