Last active
May 16, 2020 21:35
-
-
Save hundredwatt/183191d02f1fe0db31c3e910fff8adda to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Idea from: https://twitter.com/nateberkopec/status/1255573064747712515
require 'fileutils'
require 'net/http'
require 'json'
require 'uri'
require 'csv'

# Minimum HN score for a story to be kept in the results.
POINT_THRESHOLD = 400

# URL templates filled via format() with %<name> keyword substitution.
HN_ALGOLIA_URL = 'https://hn.algolia.com/api/v1/search?query=github.com&restrictSearchableAttributes=url&page=%<page>d'.freeze
GITHUB_API_REPO_URL = 'https://api.github.com/repos/%<owner>s/%<repo>s'.freeze
GITHUB_API_COMMITS_URL = 'https://api.github.com/repos/%<owner>s/%<repo>s/commits/%<branch>s'.freeze

# Personal access token; nil is tolerated, but unauthenticated requests
# fall under GitHub's much lower rate limit.
GITHUB_API_TOKEN = ENV['GITHUB_API_TOKEN']

# Local cache directories for API responses (idempotent).
FileUtils.mkdir_p('tmp/repos')
FileUtils.mkdir_p('tmp/latest_commits')
# Page through Algolia's HN search for stories linking to github.com and
# append each bare-repo-link story as one JSON line to tmp/stories.jsonl.
# Stops once a page's lowest-scored hit falls below POINT_THRESHOLD, or
# when Algolia runs out of pages.
def fetch_hn_stories
  page = 0
  # Block form guarantees the file is closed even if a request raises.
  File.open('tmp/stories.jsonl', 'w') do |f|
    loop do
      uri = URI.parse(format(HN_ALGOLIA_URL, page: page))
      response = Net::HTTP.get_response(uri)
      results = JSON.parse(response.body)
      hits = results['hits']
      # Guard: an empty page would make .min below return nil and crash.
      break if hits.empty?

      hits.select do |hit|
        url = hit['url']
        # Ask/Show HN stories can have no URL at all
        next if url.nil?
        # Remove Github Blog Posts
        next if url.include?('github.com/blog/')
        # Remove Github Feature Pages
        next if url.include?('github.com/features/')
        # Keep only bare repo URLs: github.com/<owner>/<repo>
        url.match %r{https?://github.com/[^/]+/[^/]+/?$}
      end.each { |hit| f.puts hit.to_json }

      page += 1
      break if POINT_THRESHOLD > hits.map { |i| i['points'].to_i }.min
    end
  end
end
# For each story in tmp/stories.jsonl, fetch the repo's metadata from the
# GitHub API and cache it as tmp/repos/<owner>-<repo>.json.
# Follows 301 redirects, skips 404s (deleted/private repos), and aborts
# the whole run on 403 (rate limited) or any other error status.
def fetch_github_repos
  File.readlines('tmp/stories.jsonl').each do |line|
    story = JSON.parse(line)
    match = story['url'].match %r{https?://github.com/([^/]+)/([^/]+)/?$}
    # Defensive: only repo URLs should be in the file, but skip any stray line.
    next unless match

    # Strip ".git" only as a trailing suffix; a bare sub('.git', '')
    # would also mangle repo names like "my.github-tools".
    repo_name = match[2].sub(/\.git\z/, '')
    uri = URI.parse(format(GITHUB_API_REPO_URL, owner: match[1], repo: repo_name))
    response_file = "tmp/repos/#{match[1]}-#{match[2]}.json"
    # File.exists? was removed in Ruby 3.2 — use File.exist?.
    next if File.exist?(response_file) # already cached

    response = nil
    loop do
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true
      request = Net::HTTP::Get.new(uri.to_s)
      request['Authorization'] = "Bearer #{GITHUB_API_TOKEN}"
      response = http.request(request)

      case response.code
      when '403' # Rate limited — abort the whole run
        fail 'API rate limit exceeded'
      when '301' # Repo moved — follow the redirect
        uri = URI.parse(JSON.parse(response.body)['url'])
        next
      when '404' # Repo gone/private — skip this story
        response = nil
        break
      when '200' # Success — continue parent loop
        break
      else # Fail on other errors
        fail "API Error: #{response.code}"
      end
    end

    File.open(response_file, 'w') { |f| f.puts response.body } if response
  end
end
# For every cached repo file, fetch the most recent commit on the repo's
# default branch from the GitHub API and cache it as
# tmp/latest_commits/<owner>-<name>.json. Fails hard on any non-200.
def fetch_github_latest_commits
  Dir['tmp/repos/*'].each do |repo_file|
    repo = JSON.parse(File.read(repo_file))
    owner, name, default_branch = repo.values_at('owner', 'name', 'default_branch')
    owner = owner['login'] # 'owner' is a nested object in the API payload
    uri = URI.parse(format(GITHUB_API_COMMITS_URL, owner: owner, repo: name, branch: default_branch))
    response_file = "tmp/latest_commits/#{owner}-#{name}.json"
    # File.exists? was removed in Ruby 3.2 — use File.exist?.
    next if File.exist?(response_file) # already cached

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    request = Net::HTTP::Get.new(uri.to_s)
    request['Authorization'] = "Bearer #{GITHUB_API_TOKEN}"
    response = http.request(request)
    fail "API Error: #{response.code}" unless response.code == '200'

    File.open(response_file, 'w') { |f| f.puts response.body }
  end
end
# Join stories (tmp/stories.jsonl) with their cached repo metadata and
# latest-commit payloads into tmp/results.csv. Missing cache files simply
# produce blank cells rather than raising.
def generate_results_csv
  rows = []
  headers = []
  # Record a column header the first time it's used, keeping header order
  # aligned with the order values are pushed below. (A lambda, not a nested
  # `def`, which would redefine a global method on every call.)
  h = ->(value) { headers << value unless headers.include?(value) }

  File.readlines('tmp/stories.jsonl').each do |line|
    story = JSON.parse(line)
    match = story['url'].match %r{https?://github.com/([^/]+)/([^/]+)/?$}
    repo_file = "tmp/repos/#{match[1]}-#{match[2]}.json"
    # File.exists? was removed in Ruby 3.2 — use File.exist?.
    repo = File.exist?(repo_file) ? JSON.parse(File.read(repo_file)) : {}
    owner, name, _default_branch = repo.values_at('owner', 'name', 'default_branch')
    owner = owner&.dig('login') # nil when the repo was never fetched (404 etc.)
    latest_commit_file = "tmp/latest_commits/#{owner}-#{name}.json"
    latest_commit = File.exist?(latest_commit_file) ? JSON.parse(File.read(latest_commit_file)) : {}

    row = []
    h.call 'HN Points'
    row << story['points']
    h.call 'HN Posted Date'
    row << story['created_at'].split('T').first
    h.call 'HN Title'
    row << story['title']
    h.call 'HN Story URL'
    row << "https://news.ycombinator.com/item?id=#{story['objectID']}"
    h.call 'Repo URL'
    row << repo['html_url']
    h.call 'Repo Name'
    row << repo['name']
    h.call 'Repo Description'
    row << repo['description']
    h.call 'Repo Created'
    row << repo['created_at']&.split('T')&.first
    h.call 'Repo Latest Commit At'
    row << latest_commit.dig('commit', 'committer', 'date')&.split('T')&.first
    h.call 'Stars'
    row << repo['stargazers_count']
    h.call 'Watchers'
    row << repo['watchers_count']
    h.call 'Forks'
    row << repo['forks']
    h.call 'Open Issues'
    row << repo['open_issues']
    rows << row
  end

  File.open('tmp/results.csv', 'w') do |f|
    f.puts CSV.generate(headers: headers, write_headers: true) { |csv| rows.each { |r| csv << r } }
  end
end
# Uncomment the stages to run (each caches its output under tmp/):
# fetch_hn_stories
# fetch_github_repos
# fetch_github_latest_commits
# generate_results_csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results as of 2020-05-16: https://docs.google.com/spreadsheets/u/1/d/1MUCRsbuxzXR_WBQ857RpLlyOdAs6PKsoT6N_gCcfF0Y/edit#gid=1416647086