Skip to content

Instantly share code, notes, and snippets.

@hundredwatt
Last active May 16, 2020 21:35
Show Gist options
  • Save hundredwatt/183191d02f1fe0db31c3e910fff8adda to your computer and use it in GitHub Desktop.
Save hundredwatt/183191d02f1fe0db31c3e910fff8adda to your computer and use it in GitHub Desktop.
# Idea from: https://twitter.com/nateberkopec/status/1255573064747712515
require 'fileutils'
require 'net/http'
require 'json'
require 'uri'
require 'csv'
# Minimum HN score used to decide when to stop paginating the search results.
POINT_THRESHOLD = 400
# Algolia HN search API, restricted to stories whose URL contains "github.com".
HN_ALGOLIA_URL = 'https://hn.algolia.com/api/v1/search?query=github.com&restrictSearchableAttributes=url&page=%<page>d'
# GitHub REST API endpoints for repo metadata and the latest commit on a branch.
GITHUB_API_REPO_URL = 'https://api.github.com/repos/%<owner>s/%<repo>s'
GITHUB_API_COMMITS_URL = 'https://api.github.com/repos/%<owner>s/%<repo>s/commits/%<branch>s'
# Personal access token for the GitHub API; may be nil if the env var is unset.
GITHUB_API_TOKEN = ENV['GITHUB_API_TOKEN']
# Local cache directories for raw API responses.
FileUtils.mkdir_p('tmp/repos')
FileUtils.mkdir_p('tmp/latest_commits')
# Page through the Algolia HN search API collecting stories whose URL points
# at a GitHub repository, writing one JSON object per line to
# tmp/stories.jsonl. Pagination stops once a page contains a story below
# POINT_THRESHOLD (assumes the API returns stories in roughly descending
# point order — TODO confirm against the Algolia relevance ranking).
def fetch_hn_stories
  page = 0
  # Block form guarantees the file is closed even if a request/parse raises.
  File.open('tmp/stories.jsonl', 'w') do |f|
    loop do
      uri = URI.parse(HN_ALGOLIA_URL % { page: page })
      response = Net::HTTP.get_response(uri)
      results = JSON.parse(response.body)
      hits = results['hits']
      hits.select do |hit|
        # Remove Github Blog Posts
        next if hit['url'].include?('github.com/blog/')
        # Remove Github Feature Pages
        next if hit['url'].include?('github.com/features/')
        # Remove other non-repo Github URLs
        hit['url'].match %r{https?://github.com/[^/]+/[^/]+/?$}
      end.each do |hit|
        f.puts hit.to_json
      end
      page += 1
      # `min` on an empty page is nil (past the last page of results), and
      # comparing nil would raise — treat it as "stop" as well.
      min_points = hits.map { |i| i['points'].to_i }.min
      break if min_points.nil? || POINT_THRESHOLD > min_points
    end
  end
end
# For each story in tmp/stories.jsonl, fetch the repository metadata from the
# GitHub API and cache the raw JSON response under
# tmp/repos/<owner>-<repo>.json. Follows 301 redirects (renamed repos), skips
# 404s (deleted repos), and aborts on rate limiting (403) or any other
# unexpected status.
def fetch_github_repos
  File.readlines('tmp/stories.jsonl').each do |line|
    story = JSON.parse(line)
    match = story['url'].match %r{https?://github.com/([^/]+)/([^/]+)/?$}
    # delete_suffix strips only a trailing ".git"; String#sub('.git', '')
    # would clobber the first ".git" occurring anywhere in the repo name.
    uri = URI.parse(GITHUB_API_REPO_URL % { owner: match[1], repo: match[2].delete_suffix('.git') })
    response_file = "tmp/repos/#{match[1]}-#{match[2]}.json"
    # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    next if File.exist?(response_file)
    response = nil
    loop do
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true
      request = Net::HTTP::Get.new(uri.to_s)
      request['Authorization'] = "Bearer #{GITHUB_API_TOKEN}"
      response = http.request(request)
      case response.code
      when '403' # Fail
        fail 'API rate limit exceeded'
      when '301' # Follow the redirect
        uri = URI.parse(JSON.parse(response.body)['url'])
        next
      when '404' # Skip story
        response = nil
        break
      when '200' # Continue parent loop
        break
      else # Fail on other errors
        fail "API Error: #{response.code}"
      end
    end
    File.open(response_file, 'w') { |f| f.puts response.body } if response
  end
end
# For each cached repo JSON in tmp/repos/, fetch the latest commit on the
# repo's default branch from the GitHub API and cache the raw response under
# tmp/latest_commits/<owner>-<name>.json. Fails on any non-200 response.
def fetch_github_latest_commits
  Dir['tmp/repos/*'].each do |repo_file|
    repo = JSON.parse(File.read(repo_file))
    owner, name, default_branch = repo.values_at('owner', 'name', 'default_branch')
    owner = owner['login']
    uri = URI.parse(GITHUB_API_COMMITS_URL % { owner: owner, repo: name, branch: default_branch })
    response_file = "tmp/latest_commits/#{owner}-#{name}.json"
    # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    next if File.exist?(response_file)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    request = Net::HTTP::Get.new(uri.to_s)
    request['Authorization'] = "Bearer #{GITHUB_API_TOKEN}"
    response = http.request(request)
    fail "API Error: #{response.code}" unless response.code == '200'
    File.open(response_file, 'w') { |f| f.puts response.body }
  end
end
# Join the cached HN stories, repo metadata, and latest-commit data into a
# single spreadsheet at tmp/results.csv. Missing cache files (e.g. repos that
# 404'd) produce rows with blank repo/commit columns.
def generate_results_csv
  rows = []
  headers = []
  # Record a column header the first time it is seen, preserving order.
  # A lambda rather than a nested `def`, which would (re)define a method on
  # Object every time this method runs.
  h = ->(value) { headers << value unless headers.include?(value) }
  File.readlines('tmp/stories.jsonl').each do |line|
    story = JSON.parse(line)
    match = story['url'].match %r{https?://github.com/([^/]+)/([^/]+)/?$}
    repo_file = "tmp/repos/#{match[1]}-#{match[2]}.json"
    # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    repo = File.exist?(repo_file) ? JSON.parse(File.read(repo_file)) : {}
    owner, name, _default_branch = repo.values_at('owner', 'name', 'default_branch')
    owner = owner&.dig('login')
    latest_commit_file = "tmp/latest_commits/#{owner}-#{name}.json"
    latest_commit = File.exist?(latest_commit_file) ? JSON.parse(File.read(latest_commit_file)) : {}
    [].tap do |row|
      h.call 'HN Points'
      row << story['points']
      h.call 'HN Posted Date'
      row << story['created_at'].split("T").first
      h.call 'HN Title'
      row << story['title']
      h.call 'HN Story URL'
      row << "https://news.ycombinator.com/item?id=#{story['objectID']}"
      h.call 'Repo URL'
      row << repo['html_url']
      h.call 'Repo Name'
      row << repo['name']
      h.call 'Repo Description'
      row << repo['description']
      h.call 'Repo Created'
      row << repo['created_at']&.split("T")&.first
      h.call 'Repo Latest Commit At'
      row << latest_commit.dig('commit', 'committer', 'date')&.split("T")&.first
      h.call 'Stars'
      row << repo['stargazers_count']
      h.call 'Watchers'
      row << repo['watchers_count']
      h.call 'Forks'
      row << repo['forks']
      h.call 'Open Issues'
      row << repo['open_issues']
      rows << row
    end
  end
  File.open('tmp/results.csv', 'w') do |f|
    f.puts CSV.generate(headers: headers, write_headers: true) { |csv| rows.each { |row| csv << row } }
  end
end
# Uncomment the pipeline stages below and run them in order:
# fetch_hn_stories
# fetch_github_repos
# fetch_github_latest_commits
# generate_results_csv
@hundredwatt
Copy link
Author

hundredwatt commented Apr 29, 2020

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment