Skip to content

Instantly share code, notes, and snippets.

@hundredwatt
Last active May 16, 2020 21:35
Show Gist options
  • Save hundredwatt/183191d02f1fe0db31c3e910fff8adda to your computer and use it in GitHub Desktop.
Save hundredwatt/183191d02f1fe0db31c3e910fff8adda to your computer and use it in GitHub Desktop.
# Idea from: https://twitter.com/nateberkopec/status/1255573064747712515
require 'fileutils'
require 'net/http'
require 'json'
require 'uri'
require 'csv'
# Minimum HN score used to decide when to stop paginating the search results.
POINT_THRESHOLD = 400
# Algolia HN search API, restricted to stories whose URL contains "github.com".
HN_ALGOLIA_URL = 'https://hn.algolia.com/api/v1/search?query=github.com&restrictSearchableAttributes=url&page=%<page>d'
# GitHub REST API endpoints for repo metadata and the latest commit on a branch.
GITHUB_API_REPO_URL = 'https://api.github.com/repos/%<owner>s/%<repo>s'
GITHUB_API_COMMITS_URL = 'https://api.github.com/repos/%<owner>s/%<repo>s/commits/%<branch>s'
# Personal access token for the GitHub API; may be nil if the env var is unset.
GITHUB_API_TOKEN = ENV['GITHUB_API_TOKEN']
# Local cache directories for raw API responses.
FileUtils.mkdir_p('tmp/repos')
FileUtils.mkdir_p('tmp/latest_commits')
# Page through the Algolia HN search API collecting stories whose URL points
# at a GitHub repository, writing one JSON object per line to
# tmp/stories.jsonl. Pagination stops once a page contains a story below
# POINT_THRESHOLD (assumes the API returns stories in roughly descending
# point order — TODO confirm against the Algolia relevance ranking).
def fetch_hn_stories
  page = 0
  # Block form guarantees the file is closed even if a request/parse raises.
  File.open('tmp/stories.jsonl', 'w') do |f|
    loop do
      uri = URI.parse(HN_ALGOLIA_URL % { page: page })
      response = Net::HTTP.get_response(uri)
      results = JSON.parse(response.body)
      hits = results['hits']
      hits.select do |hit|
        # Remove Github Blog Posts
        next if hit['url'].include?('github.com/blog/')
        # Remove Github Feature Pages
        next if hit['url'].include?('github.com/features/')
        # Remove other non-repo Github URLs
        hit['url'].match %r{https?://github.com/[^/]+/[^/]+/?$}
      end.each do |hit|
        f.puts hit.to_json
      end
      page += 1
      # `min` on an empty page is nil (past the last page of results), and
      # comparing nil would raise — treat it as "stop" as well.
      min_points = hits.map { |i| i['points'].to_i }.min
      break if min_points.nil? || POINT_THRESHOLD > min_points
    end
  end
end
# For each story in tmp/stories.jsonl, fetch the repository metadata from the
# GitHub API and cache the raw JSON response under
# tmp/repos/<owner>-<repo>.json. Follows 301 redirects (renamed repos), skips
# 404s (deleted repos), and aborts on rate limiting (403) or any other
# unexpected status.
def fetch_github_repos
  File.readlines('tmp/stories.jsonl').each do |line|
    story = JSON.parse(line)
    match = story['url'].match %r{https?://github.com/([^/]+)/([^/]+)/?$}
    # delete_suffix strips only a trailing ".git"; String#sub('.git', '')
    # would clobber the first ".git" occurring anywhere in the repo name.
    uri = URI.parse(GITHUB_API_REPO_URL % { owner: match[1], repo: match[2].delete_suffix('.git') })
    response_file = "tmp/repos/#{match[1]}-#{match[2]}.json"
    # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    next if File.exist?(response_file)
    response = nil
    loop do
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true
      request = Net::HTTP::Get.new(uri.to_s)
      request['Authorization'] = "Bearer #{GITHUB_API_TOKEN}"
      response = http.request(request)
      case response.code
      when '403' # Fail
        fail 'API rate limit exceeded'
      when '301' # Follow the redirect
        uri = URI.parse(JSON.parse(response.body)['url'])
        next
      when '404' # Skip story
        response = nil
        break
      when '200' # Continue parent loop
        break
      else # Fail on other errors
        fail "API Error: #{response.code}"
      end
    end
    File.open(response_file, 'w') { |f| f.puts response.body } if response
  end
end
# For each cached repo JSON in tmp/repos/, fetch the latest commit on the
# repo's default branch from the GitHub API and cache the raw response under
# tmp/latest_commits/<owner>-<name>.json. Fails on any non-200 response.
def fetch_github_latest_commits
  Dir['tmp/repos/*'].each do |repo_file|
    repo = JSON.parse(File.read(repo_file))
    owner, name, default_branch = repo.values_at('owner', 'name', 'default_branch')
    owner = owner['login']
    uri = URI.parse(GITHUB_API_COMMITS_URL % { owner: owner, repo: name, branch: default_branch })
    response_file = "tmp/latest_commits/#{owner}-#{name}.json"
    # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    next if File.exist?(response_file)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    request = Net::HTTP::Get.new(uri.to_s)
    request['Authorization'] = "Bearer #{GITHUB_API_TOKEN}"
    response = http.request(request)
    fail "API Error: #{response.code}" unless response.code == '200'
    File.open(response_file, 'w') { |f| f.puts response.body }
  end
end
# Join the cached HN stories, repo metadata, and latest-commit data into a
# single spreadsheet at tmp/results.csv. Missing cache files (e.g. repos that
# 404'd) produce rows with blank repo/commit columns.
def generate_results_csv
  rows = []
  headers = []
  # Record a column header the first time it is seen, preserving order.
  # A lambda rather than a nested `def`, which would (re)define a method on
  # Object every time this method runs.
  h = ->(value) { headers << value unless headers.include?(value) }
  File.readlines('tmp/stories.jsonl').each do |line|
    story = JSON.parse(line)
    match = story['url'].match %r{https?://github.com/([^/]+)/([^/]+)/?$}
    repo_file = "tmp/repos/#{match[1]}-#{match[2]}.json"
    # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    repo = File.exist?(repo_file) ? JSON.parse(File.read(repo_file)) : {}
    owner, name, _default_branch = repo.values_at('owner', 'name', 'default_branch')
    owner = owner&.dig('login')
    latest_commit_file = "tmp/latest_commits/#{owner}-#{name}.json"
    latest_commit = File.exist?(latest_commit_file) ? JSON.parse(File.read(latest_commit_file)) : {}
    [].tap do |row|
      h.call 'HN Points'
      row << story['points']
      h.call 'HN Posted Date'
      row << story['created_at'].split("T").first
      h.call 'HN Title'
      row << story['title']
      h.call 'HN Story URL'
      row << "https://news.ycombinator.com/item?id=#{story['objectID']}"
      h.call 'Repo URL'
      row << repo['html_url']
      h.call 'Repo Name'
      row << repo['name']
      h.call 'Repo Description'
      row << repo['description']
      h.call 'Repo Created'
      row << repo['created_at']&.split("T")&.first
      h.call 'Repo Latest Commit At'
      row << latest_commit.dig('commit', 'committer', 'date')&.split("T")&.first
      h.call 'Stars'
      row << repo['stargazers_count']
      h.call 'Watchers'
      row << repo['watchers_count']
      h.call 'Forks'
      row << repo['forks']
      h.call 'Open Issues'
      row << repo['open_issues']
      rows << row
    end
  end
  File.open('tmp/results.csv', 'w') do |f|
    f.puts CSV.generate(headers: headers, write_headers: true) { |csv| rows.each { |row| csv << row } }
  end
end
# Uncomment the pipeline stages below and run them in order:
# fetch_hn_stories
# fetch_github_repos
# fetch_github_latest_commits
# generate_results_csv
@hundredwatt
Copy link
Author

hundredwatt commented Apr 29, 2020

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment