Skip to content

Instantly share code, notes, and snippets.

@elct9620
Created June 30, 2018 10:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save elct9620/2f6ed007ed91ed0f6a52e6aa47335c35 to your computer and use it in GitHub Desktop.
Save elct9620/2f6ed007ed91ed0f6a52e6aa47335c35 to your computer and use it in GitHub Desktop.
# frozen_string_literal: true
require 'net/http'
require 'awesome_print'
require 'json'
require 'benchmark'
# NOTE: ZH_TO_NUM BEGIN
# Chinese numeral characters mapped to the digit each one stands for.
# '兩' is listed as a second spelling of 2.
ZH_NUM_MAP = {
  '一' => 1,
  '二' => 2,
  '三' => 3,
  '四' => 4,
  '五' => 5,
  '六' => 6,
  '七' => 7,
  '八' => 8,
  '九' => 9,
  '兩' => 2
}.freeze
# Multiplier characters; only "ten" is supported.
ZH_DESC_MAP = {
  '十' => 10
}.freeze
# Regexp source (a String, interpolated by zh_to_num) matching a run of ASCII
# digits, the numeral characters above, or '半' (half).
RULE = "[0-9#{ZH_NUM_MAP.keys.join}#{ZH_DESC_MAP.keys.join}半]+".freeze
# rubocop:disable Metrics/PerceivedComplexity
# rubocop:disable Metrics/MethodLength
# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/AbcSize
# Converts the first "<number>[個]<年|月>[半]" phrase found in +str+ into a
# number of years.
#
# The number may be ASCII digits, a single Chinese numeral, a Chinese numeral
# built around '十' (ten), or '半' (half). A trailing '半' adds 0.5 before the
# unit conversion, and a '月' (month) unit divides the amount by 12.
#
# @param str [String] free text to search
# @return [Numeric] years rounded to two decimals, or -1 when no phrase matches
def zh_to_num(str)
  matched = str.match(/(#{RULE})個?\s*([年月])(半)?/)
  return -1 unless matched

  amount, unit, trailing_half = matched.captures
  total =
    if !amount.include?('十')
      # Plain digits or a single numeral character; whichever does not apply
      # contributes 0.
      amount.to_i + ZH_NUM_MAP[amount].to_i
    elsif amount.start_with?('十')
      # "十X" => 10 + X
      10.0 + ZH_NUM_MAP[amount[1]].to_i
    else
      # "X十Y" => X * 10 + Y
      (ZH_NUM_MAP[amount[0]].to_i * 10).to_f + ZH_NUM_MAP[amount[2]].to_i
    end

  total += 0.5 if trailing_half || amount == '半'
  total = total.to_f / 12 if unit == '月'
  total.round(2)
end
# rubocop:enable Metrics/AbcSize
# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/MethodLength
# rubocop:enable Metrics/PerceivedComplexity
# NOTE: ZH_TO_NUM END
# Extracts the reply text from one reply's HTML and splits it into fields.
#
# @param content [String] HTML fragment of a single reply
# @return [Array<String>] the plurk_content text with tags removed, split on
#   "/" or full-width "／"; empty when no plurk_content span is present
def split(content)
  # String#[] yields nil when the span is missing; to_s turns that into an
  # empty field list instead of a NoMethodError that would abort the parse.
  content[PLURK_CONTENT, 1]
    .to_s
    .gsub(HTML, '')
    .split(SEPARATOR)
end
# rubocop:disable Metrics/MethodLength
# rubocop:disable Metrics/AbcSize
# rubocop:disable Metrics/CyclomaticComplexity
# Builds the normalized record for one parsed reply.
#
# Values present in +patch+ take precedence over values parsed from +item+.
#
# NOTE(review): defined at top level, this replaces Kernel#format for every
# object in the process; the name is kept because callers use it.
#
# @param id [String] reply id; stored as an Integer via to_i
# @param item [Array<String>] fields read by index: 0 age, 1 school, 2 city,
#   3 job, 4 years of experience, 5 salary
# @param patch [Hash, nil] optional overrides keyed by 'age', 'school',
#   'city', 'job', 'year', 'salary'
# @return [Hash] record with symbol keys plus the untouched fields under :raw
def format(id, item, patch)
  patch ||= {}
  # `compact`, not `compact!`: the bang form returns nil when there is nothing
  # to remove, which raised NoMethodError whenever every capture matched.
  parsed_salary = item[5].scan(SALARY_RULE).flatten.compact.map(&:upcase)
  {
    rid: id.to_i,
    age: patch['age'] || item[0][AGE, 1].to_i,
    school: patch['school'] || item[1],
    city: patch['city'] || item[2],
    job: patch['job'] || item[3],
    year: patch['year'] || zh_to_num(item[4]),
    salary: patch['salary']&.map(&:upcase) || parsed_salary,
    raw: {
      age: item[0],
      school: item[1],
      city: item[2],
      job: item[3],
      year: item[4],
      salary: item[5]
    }
  }
end
# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/AbcSize
# rubocop:enable Metrics/MethodLength
# rubocop:disable Metrics/BlockLength
# Script entry point. Downloads the Plurk thread, parses each reply into a
# record, optionally writes the JSON to a local file (-f FILE), uploads it to
# JSON_URL, and prints debug output (-du: upload response, -dr: parsed rows).
# Each phase is timed with Benchmark.
Benchmark.bm(12) do |x|
  URL = URI('https://www.plurk.com/p/mtxvw5')
  JSON_URL = URI('https://api.myjson.com/bins/ccxim')
  BASE_PATH = File.dirname(__FILE__)

  # Optional manual corrections, looked up by reply id in the parser below.
  patch_path = File.join(BASE_PATH, 'patch.json')
  patch = File.exist?(patch_path) ? JSON.parse(File.read(patch_path)) : {}

  res = ''
  x.report 'HTTP Request' do
    res = Net::HTTP.get(URL).force_encoding('UTF-8')
  end

  PLURK_RESPONSE = %r{<li[^>]*data-rid="(\d+)"[^>]*>(.*?)</li>}m
  PLURK_CONTENT = %r{<span class="plurk_content">(.*?)</span>}
  SALARY_RULE = %r{(\d+[Kk])+\s*([+\-*/])*\s*(日幣|美金)?}
  HTML = /<.*?>/
  AGE = /(\d+)\s*歲/
  SEPARATOR = %r{\s*[\/\uFF0F]\s*}

  rows = {}
  x.report 'Parser' do
    rows =
      res
      .scan(PLURK_RESPONSE)
      .map { |id, content| [id, split(content)] }
      # Keep only replies with exactly six fields, an age in the first field
      # and something salary-like in the last one.
      .select { |item| item[1].size == 6 }
      .select { |item| item[1][0].include?('歲') }
      .reject { |item| (item[1][5] =~ SALARY_RULE).nil? }
      .map { |id, item| format(id, item, patch[id]) }
      # zh_to_num returns -1 (or 0) when the experience field is unparseable.
      .select { |item| item[:year] > 0 }
  end

  # TODO: Fix data by using patch.json
  # Only treat the argument after -f as the output file when -f was actually
  # given; previously a missing -f silently selected ARGV[1].
  flag_index = ARGV.index('-f')
  filename = flag_index && ARGV[flag_index + 1]

  json = {
    total: rows.size,
    items: rows,
    updated_at: Time.now.to_i
  }.to_json

  x.report 'Save' do
    # Skip when no file was named or when the next argument is another flag.
    File.write(filename, json) unless filename.nil? || filename.start_with?('-')
  end

  x.report 'Upload' do
    req = Net::HTTP::Put.new(JSON_URL)
    req.body = json
    req.content_type = 'application/json; charset=utf-8'
    # Separate local so the downloaded page in `res` is not overwritten.
    upload_res = Net::HTTP.start(
      JSON_URL.hostname,
      JSON_URL.port,
      use_ssl: true
    ) do |http|
      http.request(req)
    end
    ap JSON.parse(upload_res.body) if ARGV.include?('-du')
  end

  ap rows if ARGV.include?('-dr')
end
# rubocop:enable Metrics/BlockLength
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment