Skip to content

Instantly share code, notes, and snippets.

@prinsss
Created December 27, 2015 04:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save prinsss/8efc424f8033f7f008ed to your computer and use it in GitHub Desktop.
Save prinsss/8efc424f8033f7f008ed to your computer and use it in GitHub Desktop.
Python 一言爬虫
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import sys
# Python version check: the script imports urllib.request further down, which
# only exists on Python 3, so bail out early on older interpreters.
if sys.version_info[0] < 3:
    print("This script only works on Python 3+.")
    # sys.exit() is always available; the bare exit() helper is injected by
    # the site module and is missing under `python -S` or in frozen builds.
    sys.exit(1)
import os, time
import urllib.request
import json
import argparse
# Module-level crawler state, shared by the functions below.
h_dict = dict()            # hitokoto id -> status marker ("Loaded." / "Hit")
start_time = time.time()   # reference point for the elapsed time in the progress bar
saved_mount = 0            # number of new hitokotos written to disk in this run
repeated_mount = 0         # number of fetched hitokotos that were already known
# Save & load: progress is persisted to disk so an interrupted run can resume.
# NOTE: both files use "\n" as the line terminator, so do not open them with
# an editor that cannot handle bare "\n" line endings (e.g. old Notepad).
def writeToFile(hid, content):
    """Append one saved hitokoto to the on-disk records.

    The id goes to ``hid.txt`` and the text to ``hitokoto.txt``, one entry
    per line, both relative to the current working directory.

    Args:
        hid: hitokoto id; converted with ``str()`` before writing.
        content: hitokoto text to store (a ``str``).
    """
    # encoding="utf-8": the text is mostly CJK; relying on the locale default
    # can raise UnicodeEncodeError or produce files in an unexpected encoding.
    # newline="\n": keeps the terminator a bare "\n" on every platform, which
    # is what the note above promises (text mode would write "\r\n" on Windows).
    with open("hid.txt", "a", encoding="utf-8", newline="\n") as f:
        f.write(str(hid) + "\n")
    with open("hitokoto.txt", "a", encoding="utf-8", newline="\n") as f:
        f.write(content + "\n")
def loadFromFile():
    """Populate the global ``h_dict`` with the ids stored in ``hid.txt``.

    Every non-blank line of ``hid.txt`` (in the current working directory) is
    parsed as an integer id and registered in ``h_dict`` with the marker
    ``"Loaded."``. A summary line is printed afterwards. If the file does not
    exist, a notice is printed and ``h_dict`` is left untouched.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    try:
        with open("hid.txt", "r", encoding="utf-8") as f:
            loadedCount = 0
            # Iterate the file lazily instead of materialising readlines().
            for line in f:
                entry = line.strip()
                if not entry:
                    # Blank lines (e.g. a stray trailing newline) used to
                    # crash int(); they carry no id, so skip them.
                    continue
                h_dict[int(entry)] = "Loaded."
                loadedCount += 1
    except FileNotFoundError:
        # Nothing is created here; the files appear on the first save.
        print("No saved files, created." + "\n")
    else:
        print(str(loadedCount) + " ids successfully loaded from file." + "\n")
# Show a progress bar together with a short status message.
# @progress as int: range 0 to 100
# @start_time as float: a timestamp as returned by time.time()
# @msg as string: message to show
def showProgressBar(progress, start_time, msg):
    """Redraw a single-line progress bar on stdout.

    The line looks like::

        Msg:        50% [====================>                    ] in 0.9s

    and ends with a carriage return so the next call overwrites it.

    Args:
        progress: completion percentage (0 to 100).
        start_time: ``time.time()`` value the elapsed time is measured from.
        msg: status label; padded or truncated (with ": ") to 9 characters.
    """
    try:
        columns = os.get_terminal_size().columns
    except OSError:
        # stdout is not a terminal (piped, redirected, run from cron ...):
        # fall back to a conventional width instead of crashing the crawler.
        columns = 80
    # Round the width down to a multiple of 10 and reserve 40 columns for the
    # label, percentage, brackets and elapsed time around the bar.
    screenWidth = columns // 10 * 10 - 40
    barWidth = int(progress * screenWidth / 100)

    label = (msg + ": ").ljust(9)[:9]                     # fixed 9-char label
    percent = (str(int(progress)) + "%").rjust(6)[-6:]    # right-aligned, 6 chars
    elapsed = str(round(time.time() - start_time, 1))

    progressBar = "".join([
        label,
        percent,
        " [",
        "=" * barWidth,
        ">",
        " " * (screenWidth - barWidth),
        "]",
        " in ",
        elapsed,
        "s",
    ])
    sys.stdout.write(progressBar + "\r")
    sys.stdout.flush()
# Build the display text for one hitokoto from its content and its source.
def _formatHitokoto(content, source):
    if not source:
        # Some hitokotos have no source (empty or missing value).
        return content
    if "《" in source:
        # Some sources already carry their own book-title marks.
        return content + "——" + source
    # Otherwise wrap the source in book-title marks.
    return content + "——《" + source + "》"


# Plain urllib request.
# Returns a (hitokoto id, hitokoto text) tuple.
def getHitokoto():
    """Fetch one random hitokoto from the API and return ``(id, text)``.

    Entries whose ``cat`` field is "e", "f" or "g" are discarded and a new
    request is made until an acceptable entry arrives. The text is the
    ``hitokoto`` field, followed by "——" and the source when one is present.

    Returns:
        tuple: ``(hid, hitokoto)`` built from the ``id`` field and the
        formatted text.

    Raises:
        urllib.error.URLError: on network failure.
        OSError: if the request times out while reading.
        ValueError / KeyError: if the response is not the expected JSON.
    """
    url = "http://api.hitokoto.us/rand"
    # Unwanted categories; per the original author's note these are
    # "original", "from the web" and "other".
    skipped_categories = ("e", "f", "g")

    # Loop instead of recursing: a long streak of skipped categories can no
    # longer exhaust the interpreter's recursion limit.
    while True:
        req = urllib.request.Request(url)
        # The referer you send can be inspected at http://hitokoto.us/apiref.html
        req.add_header('Referer', 'http://fuck.com/')
        # The context manager closes the connection; the timeout keeps a
        # stalled server from hanging the crawler forever.
        with urllib.request.urlopen(req, timeout=30) as response:
            json_str = response.read().decode('UTF-8')
        json_hitokoto = json.loads(json_str)

        if json_hitokoto["cat"] in skipped_categories:
            continue

        hid = json_hitokoto["id"]
        hitokoto = _formatHitokoto(json_hitokoto["hitokoto"], json_hitokoto["source"])
        return (hid, hitokoto)
# main function, parameters:
# @count as int: number of requests to account for
# @delay as float: delay between requests, in seconds
# @autoReget as bool: when a hitokoto is a repeat, keep requesting until a
#                     non-repeated one is obtained
def main(count, delay, autoReget):
    """Crawl hitokotos until ``count`` loop slots have been consumed.

    Known ids are loaded from disk first. Each iteration sleeps ``delay``
    seconds, fetches one hitokoto and either saves it (new id) or counts it
    as repeated. A repeat consumes a slot only when ``autoReget`` is false.
    A summary is printed at the end.
    """
    global repeated_mount, saved_mount

    loadFromFile()

    done = 0
    while done < count:
        # Take a short break: hammering the API may get the IP banned.
        time.sleep(delay)
        hid, text = getHitokoto()
        percent = (done + 1) * 100 // count

        if hid in h_dict:
            repeated_mount += 1
            if autoReget:
                # Retry without consuming a slot.
                # NOTE: once most entries are already saved this can spin for
                # a long time (e.g. 999 repeats out of 1000 requests); add a
                # cap on repeated_mount here if that matters to you.
                showProgressBar(percent, start_time, "Regeting")
                continue
            showProgressBar(percent, start_time, "Repeated")
        else:
            showProgressBar(percent, start_time, "Hit")
            writeToFile(hid, text)
            h_dict[hid] = "Hit"
            saved_mount += 1

        done += 1

    print("\n\n{} hitokotos saved, {} hitokotos repeated.".format(saved_mount, repeated_mount))
# Parse command line arguments.
# Returns the argparse namespace with attributes c (int), d (float), a (bool).
def parseArguments(argv=None):
    """Parse the crawler's command line options.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace with ``c`` (request count), ``d`` (delay in
        seconds, default 0.5) and ``a`` (auto re-get flag, default False).

    Exits with status 1 after printing a usage hint when ``-c`` is missing
    or zero.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", help="counts of request", type=int)
    parser.add_argument("-d", help="delay between each request in second", type=float, default=0.5)
    parser.add_argument("-a", help="auto reget if repeated", action="store_true", default=False)

    # Parse once and reuse the result instead of parsing twice.
    args = parser.parse_args(argv)
    if not args.c:
        print("Missing arguments.\nUsage: python hitokoto.py --help")
        # sys.exit() rather than the site-provided exit() helper, which is
        # not guaranteed to exist in every interpreter setup.
        sys.exit(1)
    return args
if __name__ == '__main__':
    # Script entry point: read the command line options, then start crawling.
    cli = parseArguments()
    main(count=cli.c, delay=cli.d, autoReget=cli.a)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment