Created
December 27, 2015 04:32
-
-
Save prinsss/8efc424f8033f7f008ed to your computer and use it in GitHub Desktop.
Python 一言爬虫
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding=utf-8 -*- | |
import sys | |
# Python version check: the imports that follow (urllib.request) only exist
# on Python 3, so stop early with a readable message on older interpreters.
if sys.version_info[0] < 3:
    print("This script only works on Python 3+.")
    # sys.exit() rather than the bare exit(): the latter is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(1)
import os, time | |
import urllib.request | |
import json | |
import argparse | |
# Module-level state shared by the functions below.
h_dict = dict()            # ids already stored, keyed by hitokoto id (used for de-duplication)
start_time = time.time()   # timestamp taken at start-up; the progress bar shows time elapsed since then
saved_mount = 0            # how many hitokotos were written to disk during this run
repeated_mount = 0         # how many fetched hitokotos had an id that was already known
# Save & Load: every stored hitokoto is persisted immediately, so an
# interrupted run can be resumed later.
# NOTE (from the original author): entries are separated by "\n"; do not open
# the files with Notepad.
def writeToFile(hid, content):
    """Append one hitokoto to the on-disk store.

    The id is appended to ``hid.txt`` and the text to ``hitokoto.txt``, one
    entry per line, both relative to the current working directory.

    @hid: id of the hitokoto, rendered with str()
    @content as string: text of the hitokoto
    """
    # Explicit UTF-8: the text contains non-ASCII characters (e.g. the CJK
    # title marks added around the source), which the platform default
    # encoding may be unable to represent.
    with open("hid.txt", "a", encoding="utf-8") as f:
        f.write(str(hid) + "\n")
    with open("hitokoto.txt", "a", encoding="utf-8") as f:
        f.write(content + "\n")
def loadFromFile():
    """Fill the global ``h_dict`` with the ids stored in ``hid.txt``.

    Each id read from the file is registered with the placeholder value
    "Loaded." so that it is treated as already known. A missing file is not
    an error; a message is printed and ``h_dict`` is left untouched.

    Raises ValueError if a non-blank line is not a valid integer.
    """
    # No ``global`` statement needed: h_dict is only mutated, never rebound.
    try:
        with open("hid.txt", "r", encoding="utf-8") as f:
            loaded_count = 0
            # Iterate the file object directly instead of building the whole
            # list of lines with readlines().
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. left behind by an editor)
                    # instead of crashing on int("").
                    continue
                h_dict[int(line)] = "Loaded."
                loaded_count += 1
        print(str(loaded_count) + " ids successfully loaded from file." + "\n")
    except FileNotFoundError:
        print("No saved files, created." + "\n")
def showProgressBar(progress, start_time, msg):
    """Draw a one-line progress bar with a status label on stdout.

    The line ends with a carriage return, so the next call overwrites it.
    With a 40 column bar it looks like this:
        Msg:        50% [====================>                    ] in 0.9s

    @progress as int: percentage; values outside 0 to 100 are clamped
    @start_time as float: reference timestamp, looks like time.time()
    @msg as string: label to show; label plus ": " is padded or cut to 9 columns
    """
    # Imported here so the function does not rely on a module-level import.
    import shutil

    # os.get_terminal_size() raises OSError when stdout is not a terminal
    # (pipe, redirected file, ...); shutil's wrapper falls back to a default.
    columns = shutil.get_terminal_size(fallback=(80, 24)).columns
    # Round the width down to a multiple of 10 and keep 40 columns for the
    # text around the bar; never let the bar width go negative.
    bar_room = max(columns // 10 * 10 - 40, 0)
    progress = min(max(progress, 0), 100)
    filled = int(progress * bar_room / 100)

    label = (msg + ": ").ljust(9)[:9]
    percent = (str(int(progress)) + "%").rjust(6)
    elapsed = str(round(time.time() - start_time, 1))
    line = (label + percent
            + " [" + "=" * filled + ">" + " " * (bar_room - filled) + "]"
            + " in " + elapsed + "s")
    sys.stdout.write(line + "\r")
    sys.stdout.flush()
def getHitokoto():
    """Fetch one random hitokoto through a plain urllib request.

    Entries in the categories "e", "f" and "g" (described by the original
    author as: original works, from the internet, other) are discarded and
    another one is requested.

    Returns a tuple (id, text). When the entry has a non-empty source, the
    source is appended to the text after a "——" dash, wrapped in 《》 unless
    it already contains an opening 《.

    Raises the usual urllib / socket errors (OSError subclasses) on network
    failure or timeout, ValueError on malformed JSON and KeyError when an
    expected field is missing from the response.
    """
    url = "http://api.hitokoto.us/rand"
    # Loop instead of recursing: a long streak of unwanted categories can no
    # longer run into the interpreter's recursion limit.
    while True:
        req = urllib.request.Request(url)
        # The referer the API receives can be checked at http://hitokoto.us/apiref.html
        req.add_header('Referer', 'http://fuck.com/')
        # The context manager closes the connection; the timeout keeps a
        # stalled server from blocking the crawler forever.
        with urllib.request.urlopen(req, timeout=30) as response:
            json_str = response.read().decode('UTF-8')
        json_hitokoto = json.loads(json_str)

        if json_hitokoto["cat"] in ("e", "f", "g"):
            continue  # unwanted category, ask for another one

        hid = json_hitokoto["id"]
        content = json_hitokoto["hitokoto"]
        source = json_hitokoto["source"]
        if source == "":
            hitokoto = content  # some entries have no source
        elif "《" in source:
            hitokoto = content + "——" + source  # source already carries title marks
        else:
            hitokoto = content + "——《" + source + "》"  # wrap the source in title marks
        return (hid, hitokoto)
def main(count, delay, autoReget):
    """Crawl hitokotos and store every one that has not been seen before.

    Previously saved ids are loaded first. Each iteration sleeps, fetches a
    hitokoto, and either stores it or counts it as repeated. A summary is
    printed at the end.

    @count as int: count of request
    @delay as float: delay between each request, in second
    @autoReget as bool: when True a repeated hitokoto does not use up one of
        the `count` slots, so fetching goes on until a new one is found
    """
    global h_dict, start_time, repeated_mount, saved_mount

    loadFromFile()
    done = 0
    while done < count:
        time.sleep(delay)  # take a short break; requesting too often could get the IP banned
        hid, text = getHitokoto()
        percent = (done + 1) * 100 // count

        if hid in h_dict:
            repeated_mount += 1
            showProgressBar(percent, start_time, "Regeting" if autoReget else "Repeated")
            if not autoReget:
                done += 1
            # NOTE (from the original author): late in a crawl almost every
            # request may be a repeat (e.g. 999 out of 1000). With autoReget
            # there is no upper bound here; breaking out once repeated_mount
            # exceeds some limit such as 5000 was suggested but left disabled.
            continue

        showProgressBar(percent, start_time, "Hit")
        writeToFile(hid, text)
        h_dict[hid] = "Hit"
        saved_mount += 1
        done += 1

    print("\n\n{} hitokotos saved, {} hitokotos repeated.".format(saved_mount, repeated_mount))
def parseArguments():
    """Parse the command line arguments.

    Options:
      -c  count of requests (int, mandatory)
      -d  delay between each request in seconds (float, default 0.5)
      -a  auto reget if a hitokoto is repeated (flag, default False)

    Returns the argparse.Namespace with the attributes ``c``, ``d`` and ``a``.
    Prints a usage hint and exits with status 1 when ``-c`` is missing or 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", help="counts of request", type=int)
    parser.add_argument("-d", help="delay between each request in second", type=float, default=0.5)
    parser.add_argument("-a", help="auto reget if repeated", action="store_true", default=False)

    # Parse once and reuse the namespace for both the check and the result.
    args = parser.parse_args()
    if not args.c:
        print("Missing arguments.\nUsage: python hitokoto.py --help")
        # sys.exit() rather than the bare exit(), which is only provided by
        # the site module for interactive sessions.
        sys.exit(1)
    return args
# Script entry point: read the command line options, then start crawling.
if __name__ == "__main__":
    options = parseArguments()
    main(count=options.c, delay=options.d, autoReget=options.a)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment