This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pymysql.cursors | |
# Open the MySQL connection, using DictCursor as the cursor class.
connection = pymysql.connect(
    host='localhost',
    user='user',
    password='passwd',
    db='db',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from w3lib.html import remove_tags, strip_html5_whitespace | |
# The keep argument lists the names of the tags that should be preserved
remove_tags(text, keep=('img',)) | |
# Remove HTML tags and strip the leading/trailing whitespace
def clean_tags(text, which_ones=(), keep=(), encoding=None) -> str: | |
if not text: | |
return None | |
content = remove_tags(text, which_ones, keep, encoding) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from jsonpath import jsonpath as _jsonpath | |
def jsonpath(obj, expr): | |
""" | |
优先项:如果匹配到的结果只有一个,则直接pop出该结果 | |
""" | |
result = _jsonpath(obj, expr) | |
if isinstance(result, list) and len(result) == 1: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodedata | |
# Sample text containing non-breaking spaces (\xa0), an ideographic
# space (\u3000) and a tab.
s = 'T-shirt\xa0\xa0短袖圆领衫,\u3000体恤衫\xa0买一件\t吧'
# Demo call only: the normalized string is not stored anywhere.
unicodedata.normalize('NFKC', s)
# -> 'T-shirt  短袖圆领衫, 体恤衫 买一件\t吧'
# This kind of whitespace noise shows up often in scraped data, so this
# normalization is used frequently.
def unicode_normalize(unistr, form='NFKC'): | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.http import Response | |
# Annotating `response` with its type lets the IDE offer auto-completion.
def parse(self, response: Response):
    """Placeholder parse method: does nothing and returns None."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Click an element by the coordinates of the center of its bounds.
 *
 * @param {*} selector Element selector; its findOne() result must expose bounds().
 * @returns whatever click() returns for the computed center point.
 */
function click_by_bounds(selector) {
    var rect = selector.findOne().bounds();
    var x = rect.centerX();
    var y = rect.centerY();
    return click(x, y);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import wraps | |
def timethis(func): | |
''' | |
Create a decorator . | |
''' | |
@wraps(func) | |
def wrapper(*args, **kwargs): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
db.getCollection("mastersportal_counrse").aggregate([ | |
{ | |
"$project": { | |
"overview": { | |
"$strLenCP": "$overview" | |
}, | |
"about": { | |
"$strLenCP": "$about" | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ratelimit import limits, sleep_and_retry | |
import requests | |
FIFTEEN_MINUTES = 900 | |
@sleep_and_retry | |
@limits(calls=15, period=FIFTEEN_MINUTES) | |
# 900秒内最多请求15次。 | |
def call_api(url): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
# List of dicts to export; the header is taken from the first row's keys.
result_list = []

# Skip the export when there is nothing to write: indexing result_list[0]
# on an empty list raises IndexError, and by then the output file would
# already have been truncated by open(..., 'w').
if result_list:
    # newline='' leaves line-ending handling to the csv module.
    with open('weibo_data.csv', 'w', newline='', encoding='gbk') as csv_file:
        # Header fields
        fieldnames = list(result_list[0])
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(result_list)